package gov.nih.nlm.nls.nlp.tokenizer;

import gov.nih.nlm.nls.nlp.textfeatures.Category;
import gov.nih.nlm.nls.nlp.textfeatures.Collection;
import gov.nih.nlm.nls.nlp.textfeatures.Document;
import gov.nih.nlm.nls.nlp.textfeatures.Section;
import gov.nih.nlm.nls.nlp.textfeatures.Sentence;
import gov.nih.nlm.nls.utils.Debug;
import gov.nih.nlm.nls.utils.GlobalBehavior;
import gov.nih.nlm.nls.utils.U;
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.Vector;

/* loaded from: input_file:gov/nih/nlm/nls/nlp/tokenizer/FreeTextTokenizer.class */
/**
 * Tokenizer for free text: splits a {@link Collection} into {@link Document}s,
 * a document into paragraph {@link Section}s, and a section into
 * {@link Sentence}s that are then handed to a {@link ShapeTokenizer}.
 *
 * <p>Documents and sections can be produced all at once
 * ({@link #breakIntoDocuments(String)}, {@link #breakIntoSections(String)}) or
 * incrementally through the {@code ...Begin} / {@code ...Next} method pairs.
 * The incremental readers keep their position in instance fields, so one
 * instance can only iterate over one collection and one document at a time.
 *
 * <p>The no-argument constructor leaves the settings and the shape tokenizer
 * unset; {@link #tokenize(Section)} dereferences the shape tokenizer, so
 * instances used for full tokenization should be built with
 * {@link #FreeTextTokenizer(GlobalBehavior)}.
 */
public class FreeTextTokenizer implements TokenizerInterface {
    /** Medline citation delimiter string; not referenced inside this class. */
    public static final String MedlineCitationDelimiter = "UI  -";

    // Identifiers handed to the Debug facility: DT* values are passed to
    // Debug.denter/Debug.dexit, DF* values to Debug.dpr.
    private static final int DT12922 = 12922;
    private static final int DF12923 = 12923;
    private static final int DT12530 = 12530;
    private static final int DF12531 = 12531;
    private static final int DT3290 = 3290;
    private static final int DF3291 = 3291;
    private static final int DT12634 = 12634;
    private static final int DF12635 = 12635;
    private static final int DT12928 = 12928;
    private static final int DF12929 = 12929;
    private static final int DT12930 = 12930;
    private static final int DF12931 = 12931;
    private static final int DT12932 = 12932;
    private static final int DF12933 = 12933;
    private static final int DT12934 = 12934;
    private static final int DF12935 = 12935;
    private static final int DT12936 = 12936;
    private static final int DF12937 = 12937;
    private static final int DT12534 = 12534;
    private static final int DF12535 = 12535;
    private static final int DT12938 = 12938;
    private static final int DF12939 = 12939;
    private static final int DT12976 = 12976;
    private static final int DF12977 = 12977;
    private static final int DT12978 = 12978;
    private static final int DF12979 = 12979;

    /** Reader over the collection text currently being split into documents. */
    private BufferedReader cBuff;
    /** Lines accumulated for the document currently being assembled. */
    private StringBuffer documentBuff;
    /** Reader over the document text currently being split into sections. */
    private BufferedReader sBuff;
    /** Lines accumulated for the section currently being assembled. */
    private StringBuffer sectionBuff;
    /** Settings passed on to the sentence/shape tokenizers and to new sections. */
    private GlobalBehavior settings;
    /** When true, nextDocument() ends a document at a blank line. */
    private boolean isInteractive;
    /** Start offset assigned to the next section produced by nextSection(). */
    private int beginOffset;
    /** Tokenizer applied to every sentence in tokenize(Section). */
    private ShapeTokenizer shapeTokenizer;

    /**
     * Creates a tokenizer with no settings and no shape tokenizer; every field
     * keeps its default value (null / false / 0).
     */
    public FreeTextTokenizer() {
    }

    /**
     * Creates a tokenizer that uses the given settings and builds its shape
     * tokenizer from them.
     *
     * @param globalBehavior settings stored on this instance and passed to the
     *                       {@link ShapeTokenizer}
     * @throws RuntimeException if the shape tokenizer cannot be created; the
     *                          original exception is attached as the cause
     */
    public FreeTextTokenizer(GlobalBehavior globalBehavior) {
        Debug.dfname("FreeTextTokenizer:Constructor");
        Debug.denter(DT12922);
        this.settings = globalBehavior;
        try {
            this.shapeTokenizer = new ShapeTokenizer(this.settings);
            Debug.dexit(DT12922);
        } catch (Exception e) {
            // Keep the historical message but chain the cause so the original
            // stack trace is not lost.
            throw new RuntimeException(e.toString(), e);
        }
    }

    /**
     * Splits the collection's original string into documents, stores them on
     * the collection and tokenizes each document. Failures are reported and
     * swallowed (best effort); a failure in one document does not stop the
     * remaining documents from being processed.
     *
     * @param collection the collection to tokenize
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public void tokenize(Collection collection) {
        Debug.dfname("tokenize:Collection");
        Debug.denter(DT12530);
        Vector documents = null;
        try {
            documents = breakIntoDocuments(collection.getOriginalString());
            if (documents != null) {
                collection.setDocuments(documents);
            }
        } catch (Exception e) {
            Debug.warning("Not able to tokenize into Documents :" + e.toString());
            e.printStackTrace();
        }
        // documents stays null when the split above failed; there is nothing
        // to iterate over in that case.
        if (documents != null) {
            for (int i = 0; i < documents.size(); i++) {
                Document document = (Document) documents.get(i);
                Debug.dpr(DF12531, "----->[" + document + "]<-----");
                try {
                    tokenize(document);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        Debug.dexit(DT12530);
    }

    /**
     * Splits the document's original string into sections, stores them on the
     * document and tokenizes every section returned by
     * {@code document.getSections()}.
     *
     * @param document the document to tokenize
     * @throws Exception if tokenizing one of the sections fails
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public void tokenize(Document document) throws Exception {
        Debug.dfname("tokenize:aDocument");
        Debug.denter(DT3290);
        try {
            Vector newSections = breakIntoSections(document.getOriginalString());
            if (newSections != null) {
                document.setSections(newSections);
            }
        } catch (Exception e) {
            Debug.warning("Not able to tokenize documents into sections:" + e.toString());
            e.printStackTrace();
        }
        Vector sections = document.getSections();
        if (sections != null) {
            for (int i = 0; i < sections.size(); i++) {
                Section section;
                try {
                    section = (Section) sections.get(i);
                } catch (Exception e) {
                    // Skip an element that cannot be retrieved instead of
                    // re-tokenizing the section from the previous iteration.
                    e.printStackTrace();
                    continue;
                }
                Debug.dpr(DF3291, "----->[" + section + "]<-----");
                tokenize(section);
            }
        }
        Debug.dexit(DT3290);
    }

    /**
     * Splits the section's original string into sentences, stores them on the
     * section and runs the shape tokenizer over each sentence. A failure while
     * splitting is reported and leaves the section without shape tokenization.
     *
     * @param section the section to tokenize
     * @throws Exception if shape tokenization of a sentence fails
     */
    public void tokenize(Section section) throws Exception {
        Debug.dfname("tokenize:Section");
        Debug.denter(DT12634);
        Vector sentences = null;
        try {
            sentences = new SentenceTokenizer(this.settings).breakIntoSentences(section.getOriginalString());
            if (sentences != null) {
                section.setSentences(sentences);
            }
        } catch (Exception e) {
            Debug.warning("Not able to tokenize into Sentences :" + e.toString());
            e.printStackTrace();
        }
        if (sentences != null) {
            for (int i = 0; i < sentences.size(); i++) {
                Sentence sentence = (Sentence) sentences.elementAt(i);
                Debug.dpr(DF12635, "----->[" + sentence + "]<-----");
                this.shapeTokenizer.shapeTokenize(sentence);
            }
        }
        Debug.dexit(DT12634);
    }

    /**
     * Starts incremental document reading over the given text and returns the
     * first document.
     *
     * @param str the collection text
     * @return the first document, or null if there is none or reading failed
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public Document breakIntoDocumentsBegin(String str) {
        Debug.dfname("breakIntoDocumentsBegin");
        Debug.denter(DT12928);
        Document document = null;
        try {
            this.cBuff = new BufferedReader(new StringReader(str));
            this.documentBuff = new StringBuffer();
            document = nextDocument();
        } catch (Exception e) {
            Debug.warning("Something went wrong with reading the collection " + e.toString());
            e.printStackTrace();
        }
        Debug.dexit(DT12928);
        return document;
    }

    /**
     * Starts incremental document reading over the given collection and
     * returns the first document. An interactive collection is read through
     * its own reader and switches this tokenizer into interactive mode;
     * otherwise the collection's original string is read.
     *
     * @param collection the collection to read
     * @return the first document, or null if there is none or reading failed
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public Document breakIntoDocumentsBegin(Collection collection) {
        Debug.dfname("breakIntoDocumentsBegin");
        Debug.denter(DT12928);
        Document document = null;
        try {
            if (collection.isInteractive()) {
                this.cBuff = collection.getBufferedReader();
                this.isInteractive = true;
                Debug.dpr(DF12929, "This is interactive");
            } else {
                this.cBuff = new BufferedReader(new StringReader(collection.getOriginalString()));
            }
            this.documentBuff = new StringBuffer();
            document = nextDocument();
        } catch (Exception e) {
            Debug.warning("Something went wrong with reading the collection " + e.toString());
            e.printStackTrace();
        }
        Debug.dexit(DT12928);
        return document;
    }

    /**
     * Returns the next document of the collection opened by one of the
     * {@code breakIntoDocumentsBegin} methods.
     *
     * @return the next document, or null when no further document is available
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public Document breakIntoDocumentsNext() {
        Debug.dfname("breakIntoDocumentsNext");
        Debug.denter(DT12930);
        Document next = nextDocument();
        Debug.dexit(DT12930);
        return next;
    }

    /**
     * Starts incremental section reading over the given document text and
     * returns the first section. Section offsets restart at 0.
     *
     * @param str the document text
     * @return the first section, or null if there is none or reading failed
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public Section breakIntoSectionsBegin(String str) {
        Debug.dfname("breakIntoSectionsBegin");
        Debug.denter(DT12932);
        Section section = null;
        try {
            this.sBuff = new BufferedReader(new StringReader(str));
            this.sectionBuff = new StringBuffer();
            // Reset BEFORE reading: nextSection() both uses and advances the
            // offset, so resetting afterwards would make the second section
            // start at 0 again.
            this.beginOffset = 0;
            section = nextSection();
        } catch (Exception e) {
            Debug.warning("Something went wrong with reading the document for sections " + e.toString());
            e.printStackTrace();
        }
        Debug.dexit(DT12932);
        return section;
    }

    /**
     * Starts incremental section reading over the given document's original
     * string and returns the first section. Section offsets restart at 0, as
     * in {@link #breakIntoSectionsBegin(String)}.
     *
     * @param document the document to read
     * @return the first section, or null if there is none or reading failed
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public Section breakIntoSectionsBegin(Document document) {
        Debug.dfname("breakIntoSectionsBegin:Document");
        Debug.denter(DT12932);
        Section section = null;
        try {
            this.sBuff = new BufferedReader(new StringReader(document.getOriginalString()));
            this.sectionBuff = new StringBuffer();
            this.beginOffset = 0;
            section = nextSection();
        } catch (Exception e) {
            Debug.warning("Something went wrong with reading the document for sections " + e.toString());
            e.printStackTrace();
        }
        Debug.dexit(DT12932);
        return section;
    }

    /**
     * Returns the next section of the document opened by one of the
     * {@code breakIntoSectionsBegin} methods.
     *
     * @return the next section, or null when no further section is available
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public Section breakIntoSectionsNext() {
        Debug.dfname("breakIntoSectionsNext");
        Debug.denter(DT12934);
        Section next = nextSection();
        Debug.dexit(DT12934);
        return next;
    }

    /**
     * Splits the given collection text into all of its documents.
     *
     * @param str the collection text
     * @return a vector of {@link Document}s, empty when none were found
     */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public Vector breakIntoDocuments(String str) {
        Debug.dfname("breakIntoDocuments");
        Debug.denter(DT12936);
        Vector documents = new Vector();
        for (Document d = breakIntoDocumentsBegin(str); d != null; d = breakIntoDocumentsNext()) {
            documents.addElement(d);
        }
        Debug.dexit(DT12936);
        return documents;
    }

    /**
     * Splits the given document text into all of its sections.
     *
     * @param str the document text
     * @return a vector of {@link Section}s, empty when none were found
     */
    public Vector breakIntoSections(String str) {
        Debug.dfname("breakIntoSections");
        Debug.denter(DT12534);
        Vector sections = new Vector();
        for (Section s = breakIntoSectionsBegin(str); s != null; s = breakIntoSectionsNext()) {
            sections.addElement(s);
        }
        Debug.dexit(DT12534);
        return sections;
    }

    /** Emits the enter/exit debug trace only; no resources are released here. */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public void close() {
        Debug.dfname("close");
        Debug.denter(DT12938);
        Debug.dexit(DT12938);
    }

    /** Switches this tokenizer into interactive mode (see nextDocument()). */
    @Override // gov.nih.nlm.nls.nlp.tokenizer.TokenizerInterface
    public void setInteractiveMode() {
        this.isInteractive = true;
    }

    /**
     * Reads lines from the collection reader and assembles the next document.
     *
     * <p>In non-interactive mode every remaining line is accumulated and
     * returned as a single document once the reader is exhausted. In
     * interactive mode the first blank line read during a call ends that call:
     * if text has been accumulated it is returned as a document (including the
     * blank line), otherwise null is returned. The consecutive-blank-line
     * bookkeeping is kept exactly as in the original logic.
     *
     * <p>When the reader is exhausted and non-blank text is pending, that text
     * becomes the final document and the reader is closed and dropped.
     *
     * @return the next document, or null if none could be produced
     */
    private Document nextDocument() {
        Debug.dfname("nextDocument");
        Debug.denter(DT12976);
        Document document = null;
        boolean done = false;
        int blankLines = 0;
        int consecutiveBlankLines = 0;
        boolean previousLineIsBlank = false;
        while (this.cBuff != null && !done) {
            String line;
            try {
                line = this.cBuff.readLine();
            } catch (Exception e) {
                // A failing reader would fail again on the next iteration, so
                // stop reading and fall through to the end-of-input handling.
                Debug.warning("Something went wrong when breaking the collection into documents :" + e.getMessage());
                e.printStackTrace();
                break;
            }
            if (line == null) {
                break;
            }
            try {
                boolean lineIsBlank = line.length() == 0;
                if (this.isInteractive) {
                    if (lineIsBlank) {
                        Debug.dpr(DF12977, "blank line seen, counter incremented");
                        if (previousLineIsBlank) {
                            consecutiveBlankLines++;
                            Debug.dpr(DF12977, "Previous line was blank " + consecutiveBlankLines);
                        }
                        blankLines++;
                    } else {
                        Debug.dpr(DF12977, "line == " + line);
                        previousLineIsBlank = false;
                        consecutiveBlankLines = 0;
                        Debug.dpr(DF12977, "Previous line was not blank");
                    }
                    if (blankLines == 1 && !previousLineIsBlank) {
                        done = true;
                        Debug.dpr(DF12977, "A blank line was seen, and previousLine was not blank");
                    } else if (previousLineIsBlank) {
                        Debug.dpr(DF12977, "A blank line was seen, and previousLine was blank");
                        if (consecutiveBlankLines == 2) {
                            done = true;
                            this.documentBuff = new StringBuffer();
                        }
                    }
                }
                if (!done) {
                    this.documentBuff.append(line).append(U.NL);
                } else if (this.documentBuff.length() > 0) {
                    this.documentBuff.append(line).append(U.NL);
                    Debug.dpr(DF12977, "Making a document of |" + this.documentBuff.toString() + Category.CATEGORY_BAR2);
                    document = new Document(this.documentBuff);
                    this.documentBuff = new StringBuffer();
                }
                if (lineIsBlank) {
                    previousLineIsBlank = true;
                    Debug.dpr(DF12977, "previousLineIsBlank being set to true");
                }
            } catch (Exception e) {
                Debug.warning("Something went wrong when breaking the collection into documents :" + e.getMessage());
                e.printStackTrace();
            }
        }
        // Input ended without a terminating blank line: flush pending text as
        // the last document and release the reader.
        if (this.cBuff != null && !done && this.documentBuff.toString().trim().length() > 0) {
            document = new Document(this.documentBuff);
            this.documentBuff = null;
            try {
                this.cBuff.close();
            } catch (Exception e) {
                Debug.warning("Something went wrong when close the collection :" + e.getMessage());
            }
            this.cBuff = null;
        }
        Debug.dexit(DT12976);
        return document;
    }

    /**
     * Reads lines from the document reader and assembles the next paragraph
     * section.
     *
     * <p>A section ends when a non-blank line follows one or more blank lines;
     * the blank lines stay with the section they terminate and the non-blank
     * line starts the next one. Each section's span is
     * {@code [beginOffset, beginOffset + length - 1]}, and {@code beginOffset}
     * then advances by the section's length. When the reader is exhausted the
     * pending text becomes the final section and the reader is closed and
     * dropped.
     *
     * @return the next section, or null if none could be produced
     */
    private Section nextSection() {
        Debug.dfname("nextSection");
        Debug.denter(DT12978);
        Section section = null;
        boolean sectionComplete = false;
        boolean previousLineIsBlank = false;
        while (this.sBuff != null && !sectionComplete) {
            String line;
            try {
                line = this.sBuff.readLine();
            } catch (Exception e) {
                // Stop on a read failure; the pending text is flushed below.
                Debug.warning("Something went wrong with reading the collection " + e.toString());
                e.printStackTrace();
                break;
            }
            if (line == null) {
                break;
            }
            try {
                Debug.dpr(DF12979, "looking at line |" + line + Category.CATEGORY_BAR2);
                if (line.length() == 0) {
                    this.sectionBuff.append(line).append(U.NL);
                    previousLineIsBlank = true;
                } else if (previousLineIsBlank) {
                    previousLineIsBlank = false;
                    if (this.sectionBuff.length() > 0) {
                        Debug.dpr(DF12979, "Making a section of |" + this.sectionBuff.toString() + Category.CATEGORY_BAR2);
                        section = new Section(this.settings, Section.PARAGRAPH, null, this.sectionBuff);
                        int endOffset = (this.beginOffset + this.sectionBuff.length()) - 1;
                        section.setSpan(this.beginOffset, endOffset);
                        Debug.dpr(DF12979, "Beginning Offset = " + this.beginOffset);
                        Debug.dpr(DF12979, "end Offset = " + endOffset);
                        this.beginOffset += this.sectionBuff.length();
                        Debug.dpr(DF12979, "next Beginning Offset = " + this.beginOffset);
                        sectionComplete = true;
                    }
                    // The current non-blank line opens the following section.
                    this.sectionBuff = new StringBuffer();
                    this.sectionBuff.append(line).append(U.NL);
                } else {
                    this.sectionBuff.append(line).append(U.NL);
                }
            } catch (Exception e) {
                Debug.warning("Something went wrong with breaking up the collection " + e.toString());
                e.printStackTrace();
            }
        }
        if (this.sBuff != null && !sectionComplete && this.sectionBuff.length() > 0) {
            section = new Section(this.settings, Section.PARAGRAPH, null, this.sectionBuff);
            int sectionLength = this.sectionBuff.length();
            int endOffset = (this.beginOffset + sectionLength) - 1;
            section.setSpan(this.beginOffset, endOffset);
            Debug.dpr(DF12979, "last Beginning Offset = " + this.beginOffset);
            Debug.dpr(DF12979, "last end Offset = " + endOffset);
            // Advance by the section length, exactly as for non-final
            // sections; endOffset already contains beginOffset, so adding it
            // to beginOffset would double-count.
            this.beginOffset += sectionLength;
            this.sectionBuff = null;
            try {
                this.sBuff.close();
            } catch (Exception e) {
                Debug.warning("Something went wrong when close the collection :" + e.getMessage());
            }
            this.sBuff = null;
        }
        Debug.dexit(DT12978);
        return section;
    }
}
