package gov.nih.nlm.nls.nlp.tokenizer;

import gov.nih.nlm.nls.nlp.nlsstrings.MWIUtilities;
import gov.nih.nlm.nls.nlp.textfeatures.Category;
import gov.nih.nlm.nls.nlp.textfeatures.Sentence;
import gov.nih.nlm.nls.nlp.textfeatures.TokenChars;
import gov.nih.nlm.nls.utils.Debug;
import gov.nih.nlm.nls.utils.FileReplace;
import gov.nih.nlm.nls.utils.GlobalBehavior;
import gov.nih.nlm.nls.utils.StringUtils;
import gov.nih.nlm.nls.utils.U;
import java.text.BreakIterator;
import java.util.Locale;
import java.util.Vector;

/* loaded from: input_file:gov/nih/nlm/nls/nlp/tokenizer/SentenceTokenizer.class */
/**
 * Splits a section of text into {@link Sentence} objects.
 *
 * <p>Candidate boundaries come from a {@link BreakIterator} sentence instance. A candidate is
 * rejected when the word following it is lower case or, if enabled in the settings, when the
 * word preceding it is an ambiguous acronym. Each accepted span is then split further on a fixed
 * list of textual markers (colon, semicolon, blank line, {@link #SENTENCE_BREAK}, ...).
 *
 * <p>Usage is either {@link #breakIntoSentences(String)} for a whole section at once, or
 * {@link #breakIntoSentencesBegin(String, int)} followed by repeated calls to
 * {@link #breakIntoSentencesNext()} until {@code null} is returned.
 *
 * <p>The iteration state lives in unsynchronized instance fields, so one instance must not be
 * used by several threads (or for several sections) at the same time.
 */
public final class SentenceTokenizer {
    /** Marker that forces a sentence break; it is removed from the text of returned sentences. */
    public static final String SENTENCE_BREAK = "<SENTENCE_BREAK>";

    // Identifiers handed to Debug.denter/Debug.dexit (DT*) and Debug.dpr (DF*).
    private static final int DT12984 = 12984;
    private static final int DT12990 = 12990;
    private static final int DT12992 = 12992;
    private static final int DF12993 = 12993;
    private static final int DT12994 = 12994;
    private static final int DT12996 = 12996;
    private static final int DF12997 = 12997;
    private static final int DT12998 = 12998;
    private static final int DF12999 = 12999;
    private static final int DT3294 = 3294;
    private static final int DF3295 = 3295;
    private static final int DT3296 = 3296;
    private static final int DF3297 = 3297;
    private static final int DT13828 = 13828;
    private static final int DF13829 = 13829;

    /** Stand-in for '*' while the text is being segmented (char code 254). */
    private static final char STAR_PLACEHOLDER = '\u00FE';
    /** Stand-in for a '#' that started a line while the text is being segmented (char code 257). */
    private static final char POUND_PLACEHOLDER = '\u0101';

    private static final String twoHyphens = "--";
    private static final String twoNewLines = TokenChars.NEWLINE + TokenChars.NEWLINE;
    private static final String colonPattern1 = ": ";
    private static final String colonPattern2 = ":" + U.NL;
    private static final String semiColonPattern1 = "; ";
    private static final String semiColonPattern2 = Category.CATEGORY_SEMICOLON2 + U.NL;
    private static final String periodAtEndOfLine = "." + U.NL;
    private static final String periodAtEndOfDOSLine = ".\r\n";
    private static final String bangAtEndOfLine = Category.CATEGORY_BANG + U.NL;
    private static final String questionAtEndOfLine = Category.CATEGORY_QUESTIONMARK2 + U.NL;
    private static final String hardSentenceBreak = SENTENCE_BREAK;
    private static final String embeddedPeriod = ".\" ";
    private static final String periodFollowedByPound = ". # ";
    private static final String BEGIN_POUND_PATTERN = U.NL + '#';
    private static final String FAKE_POUND_PATTERN = SENTENCE_BREAK + POUND_PLACEHOLDER;
    private static final String EM_Hyphen = " —";

    /**
     * Markers after which a span is split. When two markers occur at the same position the one
     * listed first wins, so the order is significant.
     */
    private static final String[] BREAK_PATTERNS = {
        colonPattern1,
        colonPattern2,
        twoNewLines,
        semiColonPattern1,
        semiColonPattern2,
        periodAtEndOfLine,
        // The original list repeated periodAtEndOfLine here and never used the DOS variant.
        periodAtEndOfDOSLine,
        bangAtEndOfLine,
        questionAtEndOfLine,
        hardSentenceBreak,
        embeddedPeriod,
        periodFollowedByPound,
        twoHyphens,
        EM_Hyphen
    };

    private final Locale currentLocale;
    private final BreakIterator sentenceIterator;
    private final GlobalBehavior settings;
    private final ShapeTokenizer shapeTokenizer;
    private final AmbiguousAcronyms aas;
    private final boolean termProcessing;

    /** Section currently being segmented, after placeholder substitution; null when idle. */
    private String sectionText = null;
    /** Current candidate boundary reported by the break iterator. */
    private int boundary = 0;
    /** Sentences produced by the last boundary that have not all been handed out yet. */
    private Vector<Sentence> someSentences = null;
    /** Number of entries of {@link #someSentences} still to be handed out. */
    private int someSentencesPtr = 0;
    /** Value added to section-relative positions when building sentence spans. */
    private int offset = 0;
    private int beginningOfSentence = 0;
    private int endOfSentence = 0;

    /**
     * Creates a tokenizer configured from the given settings.
     *
     * @param globalBehavior settings passed on to the shape tokenizer and the acronym detector,
     *     and queried for the {@code --term_processing} flag
     * @throws Exception if one of the collaborating components cannot be created
     */
    public SentenceTokenizer(GlobalBehavior globalBehavior) throws Exception {
        Debug.dfname("SentenceTokenizer:Constructor:GlobalSettings:");
        Debug.denter(DT12984);
        this.currentLocale = new Locale("en", "US");
        // Use the explicit locale: the no-argument factory follows the JVM default locale, which
        // made segmentation depend on the machine the code runs on.
        this.sentenceIterator = BreakIterator.getSentenceInstance(this.currentLocale);
        this.settings = globalBehavior;
        this.shapeTokenizer = new ShapeTokenizer(this.settings);
        this.aas = new AmbiguousAcronyms(this.settings);
        this.termProcessing = this.settings.getBoolean("--term_processing");
        Debug.dexit(DT12984);
    }

    /**
     * Breaks a whole section into sentences.
     *
     * @param str the section text; may be null
     * @return the sentences in order, each numbered via {@code setCtr} with its index, or
     *     {@code null} when {@code str} is null or yields no sentence
     * @throws Exception if segmentation or tokenization of a sentence fails
     */
    public Vector breakIntoSentences(String str) throws Exception {
        Debug.dfname("breakIntoSentences");
        Debug.denter(DT12990);
        try {
            Vector<Sentence> sentences = null;
            if (str != null) {
                Sentence current = breakIntoSentencesBegin(str, 0);
                if (current != null) {
                    sentences = new Vector<Sentence>();
                }
                while (current != null) {
                    current.setCtr(sentences.size());
                    sentences.addElement(current);
                    current = breakIntoSentencesNext();
                }
            }
            return sentences;
        } finally {
            Debug.dexit(DT12990);
        }
    }

    /**
     * Starts segmenting a new section and returns its first sentence.
     *
     * <p>Before segmentation every '*' is replaced by a placeholder character and every
     * line-initial '#' by {@link #SENTENCE_BREAK} plus a placeholder; both substitutions are
     * undone by {@link #stripOffNonTokenTokens(String)}. Note that the second substitution
     * changes the text length, and spans are computed on the substituted text.
     *
     * @param str the section text; may be null
     * @param i value added to section-relative positions in the spans of returned sentences
     * @return the first sentence, or {@code null} when {@code str} is null or has no sentence
     * @throws Exception wrapping any failure, with the original exception as cause
     */
    public Sentence breakIntoSentencesBegin(String str, int i) throws Exception {
        Debug.dfname("breakIntoSentencesBegin");
        Debug.denter(DT12992);
        try {
            if (str == null) {
                return null;
            }
            String converted =
                    str.replace('*', STAR_PLACEHOLDER).replace(BEGIN_POUND_PATTERN, FAKE_POUND_PATTERN);
            Debug.dpr(DF12993, "Converted Section = |" + converted + Category.CATEGORY_BAR2 + i);
            this.someSentencesPtr = 0;
            this.beginningOfSentence = 0;
            this.offset = i;
            this.sentenceIterator.setText(converted);
            this.boundary = this.sentenceIterator.first();
            this.sectionText = converted;
            return nextSentence();
        } catch (Exception e) {
            e.printStackTrace();
            throw new Exception("Something went wrong getting the next sentence " + e.toString(), e);
        } finally {
            Debug.dexit(DT12992);
        }
    }

    /**
     * Returns the next sentence of the section started with
     * {@link #breakIntoSentencesBegin(String, int)}.
     *
     * @return the next sentence, or {@code null} when the section is exhausted or no section is
     *     active
     * @throws Exception wrapping any failure, with the original exception as cause
     */
    public Sentence breakIntoSentencesNext() throws Exception {
        Debug.dfname("breakIntoSentencesNext");
        Debug.denter(DT12994);
        try {
            return nextSentence();
        } catch (Exception e) {
            e.printStackTrace();
            throw new Exception("Something went wrong getting the next sentence " + e.toString(), e);
        } finally {
            // Previously skipped when an exception was thrown, unlike breakIntoSentencesBegin.
            Debug.dexit(DT12994);
        }
    }

    /**
     * Removes {@link #SENTENCE_BREAK} markers and turns the placeholder characters back into
     * '*' and '#'.
     *
     * @param str text possibly containing markers and placeholders
     * @return the cleaned text, or {@code null} when {@code FileReplace.changeLine} returns null
     */
    public String stripOffNonTokenTokens(String str) {
        Debug.dfname("stripOffNonTokenTokens");
        Debug.denter(DT13828);
        String cleaned = null;
        String withoutMarkers = FileReplace.changeLine(str, SENTENCE_BREAK, "");
        if (withoutMarkers != null) {
            cleaned = withoutMarkers.replace(STAR_PLACEHOLDER, '*').replace(POUND_PLACEHOLDER, '#');
        }
        Debug.dpr(DF13829, "Returning |" + cleaned + Category.CATEGORY_BAR2);
        Debug.dexit(DT13828);
        return cleaned;
    }

    /** Forgets the current section and resets all iteration state. */
    public void clear() {
        this.sectionText = null;
        this.boundary = 0;
        this.someSentences = null;
        this.someSentencesPtr = 0;
        this.offset = 0;
        this.beginningOfSentence = 0;
        this.endOfSentence = 0;
    }

    /**
     * Produces the next non-empty, shape-tokenized sentence. Sentences whose text is empty after
     * marker removal are skipped (iteratively, so long runs of them cannot exhaust the stack).
     *
     * @return the next sentence, or {@code null} when there is none left
     * @throws Exception if cleaning or tokenizing a sentence fails
     */
    private Sentence nextSentence() throws Exception {
        Debug.dfname("nextSentence");
        Debug.denter(DT12996);
        try {
            Sentence sentence = takeQueuedSentence();
            while (sentence != null && !prepareSentence(sentence)) {
                sentence = takeQueuedSentence();
            }
            if (sentence != null) {
                Debug.dpr(DF12997, "Sentence|" + sentence.getOriginalString());
            }
            return sentence;
        } finally {
            Debug.dexit(DT12996);
        }
    }

    /** Hands out the next pending raw sentence, refilling the pending list when it is used up. */
    private Sentence takeQueuedSentence() {
        if (this.someSentencesPtr == 0) {
            this.someSentences = nextSentenceAux();
            if (this.someSentences != null) {
                this.someSentencesPtr = this.someSentences.size();
            }
        }
        if (this.someSentencesPtr <= 0) {
            return null;
        }
        Sentence next = this.someSentences.get(this.someSentences.size() - this.someSentencesPtr);
        this.someSentencesPtr--;
        return next;
    }

    /**
     * Cleans the text of a raw sentence, optionally normalizes it for term processing, and runs
     * the shape tokenizer on it.
     *
     * @return {@code false} when nothing is left after marker removal and the sentence should be
     *     skipped, {@code true} otherwise
     * @throws Exception wrapping any failure, with the original exception as cause
     */
    private boolean prepareSentence(Sentence sentence) throws Exception {
        try {
            String text = stripOffNonTokenTokens(sentence.getOriginalString());
            if (text == null || text.length() <= 0) {
                return false;
            }
            Debug.dpr(DF12997, "--stripped ->|" + text);
            if (this.termProcessing) {
                text = MWIUtilities.normalizeMetaString(text);
                Debug.dpr(DF12997, "--stripped and uninverted->|" + text);
            }
            sentence.setOriginalString(text);
            sentence.setTrimmedString();
            sentence.setStrippedString();
            Debug.dpr(DF12997, "--az->|" + sentence.getOriginalString() + Category.CATEGORY_BAR2);
            this.shapeTokenizer.shapeTokenize(sentence);
            return true;
        } catch (Exception e) {
            throw new Exception(
                    "something went wrong with word tokenizing " + sentence.getOriginalString() + " :" + e.toString(),
                    e);
        }
    }

    /**
     * Advances the break iterator until at least one sentence has been produced or the section is
     * exhausted, and returns the sentences found for that boundary.
     *
     * @return the sentences found, or {@code null} when there are none (including when no section
     *     is active)
     */
    private Vector<Sentence> nextSentenceAux() {
        Debug.dfname("nextSentenceAux");
        Debug.denter(DT12998);
        Vector<Sentence> found = new Vector<Sentence>();
        Debug.dpr(DF12999, "Section:[" + this.sectionText + Category.CATEGORY_RIGHTBRACKET2);
        // No active section (never started, or cleared): nothing to return.
        if (this.sectionText != null) {
            while (found.isEmpty() && this.boundary != BreakIterator.DONE) {
                Debug.dpr(DF12999, Category.CATEGORY_LEFTBRACKET2 + this.sectionText.substring(this.beginningOfSentence, this.boundary) + Category.CATEGORY_RIGHTBRACKET2);
                if (this.boundary > 0 && this.boundary <= this.sectionText.length()) {
                    String wordBefore = getWordBeforeBoundary(this.sectionText, this.boundary);
                    String wordAfter = getWordAfterBoundary(this.sectionText, this.boundary);
                    boolean lowercaseFollows = StringUtils.isLowercase(wordAfter);
                    Debug.dpr(DF12999, "WordBeforeBoundary = " + wordBefore);
                    Debug.dpr(DF12999, "WordAfterBoundary = " + wordAfter + TokenChars.SPACE_s + lowercaseFollows);
                    boolean afterAmbiguousAcronym =
                            this.settings.detectAmbiguousAbbreviation() && this.aas.isAmbiguousAcronym(wordBefore);
                    if (!lowercaseFollows && !afterAmbiguousAcronym) {
                        collectSentencesUpToBoundary(found);
                    }
                }
                this.boundary = this.sentenceIterator.next();
                Debug.dpr(DF12999, "The next boundary comes at " + this.boundary);
            }
            int sectionLength = this.sectionText.length();
            if (found.isEmpty() && this.beginningOfSentence < sectionLength - 1) {
                Debug.dpr(DF12999, "The beginning of the Sentence is " + this.beginningOfSentence);
                Debug.dpr(DF12999, "The documentLength = " + sectionLength);
                String tail = this.sectionText.substring(this.beginningOfSentence);
                if (tail.trim().length() > 0) {
                    // The iterator is exhausted here (boundary == DONE), so the span has to end
                    // at the last character of the section; "boundary - 1" used to yield -2.
                    found.addElement(new Sentence(tail, this.offset + this.beginningOfSentence, (this.offset + sectionLength) - 1));
                    this.beginningOfSentence = sectionLength - 1;
                }
            }
        }
        Debug.dexit(DT12998);
        return found.isEmpty() ? null : found;
    }

    /**
     * Splits the text between {@link #beginningOfSentence} and {@link #boundary} on the break
     * markers and appends every non-blank piece to {@code found}.
     */
    private void collectSentencesUpToBoundary(Vector<Sentence> found) {
        String remaining = this.sectionText.substring(this.beginningOfSentence, this.boundary);
        Debug.dpr(DF12999, Category.CATEGORY_LEFTBRACKET2 + remaining + "] with boundary at " + this.boundary);
        String[] split = moreSentencesMarked(remaining);
        while (split != null) {
            String piece = split[0];
            if (piece != null) {
                this.endOfSentence = this.beginningOfSentence + piece.length();
                if (piece.trim().length() > 0) {
                    Debug.dpr(DF12999, "1: adding sentence ->[" + piece + Category.CATEGORY_RIGHTBRACKET2);
                    found.addElement(new Sentence(piece, this.offset + this.beginningOfSentence, (this.offset + this.endOfSentence) - 1));
                }
                // Advance past blank pieces as well; skipping them without advancing shifted the
                // spans of every following sentence to the left.
                this.beginningOfSentence = this.endOfSentence;
            }
            remaining = split[1];
            split = moreSentencesMarked(remaining);
        }
        if (remaining != null && remaining.trim().length() > 0) {
            Debug.dpr(DF12999, "2: adding sentence ->[" + remaining + Category.CATEGORY_RIGHTBRACKET2);
            // Apply the caller's offset here too, as is done for the pieces above.
            found.addElement(new Sentence(remaining, this.offset + this.beginningOfSentence, (this.offset + this.boundary) - 1));
        }
        this.beginningOfSentence = this.boundary;
    }

    /**
     * Splits off the first piece of {@code str}.
     *
     * <p>If {@code str} starts with whitespace, the first piece is that whitespace run. Otherwise
     * the split happens right after the earliest occurrence of one of {@link #BREAK_PATTERNS},
     * or, when it comes strictly earlier, after the first period that is followed by whitespace
     * and then one of the list-marker characters (star or pound placeholder, '#', '-').
     *
     * @param str text to split; may be null
     * @return a two element array {first piece, rest}, or {@code null} when {@code str} is null,
     *     empty, or contains no break point
     */
    private String[] moreSentencesMarked(String str) {
        Debug.dfname("moreSentencesMarked");
        Debug.denter(DT3294);
        String[] result = null;
        if (str != null && str.length() > 0) {
            Debug.dpr(DF3295, "Entering with -->" + str + "<---");
            int splitAt = -1;
            if (Character.isWhitespace(str.charAt(0))) {
                splitAt = 1;
                while (splitAt < str.length() && Character.isWhitespace(str.charAt(splitAt))) {
                    splitAt++;
                }
            } else {
                int breakPos = Integer.MAX_VALUE;
                int breakLength = 0;
                int patternIndex = -1;
                for (int k = 0; k < BREAK_PATTERNS.length; k++) {
                    int pos = str.indexOf(BREAK_PATTERNS[k]);
                    if (pos > -1) {
                        Debug.dpr(DF3295, Category.CATEGORY_LEFTCURLYBRACKET2 + BREAK_PATTERNS[k] + "} spotted at " + pos);
                        if (pos < breakPos) {
                            breakPos = pos;
                            breakLength = BREAK_PATTERNS[k].length();
                            patternIndex = k;
                        }
                    }
                }
                int periodPos = str.indexOf(".");
                if (periodPos > 0) {
                    Debug.dpr(DF3295, "Found a period ");
                    int cursor = periodPos + 1;
                    if (periodPos <= breakPos) {
                        while (cursor < str.length() && TokenChars.WHITE_SPACE_CHARS.indexOf(str.charAt(cursor)) > -1) {
                            cursor++;
                        }
                    }
                    if (cursor > periodPos + 1) {
                        Debug.dpr(DF3295, "Found a period and whitespace");
                        if (cursor < str.length() - 1 && isListMarker(str.charAt(cursor))) {
                            Debug.dpr(DF3295, "Found a period and whitespace, with an *, # or -");
                            // Break after the period and its whitespace. The length computed for
                            // this case used to be discarded, so the split either used the length
                            // of an unrelated pattern or did not happen at all. When a pattern
                            // starts at the same position it keeps precedence, as before.
                            if (periodPos < breakPos) {
                                breakPos = periodPos;
                                breakLength = cursor - periodPos;
                                patternIndex = -1;
                            }
                        }
                    }
                }
                if (breakLength > 0) {
                    Debug.dpr(DF3295, "Going to break the sentence at  " + breakPos + " index " + patternIndex + " leng = " + breakLength);
                    splitAt = breakPos + breakLength;
                }
            }
            if (splitAt > -1) {
                String first = str.substring(0, splitAt);
                String rest = str.substring(splitAt);
                Debug.dpr(DF3295, "Returning with firstSentence [" + first + Category.CATEGORY_RIGHTBRACKET2);
                Debug.dpr(DF3295, "Returning with rest of stream [" + rest + Category.CATEGORY_RIGHTBRACKET2);
                result = new String[] {first, rest};
            }
        }
        Debug.dexit(DT3294);
        return result;
    }

    /** Tells whether {@code c} is one of the characters treated as the start of a list item. */
    private static boolean isListMarker(char c) {
        return c == STAR_PLACEHOLDER || c == POUND_PLACEHOLDER || c == '#' || c == '-';
    }

    /**
     * Returns the run of non-whitespace characters that ends closest before position {@code i}.
     *
     * @param str text to scan; may be null
     * @param i position of the boundary; characters at indexes below {@code i} are examined
     * @return the word, or an empty string when there is none or {@code str} is null
     */
    private static String getWordBeforeBoundary(String str, int i) {
        StringBuilder word = new StringBuilder();
        Debug.dfname("getWordBeforeBoundary");
        Debug.denter(DT3296);
        if (str != null) {
            Debug.dpr(DF3297, "The text we have so far is " + str);
            for (int pos = i - 1; pos >= 0; pos--) {
                char c = str.charAt(pos);
                if (!Character.isWhitespace(c)) {
                    word.insert(0, c);
                    Debug.dpr(DF3297, "adding [" + c + "] to " + word.toString().trim());
                } else if (word.length() > 0) {
                    // Whitespace after at least one collected character ends the word.
                    break;
                }
            }
        }
        String result = word.toString().trim();
        Debug.dpr(DF3297, "Word before boundary is [" + result + Category.CATEGORY_RIGHTBRACKET2);
        Debug.dexit(DT3296);
        return result;
    }

    /**
     * Returns the first run of non-whitespace characters at or after position {@code i}.
     *
     * @param str text to scan; may be null
     * @param i position of the boundary; scanning starts at this index
     * @return the word, or an empty string when there is none or {@code str} is null
     */
    private static String getWordAfterBoundary(String str, int i) {
        StringBuilder word = new StringBuilder();
        Debug.dfname("getWordAfterBoundary");
        Debug.denter(DT3296);
        if (str != null) {
            int length = str.length();
            Debug.dpr(DF3297, "The text we have so far is " + str);
            for (int pos = i; pos < length; pos++) {
                char c = str.charAt(pos);
                if (!Character.isWhitespace(c)) {
                    word.append(c);
                    Debug.dpr(DF3297, "adding [" + c + "] to " + word.toString().trim());
                } else if (word.length() > 0) {
                    // Whitespace after at least one collected character ends the word.
                    break;
                }
            }
        }
        String result = word.toString().trim();
        Debug.dpr(DF3297, "Word after boundary is [" + result + Category.CATEGORY_RIGHTBRACKET2);
        Debug.dexit(DT3296);
        return result;
    }
}
