package gov.nih.nlm.nls.nlp.indexMaker;

import gov.nih.nlm.nls.lvg.Api.LvgLexItemApi;
import gov.nih.nlm.nls.lvg.Lib.LexItem;
import gov.nih.nlm.nls.nlp.LexiconDataPath;
import gov.nih.nlm.nls.nlp.parser.Parse;
import gov.nih.nlm.nls.nlp.taggerservices.TaggerClientMain;
import gov.nih.nlm.nls.nlp.textfeatures.Category;
import gov.nih.nlm.nls.nlp.textfeatures.Collection;
import gov.nih.nlm.nls.nlp.textfeatures.Document;
import gov.nih.nlm.nls.nlp.textfeatures.MmObject;
import gov.nih.nlm.nls.nlp.textfeatures.Phrase;
import gov.nih.nlm.nls.nlp.textfeatures.Span;
import gov.nih.nlm.nls.nlp.tokenizer.TokenizeAPI;
import gov.nih.nlm.nls.utils.Debug;
import gov.nih.nlm.nls.utils.FileUtilities;
import gov.nih.nlm.nls.utils.GlobalBehavior;
import gov.nih.nlm.nls.utils.U;
import gov.nih.nlm.nls.utils.Use;
import gov.nih.nlm.nls.utils.Version;
import gov.nih.nlm.nls.utils.WildCardFilter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.Charset;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Vector;

/* loaded from: input_file:gov/nih/nlm/nls/nlp/indexMaker/IndexMaker.class */
public class IndexMaker {
    // Configuration object handed to the constructor; read for every "--option" lookup in this class.
    private GlobalBehavior settings;
    // Collection processors; analizeFile() picks one of the three according to chunkSize.
    private TokenizeAPI tokenizer;
    private TaggerClientMain lexicalLookup;
    private Parse parser;
    // Filter switches, loaded in the constructor from the "--filterOut*" settings and applied by applyFilters().
    private boolean filterOutPunctuation;
    private boolean filterOutNumbers;
    private boolean filterOutStopWords;
    private boolean filterOutSmallWords;
    // Not referenced in the visible code; the values match the chunkSize cases 1/2/3
    // that setChunkSize() assigns for "word"/"term"/"phrase".
    private static final int TOKENS = 1;
    private static final int TERMS = 2;
    private static final int PHRASES = 3;
    // Numeric ids passed to Debug.denter()/Debug.dexit()/Debug.dpr().
    // NOTE(review): presumably DT = trace id and DF = message id — confirm against the Debug class.
    private static final int DT17124 = 17124;
    private static final int DF17125 = 17125;
    private static final int DT17126 = 17126;
    private static final int DF17127 = 17127;
    private static final int DT17128 = 17128;
    private static final int DF17129 = 17129;
    private static final int DT17130 = 17130;
    private static final int DF17131 = 17131;
    private static final int DT17132 = 17132;
    private static final int DF17133 = 17133;
    private static final int DT17134 = 17134;
    private static final int DF17135 = 17135;
    private static final int DT17136 = 17136;
    private static final int DF17137 = 17137;
    private static final int DT17138 = 17138;
    private static final int DF17139 = 17139;
    private static final int DT17140 = 17140;
    private static final int DF17141 = 17141;
    private static final int DT17142 = 17142;
    private static final int DF17143 = 17143;
    // Initial capacity of the per-document "already seen" hashtable created in indexDocument().
    private static int AVG_DOC_UNIQ_WORDS = 1000;
    // Shared placeholder value stored in hashtables that are only used for key membership.
    private static final int[] dummyObject = new int[1];
    // Value of "--tmpDatabasePath"; appended to "jdbc:hsqldb:" to form the JDBC URL in initTables().
    private String databasePath = null;
    private Connection jdbcConnection = null;
    // Prepared statements created in initTables() and closed in finalize().
    private PreparedStatement termFreqFromCorpusStats = null;
    private PreparedStatement docFreqFromCorpusStats = null;
    private PreparedStatement insertTermFreqToCorpusStats = null;
    private PreparedStatement updateTermFreqInCorpusStats = null;
    private PreparedStatement updateDocFreqInCorpusStats = null;
    private PreparedStatement insertOffsetsToCorpusOffsets = null;
    // True once initLvg() has created lvgAPI; transformKey() then runs each key through it.
    private boolean transformTerm = false;
    private LvgLexItemApi lvgAPI = null;
    // Reusable input item that transformKey() refills for every key.
    private LexItem lvgInput = null;
    // Indexing unit chosen by setChunkSize(): 1 = word, 2 = term, 3 = phrase.
    private int chunkSize = 1;
    // Lower-cased stop words (keys only); populated by loadStopWordList() when stop-word filtering is on.
    private Hashtable stopWordList = null;

    /**
     * Creates an index maker driven by the given settings.
     *
     * <p>Builds the tokenizer, lexical lookup and parser, creates the database
     * tables and prepared statements, configures the optional LVG transformation,
     * selects the indexing unit, and reads the four {@code --filterOut*} switches.
     * The stop-word list is loaded only when stop-word filtering is enabled.
     *
     * @param globalBehavior settings object used for every option lookup
     * @throws Exception if any of the collaborators or initialisation steps fails
     */
    public IndexMaker(GlobalBehavior globalBehavior) throws Exception {
        Debug.dfname("IndexMaker:Constructor");
        Debug.denter(DT17124);

        // Collaborators, all configured from the same settings object.
        this.settings = globalBehavior;
        this.tokenizer = new TokenizeAPI(globalBehavior);
        this.lexicalLookup = new TaggerClientMain(globalBehavior);
        this.parser = new Parse(globalBehavior);

        // Storage, term transformation and indexing granularity.
        initTables();
        initLvg();
        setChunkSize();

        // Key filters.
        this.filterOutPunctuation = globalBehavior.getBoolean("--filterOutPunctuation");
        this.filterOutNumbers = globalBehavior.getBoolean("--filterOutNumbers");
        this.filterOutStopWords = globalBehavior.getBoolean("--filterOutStopWords");
        this.filterOutSmallWords = globalBehavior.getBoolean("--filterOutSmallWords");
        if (this.filterOutStopWords) {
            loadStopWordList();
        }

        Debug.dexit(DT17124);
    }

    /**
     * Indexes every document of every input file, writes the frequency index,
     * and releases the database and helper resources.
     *
     * <p>Progress is reported on {@code System.err}. Cleanup via {@link #finalize()}
     * now also runs when indexing fails, so the database connection is shut down
     * and temporary files are removed on the error path as well. A cleanup failure
     * that follows an indexing failure is only reported, so that the original
     * exception reaches the caller.
     *
     * @throws Exception if reading, indexing, reporting, or (after a successful
     *                   run) cleanup fails
     */
    public final void run() throws Exception {
        Debug.dfname("run");
        Debug.denter(DT17130);
        boolean indexed = false;
        try {
            File[] files = FileUtilities.getFiles(this.settings);
            for (int i = 0; i < files.length; i++) {
                System.err.print("Processing Collection " + files[i].getName() + "..");
                Iterator it = analizeFile(files[i]).iterator();
                while (it.hasNext()) {
                    Document document = (Document) it.next();
                    System.err.print(" Processing Document " + document.getDocumentName() + " .");
                    System.err.print(".");
                    System.err.print(".");
                    indexDocument(document);
                    System.err.print(".");
                    System.err.println();
                }
            }
            displayFreqIndex();
            indexed = true;
        } finally {
            if (indexed) {
                // Normal path: a cleanup failure is the only failure, so let it propagate.
                finalize();
            } else {
                // Failure path: still release resources, but never mask the primary exception.
                try {
                    finalize();
                } catch (Exception cleanupFailure) {
                    System.err.println("Cleanup after a failed run also failed: " + cleanupFailure);
                }
            }
        }
        Debug.dexit(DT17130);
    }

    /**
     * Loads one input file as a collection and runs the processor that matches
     * the configured indexing unit over it.
     *
     * @param file input file; its absolute path is handed to the collection
     * @return the documents of the processed collection
     * @throws Exception if the collection cannot be built or processed
     */
    Vector analizeFile(File file) throws Exception {
        Debug.dfname("analizeFile");
        Debug.denter(DT17132);

        final Collection corpus = new Collection(file.getAbsolutePath(), this.settings);
        final int unit = this.chunkSize;
        if (unit == 1) {
            // word level
            this.tokenizer.processCollection(corpus);
        } else if (unit == 2) {
            // term level
            this.lexicalLookup.processCollection(corpus);
        } else if (unit == 3) {
            // phrase level
            this.parser.processCollection(corpus);
        }
        // Any other value leaves the collection unprocessed.

        final Vector result = corpus.getDocuments();
        Debug.dexit(DT17132);
        return result;
    }

    /**
     * Opens the HSQLDB connection (if not already open), creates the
     * {@code corpusStats} and {@code corpusOffsets} tables plus their index, and
     * prepares the statements used by the frequency and offset methods.
     *
     * <p>The database location comes from {@code --tmpDatabasePath}.
     *
     * <p>An {@link SQLException} with SQLState {@code S0002} and vendor code
     * {@code -22} is reported as "table not found" and tolerated; every other SQL
     * error is printed and rethrown as a {@link RuntimeException} that carries the
     * original exception as its cause.
     * NOTE(review): when the tolerated error occurs, statements after the failing
     * call are not prepared and stay {@code null} — confirm this path is reachable.
     *
     * @throws Exception if the JDBC driver class cannot be loaded
     */
    void initTables() throws Exception {
        Debug.dfname("initTables");
        Debug.denter(DT17134);
        this.databasePath = this.settings.getString("--tmpDatabasePath");
        Properties properties = new Properties();
        properties.setProperty("hsqldb.log_size", "0");
        properties.setProperty("hsqldb.default_table_type", "CACHED");
        properties.setProperty("user", "sa");
        properties.setProperty("password", "");
        try {
            if (this.jdbcConnection == null) {
                Class.forName("org.hsqldb.jdbcDriver");
                this.jdbcConnection = DriverManager.getConnection("jdbc:hsqldb:" + this.databasePath, properties);
                this.jdbcConnection.setAutoCommit(true);
            }
            Statement ddl = this.jdbcConnection.createStatement();
            try {
                ddl.executeUpdate("CREATE TABLE corpusStats ( name        VARCHAR(255)  NOT NULL, termFreq        INTEGER  NOT NULL, docFreq         INTEGER  NOT NULL, PRIMARY KEY( name ) )");
                ddl.executeUpdate("CREATE TABLE corpusOffsets ( name        VARCHAR(255)  NOT NULL, docName     VARCHAR(255)  NOT NULL, beginOffset     INTEGER  NOT NULL, endOffset       INTEGER  NOT NULL) ");
                ddl.executeUpdate("CREATE INDEX T1 ON corpusOffsets (  name, docName, beginOffset ) ");
            } finally {
                // The DDL statement is only needed here; release it even when a CREATE fails.
                ddl.close();
            }
            this.insertTermFreqToCorpusStats = this.jdbcConnection.prepareStatement("INSERT INTO corpusStats (name, termFreq, docFreq) VALUES (?,?,?)");
            this.termFreqFromCorpusStats = this.jdbcConnection.prepareStatement("SELECT DISTINCT termFreq FROM corpusStats  WHERE name=?");
            this.docFreqFromCorpusStats = this.jdbcConnection.prepareStatement("SELECT DISTINCT docFreq  FROM corpusStats  WHERE name=?");
            this.updateTermFreqInCorpusStats = this.jdbcConnection.prepareStatement("UPDATE corpusStats SET termFreq=? WHERE name=?");
            this.updateDocFreqInCorpusStats = this.jdbcConnection.prepareStatement("UPDATE corpusStats SET docFreq = ? WHERE name=?");
            this.insertOffsetsToCorpusOffsets = this.jdbcConnection.prepareStatement("INSERT INTO corpusOffsets (name, docName, beginOffset, endOffset) VALUES(?,?,?,?)");
        } catch (SQLException e) {
            // getSQLState() may be null, so compare from the literal side.
            boolean tableNotFound = "S0002".equals(e.getSQLState()) && e.getErrorCode() == -22;
            if (!tableNotFound) {
                System.out.println("SQLException: " + e.getMessage());
                System.out.println("SQLState:     " + e.getSQLState());
                System.out.println("VendorError:  " + e.getErrorCode());
                throw new RuntimeException(e);
            }
            System.out.println("-- Table not found, first time load!");
        }
        Debug.dexit(DT17134);
    }

    /**
     * Releases everything this instance holds: the prepared statements, the
     * HSQLDB connection (after issuing {@code SHUTDOWN}), the temporary database
     * files unless {@code --keepDatabase} is set, the lexical lookup and the parser.
     *
     * <p>The method is idempotent: every released resource is set to {@code null}
     * and skipped on later calls. This matters because {@code run()} calls it
     * explicitly and, as an override of {@code Object.finalize()}, the garbage
     * collector may call it again.
     *
     * @throws Exception        if releasing the lexical lookup or parser fails
     * @throws RuntimeException wrapping any {@link SQLException} raised while
     *                          closing statements or shutting the database down
     */
    public void finalize() throws Exception {
        Debug.dfname("finalize");
        Debug.denter(DT17134);
        try {
            closeStatement(this.insertTermFreqToCorpusStats);
            this.insertTermFreqToCorpusStats = null;
            closeStatement(this.termFreqFromCorpusStats);
            this.termFreqFromCorpusStats = null;
            closeStatement(this.docFreqFromCorpusStats);
            this.docFreqFromCorpusStats = null;
            closeStatement(this.updateTermFreqInCorpusStats);
            this.updateTermFreqInCorpusStats = null;
            closeStatement(this.updateDocFreqInCorpusStats);
            this.updateDocFreqInCorpusStats = null;
            closeStatement(this.insertOffsetsToCorpusOffsets);
            this.insertOffsetsToCorpusOffsets = null;

            if (this.jdbcConnection != null) {
                this.jdbcConnection.createStatement().execute("SHUTDOWN");
                this.jdbcConnection.close();
                this.jdbcConnection = null;
                if (!this.settings.getBoolean("--keepDatabase")) {
                    String pattern = FileUtilities.getFilePattern(this.databasePath) + Category.CATEGORY_ASTERISK2;
                    File directory = new File(FileUtilities.getPath(this.databasePath));
                    File[] leftovers = directory.listFiles((FilenameFilter) new WildCardFilter(pattern));
                    // listFiles() returns null when the directory is missing or unreadable.
                    if (leftovers != null) {
                        for (File leftover : leftovers) {
                            leftover.delete();
                        }
                    }
                }
            }
            if (this.lexicalLookup != null) {
                this.lexicalLookup.cleanup();
                this.lexicalLookup = null;
            }
            if (this.parser != null) {
                this.parser.finalize();
                this.parser = null;
            }
            Debug.dexit(DT17134);
        } catch (SQLException e) {
            System.out.println("SQLException: " + e.getMessage());
            System.out.println("SQLState:     " + e.getSQLState());
            System.out.println("VendorError:  " + e.getErrorCode());
            throw new RuntimeException(e);
        }
    }

    /** Closes the statement if it is non-null. */
    private static void closeStatement(Statement statement) throws SQLException {
        if (statement != null) {
            statement.close();
        }
    }

    /**
     * Adds one document to the index.
     *
     * <p>For every key produced from the document's units, the corpus frequency
     * is incremented, the document frequency is incremented the first time the
     * key is seen in this document, and the unit's span is recorded in the
     * offsets table under the document's name.
     *
     * @param document document whose tokens, terms or phrases are indexed
     * @throws Exception if key extraction or a database operation fails
     */
    void indexDocument(Document document) throws Exception {
        Debug.dfname("indexDocument");
        Debug.denter(DT17134);

        // Keys already counted for this document; only membership matters.
        Hashtable seenHere = new Hashtable(AVG_DOC_UNIQ_WORDS);
        List<MmObject> units = getKeyList(document);
        if (units == null) {
            Debug.dexit(DT17134);
            return;
        }

        for (Iterator<MmObject> unitIt = units.iterator(); unitIt.hasNext();) {
            MmObject unit = unitIt.next();
            for (Object variant : transformKey(unit)) {
                String key = (String) variant;
                Span location = unit.getSpan();

                int corpusCount = getCorpusFrequency(key);
                setCorpusFrequency(key, corpusCount + 1);

                if (!seenHere.containsKey(key)) {
                    int documentCount = getDocumentFrequency(key);
                    setDocumentFrequency(key, documentCount + 1);
                    seenHere.put(key, dummyObject);
                }

                addToOffsetList(key, document.getDocumentName(), location);
            }
        }

        Debug.dexit(DT17134);
    }

    /**
     * Inserts one occurrence row into {@code corpusOffsets}.
     *
     * @param str  index key
     * @param str2 name of the document that contains the occurrence
     * @param span span whose begin and end character positions are stored
     * @throws Exception if the insert fails
     */
    void addToOffsetList(String str, String str2, Span span) throws Exception {
        Debug.dfname("addToOffsetlist");
        Debug.denter(DT17136);

        final int from = span.getBeginCharacter();
        final int to = span.getEndCharacter();

        final PreparedStatement insert = this.insertOffsetsToCorpusOffsets;
        insert.setString(1, str);
        insert.setString(2, str2);
        insert.setInt(3, from);
        insert.setInt(4, to);
        insert.executeUpdate();

        Debug.dexit(DT17136);
    }

    /**
     * Returns the stored corpus-wide term frequency for a key, creating the
     * {@code corpusStats} row with both frequencies set to 0 when the key is new.
     *
     * <p>The result set is closed on every path, including the "no row" case
     * that previously left it open.
     *
     * @param str index key
     * @return the stored term frequency, or 0 for a key that was just inserted
     * @throws Exception if the query or the insert fails
     */
    int getCorpusFrequency(String str) throws Exception {
        Debug.dfname("getCorpusFrequency");
        Debug.denter(DT17136);
        int frequency = 0;
        boolean found = false;
        this.termFreqFromCorpusStats.setString(1, str);
        ResultSet rows = this.termFreqFromCorpusStats.executeQuery();
        try {
            if (rows != null && rows.next()) {
                frequency = rows.getInt(1);
                found = true;
            }
        } finally {
            if (rows != null) {
                rows.close();
            }
        }
        if (!found) {
            // First sighting of this key: create its row with zero counts.
            this.insertTermFreqToCorpusStats.setString(1, str);
            this.insertTermFreqToCorpusStats.setInt(2, 0);
            this.insertTermFreqToCorpusStats.setInt(3, 0);
            this.insertTermFreqToCorpusStats.executeUpdate();
            Debug.dpr(DF17137, "Just added |" + str + Category.CATEGORY_BAR2);
        }
        Debug.dexit(DT17136);
        return frequency;
    }

    /**
     * Returns the stored document frequency for a key.
     *
     * <p>The result set is closed on every path, including the "no row" case
     * that previously left it open.
     *
     * @param str index key
     * @return the stored document frequency, or 0 when the key has no row
     * @throws Exception if the query fails
     */
    int getDocumentFrequency(String str) throws Exception {
        Debug.dfname("getDocumentFrequency");
        Debug.denter(DT17136);
        int frequency = 0;
        this.docFreqFromCorpusStats.setString(1, str);
        ResultSet rows = this.docFreqFromCorpusStats.executeQuery();
        try {
            if (rows != null && rows.next()) {
                frequency = rows.getInt(1);
            }
        } finally {
            if (rows != null) {
                rows.close();
            }
        }
        Debug.dexit(DT17136);
        return frequency;
    }

    /**
     * Stores a new corpus-wide term frequency for an existing key.
     *
     * @param str index key
     * @param i   term frequency to store
     * @throws RuntimeException wrapping the {@link SQLException} if the update
     *                          fails; the details are also printed
     * @throws Exception        declared for callers; not thrown directly here
     */
    void setCorpusFrequency(String str, int i) throws Exception {
        Debug.dfname("setCorpusFrequency");
        Debug.denter(DT17136);
        try {
            this.updateTermFreqInCorpusStats.setString(2, str);
            this.updateTermFreqInCorpusStats.setInt(1, i);
            this.updateTermFreqInCorpusStats.executeUpdate();
            Debug.dexit(DT17136);
        } catch (SQLException e) {
            System.err.println("for Pkey = " + str + Category.CATEGORY_BAR2 + i);
            System.out.println("SQLException: " + e.getMessage());
            System.out.println("SQLState:     " + e.getSQLState());
            System.out.println("VendorError:  " + e.getErrorCode());
            // Keep the SQL failure as the cause so the stack trace shows what went wrong.
            throw new RuntimeException(e);
        }
    }

    /**
     * Stores a new document frequency for an existing key.
     *
     * @param str index key
     * @param i   document frequency to store
     * @throws RuntimeException wrapping the {@link SQLException} if the update
     *                          fails; the details are also printed
     * @throws Exception        declared for callers; not thrown directly here
     */
    void setDocumentFrequency(String str, int i) throws Exception {
        Debug.dfname("setDocumentFrequency");
        Debug.denter(DT17136);
        try {
            this.updateDocFreqInCorpusStats.setString(2, str);
            this.updateDocFreqInCorpusStats.setInt(1, i);
            this.updateDocFreqInCorpusStats.executeUpdate();
            Debug.dexit(DT17136);
        } catch (SQLException e) {
            System.out.println("SQLException: " + e.getMessage());
            System.out.println("SQLState:     " + e.getSQLState());
            System.out.println("VendorError:  " + e.getErrorCode());
            // Keep the SQL failure as the cause so the stack trace shows what went wrong.
            throw new RuntimeException(e);
        }
    }

    /**
     * Writes the frequency index, ordered by descending term frequency, as
     * UTF-8 text with one padded {@code name|termFreq|docFreq} row per key.
     *
     * <p>The destination is {@code <--outputFileName>.freq}, or standard output
     * when the option is unset or equals "null" (case-insensitive). Standard
     * output is flushed but left open so later messages are not lost; a file
     * writer is closed. The statement and result set are released even when
     * reading fails.
     *
     * @throws Exception if the output file cannot be opened or the query fails
     */
    final void displayFreqIndex() throws Exception {
        Debug.dfname("displayFreqIndex");
        Debug.denter(DT17136);
        String outputName = this.settings.getString("--outputFileName");
        boolean toStdout = outputName == null || outputName.equalsIgnoreCase("null");
        PrintWriter out;
        if (toStdout) {
            out = new PrintWriter(new OutputStreamWriter(System.out, Charset.forName("UTF-8")));
        } else {
            out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputName + ".freq"), Charset.forName("UTF-8")));
        }
        try {
            Statement query = this.jdbcConnection.createStatement();
            try {
                ResultSet rows = query.executeQuery("SELECT name, termFreq, docFreq FROM corpusStats  ORDER BY termFreq DESC");
                try {
                    while (rows.next()) {
                        out.println(U.concat(U.pad(rows.getString(1) + Category.CATEGORY_BAR2, 50), Category.CATEGORY_BAR2, U.pad(rows.getInt(2), 10), Category.CATEGORY_BAR2, U.pad(rows.getInt(3), 10)));
                    }
                } finally {
                    rows.close();
                }
            } finally {
                query.close();
            }
        } finally {
            if (toStdout) {
                // Never close System.out: the process may still need to print to it.
                out.flush();
            } else {
                out.close();
            }
        }
        Debug.dexit(DT17136);
    }

    /**
     * Placeholder: the body only emits the debug enter/exit calls and produces
     * no output. The {@code globalBehavior} argument is not used.
     *
     * @param globalBehavior unused
     * @throws Exception declared only; nothing in the visible body throws it
     */
    final void displayOffestIndex(GlobalBehavior globalBehavior) throws Exception {
        Debug.dfname("displayOffestIndex");
        Debug.denter(DT17138);
        Debug.dexit(DT17138);
    }

    /**
     * Placeholder: emits the debug enter/exit calls and always returns
     * {@code null}. The {@code str} argument is not used.
     *
     * @param str unused
     * @return always {@code null}
     * @throws Exception declared only; nothing in the visible body throws it
     */
    final int[] getOffsetsForKey(String str) throws Exception {
        Debug.dfname("getOffsetsForKey");
        Debug.denter(DT17140);
        Debug.dexit(DT17140);
        return null;
    }

    /**
     * Produces the index keys for one text unit.
     *
     * <p>The unit's filtered key string is returned as-is when no LVG
     * transformation is configured; otherwise it is run through the LVG API and
     * the target term of every output item is returned. A null or empty key
     * yields an empty list. LVG failures are reported on {@code System.err} and
     * result in whatever was collected before the failure.
     *
     * @param mmObject token, term or phrase to derive keys from
     * @return list of key strings, possibly empty
     * @throws Exception if key extraction fails
     */
    ArrayList transformKey(MmObject mmObject) throws Exception {
        ArrayList keys = new ArrayList(1);
        // NOTE(review): the trace label below names a different method — looks copy-pasted; confirm before changing.
        Debug.dfname("getFreqsForKey");
        Debug.denter(DT17142);

        String key = getKeyString(mmObject);
        boolean usable = key != null && key.length() > 0;

        if (usable && !this.transformTerm) {
            keys.add(key);
        } else if (usable) {
            // Refill the shared input item for this key.
            this.lvgInput.SetSourceTerm(key);
            this.lvgInput.SetSourceCategory(2047L);
            this.lvgInput.SetSourceInflection(16777215L);
            this.lvgInput.SetOriginalTerm(key);
            try {
                for (Iterator outputs = this.lvgAPI.ProcessLexItem(this.lvgInput).iterator(); outputs.hasNext();) {
                    LexItem output = (LexItem) outputs.next();
                    keys.add(output.GetTargetTerm());
                }
            } catch (Exception e) {
                System.err.println("Something went wrong with " + key);
                System.err.println(e.toString());
            }
        }

        Debug.dexit(DT17142);
        return keys;
    }

    /**
     * Returns the document's units at the configured indexing granularity:
     * tokens for 1, lexical elements for 2, phrases for 3.
     *
     * @param document document to read the units from
     * @return the matching unit list, or {@code null} for any other granularity
     * @throws Exception if the document accessor fails
     */
    List getKeyList(Document document) throws Exception {
        // NOTE(review): the trace label below names a different method — looks copy-pasted; confirm before changing.
        Debug.dfname("getFreqsForKey");
        Debug.denter(DT17142);

        final int unit = this.chunkSize;
        Vector units;
        if (unit == 1) {
            units = document.getTokens();
        } else if (unit == 2) {
            units = document.getLexicalElements();
        } else if (unit == 3) {
            units = document.getPhrases();
        } else {
            units = null;
        }

        Debug.dexit(DT17142);
        return units;
    }

    /**
     * Extracts the raw key text of a unit and passes it through the filters.
     *
     * <p>Words and terms (granularity 1 and 2) use the trimmed string; phrases
     * (granularity 3) use the phrase's NP string. Any other granularity feeds
     * {@code null} to the filters.
     *
     * @param mmObject unit to read; must be a {@code Phrase} at granularity 3
     * @return the result of {@code applyFilters} on the raw key text
     * @throws Exception if the unit accessor fails
     */
    String getKeyString(MmObject mmObject) throws Exception {
        Debug.dfname("getKeyString");
        Debug.denter(DT17142);

        final int unit = this.chunkSize;
        String raw;
        if (unit == 1 || unit == 2) {
            raw = mmObject.getTrimmedString();
        } else if (unit == 3) {
            raw = ((Phrase) mmObject).getNpString();
        } else {
            raw = null;
        }

        final String filtered = applyFilters(raw);
        Debug.dexit(DT17142);
        return filtered;
    }

    /**
     * Selects the indexing granularity from {@code --indexType}
     * (case-insensitive): "term" gives 2, "phrase" gives 3, and everything else
     * — including "word" and a missing setting — gives 1.
     *
     * <p>The comparison is made from the literal side so that an unset option
     * falls back to word indexing instead of throwing a NullPointerException.
     */
    void setChunkSize() {
        Debug.dfname("setChunkSize");
        Debug.denter(DT17142);
        String indexType = this.settings.getString("--indexType");
        if ("term".equalsIgnoreCase(indexType)) {
            this.chunkSize = 2;
        } else if ("phrase".equalsIgnoreCase(indexType)) {
            this.chunkSize = 3;
        } else {
            // "word", unknown values and null all mean word-level indexing.
            this.chunkSize = 1;
        }
        Debug.dexit(DT17142);
    }

    /**
     * Applies the enabled key filters.
     *
     * <p>A key is rejected when it is {@code null}, or when an enabled filter
     * matches: punctuation, number, stop word (compared in lower case), or a
     * length of at most one character. The null check prevents the
     * NullPointerException that the stop-word and small-word filters raised
     * when no key text was available.
     *
     * @param str candidate key, may be {@code null}
     * @return {@code str} unchanged when it passes, otherwise {@code null}
     */
    String applyFilters(String str) {
        Debug.dfname("applyFilters");
        Debug.denter(DT17142);
        boolean rejected = str == null
                || (this.filterOutPunctuation && U.isPunctuation(str))
                || (this.filterOutNumbers && U.isNumber(str))
                || (this.filterOutStopWords && this.stopWordList != null
                        && this.stopWordList.containsKey(str.toLowerCase()))
                || (this.filterOutSmallWords && str.length() <= 1);
        String result = rejected ? null : str;
        Debug.dexit(DT17142);
        return result;
    }

    /**
     * Configures the optional LVG key transformation.
     *
     * <p>An explicit {@code --lvgCommands} value (non-empty and not the literal
     * "null") takes precedence; otherwise {@code --normalize} selects the flow
     * {@code -f:N}. When either applies, the LVG API and a reusable input item
     * are created and key transformation is switched on.
     *
     * @throws Exception if the LVG API cannot be created
     */
    void initLvg() throws Exception {
        Debug.dfname("initlvg");
        Debug.denter(DT17142);

        final String requested = this.settings.getString("--lvgCommands");
        final boolean explicit = requested != null && requested.length() > 0 && !requested.equals("null");

        String flow = null;
        if (explicit) {
            flow = requested;
        } else if (this.settings.getBoolean("--normalize")) {
            flow = "-f:N";
        }

        if (flow != null) {
            this.lvgAPI = new LvgLexItemApi(flow);
            this.transformTerm = true;
            this.lvgInput = new LexItem("null", 2047L, 16777215L);
        }

        Debug.dexit(DT17142);
    }

    /**
     * Loads {@code indexStopWords.txt} from the lexicon directory into the
     * stop-word table, one lower-cased entry per line.
     *
     * <p>Loading is best effort: any failure is reported on {@code System.err}
     * and leaves the table with whatever was read so far. The reader is closed
     * even when reading fails part-way through.
     *
     * @throws Exception declared for callers; failures are reported, not thrown
     */
    void loadStopWordList() throws Exception {
        Debug.dfname("loadStopWordList");
        Debug.denter(DT17142);
        this.stopWordList = new Hashtable(100);
        try {
            BufferedReader reader = new BufferedReader(new FileReader(U.concat(LexiconDataPath.getLexiconDirPath(this.settings), U.FS, "indexStopWords.txt")));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    this.stopWordList.put(line.toLowerCase(), dummyObject);
                }
            } finally {
                // Release the file handle on the error path too.
                reader.close();
            }
        } catch (Exception e) {
            System.err.println(e.toString());
            System.err.println("Could not open or read the indexStopWord List");
            System.err.println("Place such a file at data/200X/lexicon/indexStopWords.txt");
        }
        Debug.dexit(DT17142);
    }

    /**
     * Command-line entry point.
     *
     * <p>{@code --help} prints the usage text, {@code --version} prints the
     * version, and anything else builds an {@code IndexMaker} and runs it.
     * Failures are reported on {@code System.err} with a stack trace. The
     * process exits with status 0 on success and 1 when an exception was
     * caught, so callers can detect a failed run.
     *
     * @param strArr command-line arguments, passed to the settings object
     */
    public static final void main(String[] strArr) {
        Debug.dfname("main");
        Debug.denter(DT17126);
        int exitStatus = 0;
        try {
            GlobalBehavior globalBehavior = new GlobalBehavior("indexMaker", "NLPRegistry.cfg", "NLP.cfg", strArr);
            if (globalBehavior.getBoolean("--help")) {
                _usage();
            } else if (globalBehavior.getBoolean("--version")) {
                System.out.println("IndexMaker Version : " + Version.getVersion("gov/nih/nlm/nls/nlp/indexMaker/history.txt"));
            } else {
                new IndexMaker(globalBehavior).run();
            }
        } catch (Exception e) {
            System.err.println("Something went wrong " + e.toString());
            e.printStackTrace();
            // Report the failure to the calling shell instead of claiming success.
            exitStatus = 1;
        }
        Debug.dexit(DT17126);
        System.exit(exitStatus);
    }

    /**
     * Prints the help text bundled as the system resource
     * {@code gov/nih/nlm/nls/nlp/indexMaker/IndexMaker.hlp}; does nothing when
     * the resource cannot be found.
     */
    private static final void _usage() {
        Debug.dfname("_usage");
        Debug.denter(DT17128);

        ClassLoader.getSystemClassLoader();
        final InputStream helpText = ClassLoader.getSystemResourceAsStream("gov/nih/nlm/nls/nlp/indexMaker/IndexMaker.hlp");
        if (helpText == null) {
            Debug.dexit(DT17128);
            return;
        }

        Use.usage(helpText);
        Debug.dexit(DT17128);
    }
}
