package edu.ucla.sspace.tools;

import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.dependency.CoNLLDependencyExtractor;
import edu.ucla.sspace.dependency.DependencyExtractor;
import edu.ucla.sspace.dependency.DependencyTreeNode;
import edu.ucla.sspace.dependency.WaCKyDependencyExtractor;
import edu.ucla.sspace.mains.OptionDescriptions;
import edu.ucla.sspace.text.DependencyFileDocumentIterator;
import edu.ucla.sspace.text.Document;
import edu.ucla.sspace.text.Stemmer;
import edu.ucla.sspace.text.TokenFilter;
import edu.ucla.sspace.util.LoggerUtil;
import edu.ucla.sspace.util.TrieMap;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

/* loaded from: classes.dex */
/**
 * Counts how often each token occurs in a set of dependency-parsed documents.
 *
 * <p>When run from the command line, the first positional argument names the
 * output file and every following positional argument names an input file.
 * The output contains one {@code "<token> <count>"} line per distinct token.
 *
 * <p>Instances keep their counts in a single mutable map and perform no
 * synchronization of their own.
 */
public class DepTokenCounter {
    private static final Logger LOGGER = Logger.getLogger(DepTokenCounter.class.getName());

    /** Number of processed tokens between two progress log messages. */
    private static final int UPDATE_INTERVAL = 10000;

    /** Whether each token is lower-cased before being counted. */
    private final boolean doLowerCasing;

    /** Whether the part-of-speech tag is appended to each token as {@code "-<pos>"}. */
    private final boolean doPos;

    /** Extractor used to read a dependency tree from each document. */
    private final DependencyExtractor extractor;

    /** Running count for every distinct (possibly lower-cased / POS-tagged) token. */
    private final Map<String, Integer> tokenToCount = new TrieMap();

    /**
     * Creates a counter.
     *
     * @param doLowerCasing if {@code true}, tokens are lower-cased before counting
     * @param doPos if {@code true}, each token is suffixed with {@code "-"} and its
     *        part-of-speech tag before counting
     * @param extractor the extractor used to read dependency trees; it is
     *        dereferenced for every processed document, so it must not be
     *        {@code null} by the time documents are processed
     */
    public DepTokenCounter(boolean doLowerCasing, boolean doPos, DependencyExtractor extractor) {
        this.doLowerCasing = doLowerCasing;
        this.doPos = doPos;
        this.extractor = extractor;
    }

    /**
     * Command-line entry point: parses the options, counts the tokens of every
     * input file and writes the counts to the output file.
     *
     * @param strArr the command-line arguments
     * @throws IllegalArgumentException if the requested dependency parse format
     *         is neither {@code CoNLL} nor {@code WaCKy}
     * @throws Exception if reading the input or writing the output fails
     */
    public static void main(String[] strArr) throws Exception {
        ArgOptions options = createOptions();
        options.parseOptions(strArr);
        if (options.numPositionalArgs() < 2) {
            System.out.println("usage: java DepTokenCounter [options] <output-file> <input-file> [<input-file>]*\n" + options.prettyPrint() + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }
        if (options.hasOption("verbose")) {
            LoggerUtil.setLevel(Level.FINE);
        }
        boolean lowerCase = options.hasOption("lowerCase");
        boolean usePos = options.hasOption("partOfSpeech");
        boolean discardHeader = options.hasOption('H');
        TokenFilter filter = options.hasOption("tokenFilter") ? TokenFilter.loadFromSpecification(options.getStringOption('F')) : null;
        Stemmer stemmer = (Stemmer) options.getObjectOption("stemmingAlgorithm", (String) null);
        String format = options.getStringOption("dependencyParseFormat", "CoNLL");

        // Fail before any input is read if the format is unknown, instead of
        // running into a NullPointerException on the first document.
        DependencyExtractor dependencyExtractor = createExtractor(format, filter, stemmer);

        DepTokenCounter counter = new DepTokenCounter(lowerCase, usePos, dependencyExtractor);
        // Positional argument 0 is the output file; the inputs start at index 1.
        for (int i = 1; i < options.numPositionalArgs(); i++) {
            counter.process(new DependencyFileDocumentIterator(options.getPositionalArg(i), discardHeader));
        }
        writeCounts(options.getPositionalArg(0), counter.tokenToCount);
    }

    /** Declares every command-line option understood by {@link #main(String[])}. */
    private static ArgOptions createOptions() {
        ArgOptions argOptions = new ArgOptions();
        argOptions.addOption('Z', "stemmingAlgorithm", "specifices the stemming algorithm to use on tokens while iterating.  (default: none)", true, "CLASSNAME", "Tokenizing Options");
        argOptions.addOption('F', "tokenFilter", "filters to apply to the input token stream", true, "FILTER_SPEC", "Tokenizing Options");
        argOptions.addOption('L', "lowerCase", "lower-cases each token after all other filtering has been applied", false, null, "Tokenizing Options");
        argOptions.addOption('P', "partOfSpeech", "use part of speech tags for each token.", false, null, "Tokenizing Options");
        argOptions.addOption('H', "discardHeader", "If true, the first line of each dependency document will be discarded.", false, null, "Tokenizing Options");
        argOptions.addOption('v', "verbose", "Print verbose output about counting status", false, null, "Optional");
        argOptions.addOption('D', "dependencyParseFormat", "the name of the dependency parsed format for the corpus (defalt: CoNLL)", true, "STR", "Advanced Dependency Parsing");
        return argOptions;
    }

    /** Maps a dependency parse format name to its extractor, rejecting unknown names. */
    private static DependencyExtractor createExtractor(String format, TokenFilter filter, Stemmer stemmer) {
        if (format.equals("CoNLL")) {
            return new CoNLLDependencyExtractor(filter, stemmer);
        }
        if (format.equals("WaCKy")) {
            return new WaCKyDependencyExtractor(filter, stemmer);
        }
        throw new IllegalArgumentException("Unrecognized dependency parse format: " + format + " (expected CoNLL or WaCKy)");
    }

    /** Writes one {@code "<token> <count>"} line per map entry to the named file. */
    private static void writeCounts(String outputFile, Map<String, Integer> counts) throws IOException {
        PrintWriter printWriter = new PrintWriter(outputFile);
        try {
            for (Map.Entry<String, Integer> entry : counts.entrySet()) {
                printWriter.printf("%s %d\n", entry.getKey(), entry.getValue());
            }
            // PrintWriter never throws on a failed write; checkError() flushes
            // the stream and reports whether any write has failed so far.
            if (printWriter.checkError()) {
                throw new IOException("Failed to write token counts to " + outputFile);
            }
        } finally {
            // Release the file handle even when writing fails part-way.
            printWriter.close();
        }
    }

    /**
     * Reads one dependency tree from every document supplied by the iterator
     * and adds the tokens of its nodes to the running counts.
     *
     * @param it the documents to process
     * @throws IOException if the extractor fails to read a document
     */
    private void process(Iterator<Document> it) throws IOException {
        long processed = 0;
        while (it.hasNext()) {
            DependencyTreeNode[] nodes = this.extractor.readNextTree(it.next().reader());
            // NOTE(review): defensive guard -- the extractor presumably returns
            // null when a document holds no tree; confirm against its contract.
            if (nodes == null) {
                continue;
            }
            for (DependencyTreeNode node : nodes) {
                String token = node.word();
                if (this.doLowerCasing) {
                    token = token.toLowerCase();
                }
                if (this.doPos) {
                    token = token + "-" + node.pos();
                }
                Integer previous = this.tokenToCount.get(token);
                this.tokenToCount.put(token, Integer.valueOf(previous != null ? 1 + previous.intValue() : 1));
                processed++;
                if (processed % UPDATE_INTERVAL == 0) {
                    LOGGER.fine("Processed " + processed + " tokens.  Currently " + this.tokenToCount.size() + " unique tokens");
                }
            }
        }
    }

    /**
     * Returns a read-only view of the counts gathered so far.
     *
     * @return an unmodifiable view of the token-to-count map
     */
    public Map<String, Integer> getTokenCounts() {
        return Collections.unmodifiableMap(this.tokenToCount);
    }
}
