package edu.ucla.sspace.tools;

import com.moms.lib_modules.cpi.Setting_SharePreferences;
import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.mains.OptionDescriptions;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.util.LoggerUtil;
import edu.ucla.sspace.util.TrieMap;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

/* loaded from: classes2.dex */
/**
 * Counts how often each token occurs in one or more text sources.
 *
 * <p>Tokens are produced by {@link IteratorFactory#tokenize}, optionally
 * lower-cased, and accumulated in an internal map from token to count.
 * Tokens made up solely of ASCII digits are folded into the single token
 * {@code <NUM>}, and tokens that match {@link #IGNORED_SYMBOL_PATTERN} are
 * skipped.
 *
 * <p>The class can also be run from the command line; see
 * {@link #main(String[])}.
 */
public class TokenCounter {
    private static final Logger LOGGER = Logger.getLogger(TokenCounter.class.getName());

    /** Number of counted tokens between two progress messages. */
    private static final int UPDATE_INTERVAL = 10000;

    /** Matches tokens that consist only of ASCII digits. */
    private static final Pattern NUMBER_PATTERN = Pattern.compile("[0-9]+");

    /**
     * Matches a token that is exactly one character long where that character
     * is outside the listed set (word characters, whitespace and the
     * enumerated punctuation).  Matching tokens are not counted.
     */
    private static final Pattern IGNORED_SYMBOL_PATTERN =
            Pattern.compile("[^\\w\\s;:\\(\\)\\[\\]'!/&?\",\\.<>]");

    /** Whether each token is lower-cased before it is counted. */
    private final boolean doLowerCasing;

    /** Running count for every token seen so far. */
    private final Map<String, Integer> tokenToCount;

    /** Creates a counter that leaves token case untouched. */
    public TokenCounter() {
        this(false);
    }

    /**
     * Creates a counter.
     *
     * @param z {@code true} to lower-case every token before counting it
     */
    public TokenCounter(boolean z) {
        this.doLowerCasing = z;
        // Raw type retained: TrieMap's type parameters are not visible from this file.
        this.tokenToCount = new TrieMap();
    }

    /**
     * Command-line entry point.  The first positional argument is the output
     * file; every following positional argument is an input file.  Each
     * output line holds a token, the {@code Setting_SharePreferences.YOIL_SPLIT}
     * separator, and the token's count.
     *
     * <p>With fewer than two positional arguments the usage text is printed
     * and nothing else happens.  Any failure during counting or writing is
     * logged at {@code SEVERE} level; the method then returns normally.
     *
     * @param strArr the raw command-line arguments
     */
    public static void main(String[] strArr) {
        ArgOptions argOptions = new ArgOptions();
        argOptions.addOption('Z', "stemmingAlgorithm", "specifices the stemming algorithm to use on tokens while iterating.  (default: none)", true, "CLASSNAME", "Tokenizing Options");
        argOptions.addOption('F', "tokenFilter", "filters to apply to the input token stream", true, "FILTER_SPEC", "Tokenizing Options");
        argOptions.addOption('C', "compoundWords", "a file where each line is a recognized compound word", true, "FILE", "Tokenizing Options");
        argOptions.addOption('L', "lowerCase", "lower-cases each token after all other filtering has been applied", false, null, "Tokenizing Options");
        argOptions.addOption('z', "wordLimit", "Set the maximum number of words an document can return", true, "INT", "Tokenizing Options");
        argOptions.addOption('v', "verbose", "Print verbose output about counting status", false, null, "Optional");
        argOptions.parseOptions(strArr);
        if (argOptions.numPositionalArgs() < 2) {
            System.out.println("usage: java TokenCounter [options] <output-file> <input-file> [<input-file>]*\n" + argOptions.prettyPrint() + "\n" + OptionDescriptions.COMPOUND_WORDS_DESCRIPTION + "\n\n" + OptionDescriptions.TOKEN_FILTER_DESCRIPTION);
            return;
        }
        if (argOptions.hasOption("verbose")) {
            LoggerUtil.setLevel(Level.FINE);
        }
        boolean hasOption = argOptions.hasOption("lowerCase");

        // Forward the tokenizing options to IteratorFactory through system properties.
        Properties properties = System.getProperties();
        if (argOptions.hasOption("tokenFilter")) {
            properties.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY, argOptions.getStringOption("tokenFilter"));
        }
        if (argOptions.hasOption("stemmingAlgorithm")) {
            properties.setProperty(IteratorFactory.STEMMER_PROPERTY, argOptions.getStringOption("stemmingAlgorithm"));
        }
        if (argOptions.hasOption("compoundWords")) {
            properties.setProperty(IteratorFactory.COMPOUND_TOKENS_FILE_PROPERTY, argOptions.getStringOption("compoundWords"));
        }
        if (argOptions.hasOption("wordLimit")) {
            properties.setProperty(IteratorFactory.TOKEN_COUNT_LIMIT_PROPERTY, argOptions.getStringOption("wordLimit"));
        }
        IteratorFactory.setProperties(properties);

        try {
            TokenCounter tokenCounter = new TokenCounter(hasOption);
            // Positional argument 0 is the output file, so input files start at index 1.
            int positionalCount = argOptions.numPositionalArgs();
            for (int i = 1; i < positionalCount; i++) {
                tokenCounter.processFile(argOptions.getPositionalArg(i));
            }
            PrintWriter printWriter = new PrintWriter(argOptions.getPositionalArg(0));
            try {
                for (Map.Entry<String, Integer> entry : tokenCounter.tokenToCount.entrySet()) {
                    printWriter.println(entry.getKey() + Setting_SharePreferences.YOIL_SPLIT + entry.getValue());
                }
                // PrintWriter never throws on write errors; ask it explicitly so a
                // truncated output file does not go unnoticed.
                if (printWriter.checkError()) {
                    throw new IOException("An I/O error occurred while writing the token counts");
                }
            } finally {
                printWriter.close();
            }
        } catch (Throwable th) {
            LOGGER.log(Level.SEVERE, "Token counting failed", th);
        }
    }

    /**
     * Consumes every token from the iterator and updates the counts.
     *
     * @param it the source of tokens; it is read until exhausted
     */
    private void process(Iterator<String> it) {
        long counted = 0;
        while (it.hasNext()) {
            String next = it.next();
            if (this.doLowerCasing) {
                next = next.toLowerCase();
            }
            // Fold every all-digit token into one placeholder token.
            if (NUMBER_PATTERN.matcher(next).matches()) {
                next = "<NUM>";
            }
            if (IGNORED_SYMBOL_PATTERN.matcher(next).matches()) {
                continue;
            }
            Integer previous = this.tokenToCount.get(next);
            this.tokenToCount.put(next, Integer.valueOf(previous != null ? 1 + previous.intValue() : 1));
            counted++;
            if (counted % UPDATE_INTERVAL == 0 && LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("Processed " + counted + " tokens.  Currently " + this.tokenToCount.size() + " unique tokens");
            }
        }
    }

    /**
     * Returns a read-only view of the counts gathered so far.
     *
     * @return an unmodifiable map from token to its count
     */
    public Map<String, Integer> getTokenCounts() {
        return Collections.unmodifiableMap(this.tokenToCount);
    }

    /**
     * Tokenizes the reader's content and adds the tokens to the counts.
     * The reader is not closed by this method.
     *
     * @param bufferedReader the text source to count
     */
    public void process(BufferedReader bufferedReader) {
        process(IteratorFactory.tokenize(bufferedReader));
    }

    /**
     * Tokenizes the given text and adds the tokens to the counts.
     *
     * @param str the text to count
     */
    public void process(String str) {
        process(IteratorFactory.tokenize(str));
    }

    /**
     * Counts the tokens of a file and closes the file afterwards.
     *
     * @param file the file to read
     * @throws IOException if the file cannot be opened or closed
     */
    public void processFile(File file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            process(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Counts the tokens of the named file and closes the file afterwards.
     *
     * @param str the path of the file to read
     * @throws IOException if the file cannot be opened or closed
     */
    public void processFile(String str) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(str));
        try {
            process(reader);
        } finally {
            reader.close();
        }
    }
}
