package edu.ucla.sspace.text;

import com.moms.lib_modules.cpi.Setting_SharePreferences;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

/* loaded from: classes2.dex */
/**
 * Normalizes raw document text into a lower-cased, whitespace-delimited token
 * stream and, when a valid-word list has been supplied, rejects or filters
 * documents according to how many of their tokens appear in that list.
 *
 * <p>The valid-word set is populated only by the constructors; the
 * {@code process} methods only read it.
 */
public class DocumentPreprocessor {
    /** Minimum fraction of tokens that must be valid words for a document to be kept. */
    private static final double MIN_VALID_RATIO = 0.4d;

    /** Tokens longer than this many characters are discarded. */
    private static final int MAX_TOKEN_LENGTH = 20;

    /** Charset used when hashing document text, so digests do not depend on the platform default. */
    private static final Charset HASH_CHARSET = Charset.forName("UTF-8");

    /**
     * Punctuation and placeholder tokens that always count as valid words.
     * The placeholders are the lower-cased forms of the tags emitted while
     * processing ({@code <NUM>}, {@code <URL>}, {@code <EMOTE>}, {@code <slash>}).
     */
    private static final String[] KEY_TOKENS = {
        "'", "!", ".", "?", ",", ";", "(", ")", "[", "]", "/", ":", "\"", "&", "<", ">",
        "<num>", "<url>", "<emote>", "<slash>", "dollars"
    };

    /** Whole tokens that are replaced by the {@code <EMOTE>} placeholder. */
    private static final Set<String> EMOTICONS = new HashSet<String>(
            Arrays.asList(":)", ":(", ":/", ":\\", ":|", ":[", ":]", ":X", ":D"));

    /** Non-greedy match of anything that looks like a markup tag. */
    private static final Pattern MARKUP_TAG = Pattern.compile("<.*?>");

    /** A token made up solely of ASCII digits. */
    private static final Pattern DIGITS = Pattern.compile("[0-9]+");

    /** Punctuation characters that are split off into tokens of their own. */
    private static final Pattern SEPARABLE_PUNCTUATION =
            Pattern.compile("['!.?;,()\\[\\]/:\"\\-=]");

    /** Any character that is not allowed to remain in the final output. */
    private static final Pattern DISALLOWED_CHARS =
            Pattern.compile("[^\\w\\s;:\\(\\)\\[\\]'!/&?\",\\.<>]");

    // Not referenced by any method visible in this class; retained as declared.
    private final Set<DocHash> processedDocs = Collections.synchronizedSet(new HashSet<DocHash>());

    /** Words a token is checked against; when empty, no valid-word filtering is applied. */
    private final Set<String> validWords = new HashSet<String>();

    /**
     * MD5 digest of a document's text, usable as a compact key in hash-based
     * collections. Equality compares the full digest; the hash code is derived
     * from its first four bytes.
     */
    private static class DocHash {
        private final byte[] hash;
        private final int hashCode;

        /**
         * @param str the document text to digest
         */
        public DocHash(String str) {
            this.hash = hash(str);
            byte[] digest = this.hash;
            // Mask each byte before combining: a negative byte would otherwise be
            // sign-extended and set every higher bit of the result to 1.
            this.hashCode = (digest[0] & 0xFF)
                    | ((digest[1] & 0xFF) << 8)
                    | ((digest[2] & 0xFF) << 16)
                    | ((digest[3] & 0xFF) << 24);
        }

        /** Returns the MD5 digest of the UTF-8 encoding of {@code str}. */
        private static byte[] hash(String str) {
            try {
                return MessageDigest.getInstance("MD5").digest(str.getBytes(HASH_CHARSET));
            } catch (NoSuchAlgorithmException e) {
                // MD5 is one of the algorithms every Java platform must provide.
                throw new Error(e);
            }
        }

        @Override
        public boolean equals(Object obj) {
            // instanceof is already false for null.
            return (obj instanceof DocHash) && Arrays.equals(this.hash, ((DocHash) obj).hash);
        }

        @Override
        public int hashCode() {
            return this.hashCode;
        }
    }

    /**
     * Creates a preprocessor with no valid-word list, so {@code process}
     * returns the normalized text without any valid-word filtering.
     */
    public DocumentPreprocessor() {
    }

    /**
     * Creates a preprocessor whose valid words are every token that a
     * {@code WordIterator} yields from {@code file}, plus the built-in key tokens.
     *
     * @param file the word-list file, read with the platform default charset
     * @throws IOException if the file cannot be opened, read or closed
     */
    public DocumentPreprocessor(File file) throws IOException {
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            WordIterator words = new WordIterator(reader);
            while (words.hasNext()) {
                this.validWords.add(words.next());
            }
        } finally {
            // Release the file handle even if reading fails part-way through.
            reader.close();
        }
        addKeyTokens();
    }

    /**
     * Creates a preprocessor whose valid words are the given strings plus the
     * built-in key tokens.
     *
     * @param strArr the valid words; entries are stored exactly as given
     */
    public DocumentPreprocessor(String[] strArr) {
        for (String word : strArr) {
            this.validWords.add(word);
        }
        addKeyTokens();
    }

    /** Adds the punctuation and placeholder tokens to the valid-word set. */
    private void addKeyTokens() {
        for (String token : KEY_TOKENS) {
            this.validWords.add(token);
        }
    }

    /**
     * Normalizes {@code str}, keeping tokens that are not valid words.
     * Equivalent to {@code process(str, false)}.
     *
     * @param str the raw document text
     * @return the normalized text, or {@code ""} if too few tokens are valid words
     */
    public String process(String str) {
        return process(str, false);
    }

    /**
     * Normalizes {@code str} into lower-cased tokens.
     *
     * <p>In order: HTML entities are unescaped and tag-like spans removed;
     * e-mail addresses, URLs, integers, a lone slash and emoticons become
     * placeholder tokens; punctuation is split into separate tokens; tokens
     * longer than {@value #MAX_TOKEN_LENGTH} characters are dropped; the text
     * is lower-cased; {@code $<digits>} becomes {@code <num> dollars}; and
     * remaining disallowed characters are deleted.
     *
     * <p>If a valid-word list is present, the document is rejected (an empty
     * string is returned) when fewer than 40% of its tokens are valid words.
     *
     * @param str the raw document text
     * @param z   if {@code true} and a valid-word list is present, only the
     *            valid tokens are returned; otherwise the whole normalized
     *            text is returned
     * @return the processed text, or {@code ""} if the document is rejected
     */
    public String process(String str, boolean z) {
        String text = tagSpecialTokens(stripMarkup(str));
        text = SEPARABLE_PUNCTUATION.matcher(text).replaceAll(" $0 ");
        // Locale.ROOT keeps casing independent of the default locale.
        text = dropLongTokens(text).toLowerCase(Locale.ROOT);
        text = rewriteDollarAmounts(text);
        text = DISALLOWED_CHARS.matcher(text).replaceAll("");
        if (this.validWords.isEmpty()) {
            return text;
        }
        return filterByValidWords(text, z);
    }

    /** Unescapes HTML, deletes tag-like spans and pads any remaining angle brackets with spaces. */
    private static String stripMarkup(String raw) {
        String unescaped = StringUtils.unescapeHTML(raw);
        return MARKUP_TAG.matcher(unescaped).replaceAll("")
                .replace("<", " < ")
                .replace(">", " > ");
    }

    /** Rewrites every whitespace-delimited token via {@link #tagToken} and trims the result. */
    private static String tagSpecialTokens(String text) {
        StringTokenizer tokens = new StringTokenizer(text);
        StringBuilder out = new StringBuilder(text.length());
        while (tokens.hasMoreTokens()) {
            appendToken(out, tagToken(tokens.nextToken()));
        }
        return out.toString().trim();
    }

    /**
     * Returns the replacement for a single raw token. The trailing-punctuation
     * checks come first, so a token ending in {@code ?}, {@code ,} or {@code .}
     * is never examined by the later placeholder rules.
     */
    private static String tagToken(String token) {
        if (token.endsWith("?")) {
            return token.substring(0, token.length() - 1) + " ?";
        }
        if (token.endsWith(",")) {
            return token.substring(0, token.length() - 1) + " ,";
        }
        if (token.endsWith(".")) {
            return token.substring(0, token.length() - 1) + " .";
        }
        if (token.contains("@") && token.contains(".")) {
            return "<URL>";
        }
        if (token.startsWith("http") || token.startsWith("ftp")) {
            return "<URL>";
        }
        if (DIGITS.matcher(token).matches()) {
            return "<NUM>";
        }
        if (token.equals("/")) {
            return "<slash>";
        }
        if (EMOTICONS.contains(token)) {
            return "<EMOTE>";
        }
        return token;
    }

    /** Drops tokens longer than {@link #MAX_TOKEN_LENGTH} characters and trims the result. */
    private static String dropLongTokens(String text) {
        StringTokenizer tokens = new StringTokenizer(text);
        StringBuilder out = new StringBuilder(text.length());
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            if (token.length() <= MAX_TOKEN_LENGTH) {
                appendToken(out, token);
            }
        }
        return out.toString().trim();
    }

    /**
     * Replaces {@code $<digits>} tokens with {@code <num> dollars} and trims
     * the result. Any other token that starts with {@code $} is dropped.
     */
    private static String rewriteDollarAmounts(String text) {
        StringTokenizer tokens = new StringTokenizer(text);
        StringBuilder out = new StringBuilder(text.length());
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            if (!token.startsWith("$")) {
                appendToken(out, token);
            } else if (DIGITS.matcher(token.substring(1)).matches()) {
                out.append("<num>");
                out.append(" dollars ");
            }
        }
        return out.toString().trim();
    }

    /**
     * Applies the valid-word ratio test to {@code text}.
     *
     * @param text          the fully normalized text
     * @param keepOnlyValid whether to return only the valid tokens
     * @return {@code ""} if fewer than 40% of the tokens are valid words;
     *         otherwise the valid tokens (each followed by the separator) or
     *         {@code text} itself, depending on {@code keepOnlyValid}
     */
    private String filterByValidWords(String text, boolean keepOnlyValid) {
        StringTokenizer tokens = new StringTokenizer(text);
        StringBuilder kept = new StringBuilder(text.length());
        int total = 0;
        int valid = 0;
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            total++;
            if (this.validWords.contains(token)) {
                valid++;
                if (keepOnlyValid) {
                    appendToken(kept, token);
                }
            }
        }
        // With no tokens there is no ratio to test; the document is not rejected.
        if (total > 0 && (double) valid / total < MIN_VALID_RATIO) {
            return "";
        }
        return keepOnlyValid ? kept.toString() : text;
    }

    /** Appends {@code token} followed by the token separator. */
    private static void appendToken(StringBuilder out, String token) {
        out.append(token);
        // NOTE(review): presumably a single space - the output is re-split on
        // whitespace by StringTokenizer, and this constant looks like a decompiler
        // substitution for a literal. Confirm its value before inlining it.
        out.append(Setting_SharePreferences.YOIL_SPLIT);
    }
}
