package edu.ucla.sspace.tools;

import com.moms.lib_modules.cpi.Setting_SharePreferences;
import edu.ucla.sspace.common.ArgOptions;
import edu.ucla.sspace.text.DocumentPreprocessor;
import edu.ucla.sspace.text.IteratorFactory;
import edu.ucla.sspace.text.StringUtils;
import edu.ucla.sspace.util.LoggerUtil;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOError;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: classes.dex */
public class WikipediaCleaner {
    /** Logger for this class, named after the class's fully qualified name. */
    private static final Logger LOGGER = Logger.getLogger(WikipediaCleaner.class.getName());
    /** Threshold checked in {@code processDocument}: articles with fewer tokens are not written. */
    private final int minTokensPerArticle;
    /** Cleaning options consulted while each document is processed. */
    private final Set<CleanerOption> options;
    /** Writer over the output file; assigned once in the constructor and flushed after every written article. */
    private PrintWriter processedArticleWriter;

    /* loaded from: classes.dex */
    /**
     * Switches that control how an article is cleaned. {@code main} builds the
     * active set from the command-line flags and hands it to the constructor.
     */
    public enum CleanerOption {
        /** When set, {@code processDocument} prints the article title before the cleaned text. */
        INCLUDE_TITLES,
        /** Added by the {@code includeCaptions} flag; no code visible in this file reads it. */
        INCLUDE_CAPTIONS,
        /** When set, link markup is replaced by its link text instead of being deleted outright. */
        INCLUDE_LINK_TEXT,
        /** When set, the cleaned text is rebuilt from the tokens returned by {@code IteratorFactory.tokenize}. */
        FILTER_TOKENS,
        /** When set, the cleaned text is passed through a {@code DocumentPreprocessor}. */
        USE_PREPROCESSOR
    }

    /* loaded from: classes.dex */
    /**
     * Line-oriented reader over a Wikipedia XML dump that keeps a queue of
     * pages already read from disk.
     *
     * <p>{@link #cacheDoc()} is synchronized, so only one thread reads from the
     * underlying file at a time; finished pages are handed over through a
     * {@link BlockingQueue}.
     */
    private static class DocumentBufferedQueue {
        /** Number of pages the constructor tries to read ahead. */
        private static final int DOCS_TO_CACHE = 100;
        /**
         * Number of leading characters dropped from the line that follows
         * {@code <page>}; presumably the length of the indented
         * {@code <title>} opening tag.
         */
        private static final int TITLE_HTML_LENGTH = 11;
        private final BlockingQueue<WikiDoc> cachedDocs = new LinkedBlockingQueue<WikiDoc>();
        /** Cleared once the closing {@code </mediawiki>} line or end of file has been seen. */
        private final AtomicBoolean isReaderOpen = new AtomicBoolean(true);
        private final BufferedReader wikiReader;

        /**
         * Opens the dump and reads ahead up to {@link #DOCS_TO_CACHE} pages.
         *
         * @param str path of the Wikipedia dump file
         * @throws IOException if the file cannot be opened or read
         */
        public DocumentBufferedQueue(String str) throws IOException {
            this.wikiReader = new BufferedReader(new FileReader(str));
            for (int i = 0; i < DOCS_TO_CACHE; i++) {
                WikiDoc cacheDoc = cacheDoc();
                if (cacheDoc != null) {
                    this.cachedDocs.offer(cacheDoc);
                }
            }
        }

        /**
         * Reads forward to the next {@code <page>} element and returns it.
         *
         * <p>Reconstructed from the decompiler's register-level dump, which the
         * decompiler could not turn back into Java. The title is taken from the
         * line after {@code <page>}; every following line up to the closing
         * {@code </page>} is appended to the text, separated by single spaces.
         *
         * @return the next page, or {@code null} when the end of the file is
         *         reached or the page could not be parsed
         * @throws IOException if reading the lines before a page fails
         */
        public synchronized WikiDoc cacheDoc() throws IOException {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = this.wikiReader.readLine()) != null) {
                if (line.startsWith("</mediawiki>")) {
                    // End of the dump: flag it, but keep reading until EOF.
                    this.isReaderOpen.set(false);
                    continue;
                }
                if (!line.startsWith("  <page>")) {
                    continue;
                }
                try {
                    String afterTag = this.wikiReader.readLine().substring(TITLE_HTML_LENGTH);
                    int titleEnd = afterTag.indexOf("<");
                    if (titleEnd < 0) {
                        throw new Error("Malformed title: " + line);
                    }
                    String title = afterTag.substring(0, titleEnd);
                    String bodyLine = this.wikiReader.readLine();
                    while (bodyLine != null && !bodyLine.startsWith("  </page>")) {
                        text.append(bodyLine);
                        text.append(" ");
                        bodyLine = this.wikiReader.readLine();
                    }
                    return new WikiDoc(title, text);
                } catch (Throwable t) {
                    // Matches the compiled handler: any failure while parsing a
                    // page is reported and the call yields no document.
                    t.printStackTrace();
                    return null;
                }
            }
            // Not in the compiled code: once the file is exhausted no further
            // page can ever arrive, so stop advertising more input even when
            // the dump lacks its closing </mediawiki> line.
            this.isReaderOpen.set(false);
            return null;
        }

        /**
         * @return {@code true} while pages are queued or the end of the dump
         *         has not been seen yet
         */
        public boolean hasNext() {
            return this.cachedDocs.size() > 0 || this.isReaderOpen.get();
        }

        /**
         * Starts a background thread that tries to read one more page into the
         * queue, then waits up to ten minutes for a queued page.
         *
         * @return the next queued page, or {@code null} if none became
         *         available before the timeout
         * @throws InterruptedException if interrupted while waiting
         */
        public WikiDoc next() throws InterruptedException {
            new Thread() {
                @Override
                public void run() {
                    try {
                        WikiDoc cacheDoc = DocumentBufferedQueue.this.cacheDoc();
                        if (cacheDoc != null) {
                            DocumentBufferedQueue.this.cachedDocs.offer(cacheDoc);
                        }
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }.start();
            return this.cachedDocs.poll(600000L, TimeUnit.MILLISECONDS);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: classes.dex */
    /** Holder pairing a page title with the buffer that carries the page's text. */
    public static class WikiDoc {
        /** Title as read from the dump. */
        public final String name;
        /** Page text; the cleaning methods edit this buffer in place. */
        public final StringBuilder text;

        /**
         * @param str title of the page
         * @param sb  buffer holding the page's text
         */
        public WikiDoc(String str, StringBuilder sb) {
            text = sb;
            name = str;
        }
    }

    public WikipediaCleaner(String str, Set<CleanerOption> set, int i) {
        this.options = set;
        this.minTokensPerArticle = i;
        try {
            this.processedArticleWriter = new PrintWriter(new BufferedOutputStream(new FileOutputStream(str)));
        } catch (IOException e) {
            throw new IOError(e);
        }
    }

    /**
     * Trims the buffer in place to what lies between the end of the opening
     * {@code <text ...>} tag and the start of {@code </text}.
     *
     * @param sb raw page markup; modified in place
     */
    private void extractArticle(StringBuilder sb) {
        int openingTag = sb.indexOf("<text");
        int openingTagEnd = sb.indexOf(">", openingTag);
        int closingTag = sb.indexOf("</text");
        if (closingTag >= 0) {
            // Drop the closing tag and everything after it.
            sb.setLength(closingTag);
        }
        sb.delete(0, openingTagEnd + 1);
    }

    /**
     * Rebuilds the text from the tokens produced by
     * {@code IteratorFactory.tokenize}, joined by single spaces.
     *
     * @param str text to tokenize
     * @return the tokens joined by single spaces, with no trailing space
     */
    private String filterTokens(String str) {
        Iterator<String> tokens = IteratorFactory.tokenize(str);
        StringBuilder joined = new StringBuilder(str.length());
        while (tokens.hasNext()) {
            joined.append(tokens.next());
            if (tokens.hasNext()) {
                // A plain space: the decompiler had substituted an unrelated
                // library constant that merely shares this value.
                joined.append(" ");
            }
        }
        return joined.toString();
    }

    /** Matches one run of non-whitespace characters, i.e. one token. */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("\\S+");

    /**
     * Counts the whitespace-separated tokens in the text.
     *
     * @param str text to count tokens in
     * @return number of maximal runs of non-whitespace characters
     */
    private int getTokenCount(String str) {
        // A single matcher must be advanced across the input; creating a new
        // one per iteration restarts at the first token and never terminates.
        Matcher tokens = TOKEN_PATTERN.matcher(str);
        int count = 0;
        while (tokens.find()) {
            count++;
        }
        return count;
    }

    /**
     * Decides from a title alone whether it names a regular article.
     *
     * @param str page or link title
     * @return {@code false} if the lower-cased title starts with one of the
     *         image, wikipedia, template, category or portal prefixes or
     *         contains {@code (disambiguation)}; {@code true} otherwise
     */
    private static boolean isArticleLink(String str) {
        String title = str.toLowerCase();
        String[] nonArticlePrefixes = {"image:", "wikipedia:", "template:", "category:", "portal:"};
        for (String prefix : nonArticlePrefixes) {
            if (title.startsWith(prefix)) {
                return false;
            }
        }
        return !title.contains("(disambiguation)");
    }

    /**
     * Decides whether a link found inside an article points at another
     * regular article.
     *
     * @param str  title the link points to
     * @param str2 title of the article that contains the link
     * @return {@code false} if the one-argument check rejects the target, if
     *         the text before its first colon consists only of lower-case
     *         ASCII letters (or is empty), or if the target ends with
     *         {@code ":" + str2}; {@code true} otherwise
     */
    private static boolean isArticleLink(String str, String str2) {
        if (!isArticleLink(str)) {
            return false;
        }
        int colon = str.indexOf(":");
        if (colon >= 0) {
            String beforeColon = str.substring(0, colon);
            if (Pattern.matches("[a-z]*", beforeColon)) {
                return false;
            }
        }
        String selfSuffix = ":" + str2;
        return !str.endsWith(selfSuffix);
    }

    /**
     * Command-line entry point: parses the options, then cleans every page of
     * the dump named by the first positional argument into the file named by
     * the second.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        ArgOptions argOptions = new ArgOptions();
        argOptions.addOption('t', "includeTitles", "Prints article and section titles as a part of the document", false, null, "Document Processing");
        argOptions.addOption('c', "includeCaptions", "Prints image and table captions as a part of the document", false, null, "Document Processing");
        argOptions.addOption('w', "includeLinkText", "Prints text in the Wikipedia links as a part of the document", false, null, "Document Processing");
        argOptions.addOption('F', "tokenFilter", "Specifies a filter to remove or retain certain tokens", true, "FILTER_SPEC", "Filtering");
        argOptions.addOption('M', "minTokens", "Records only those documents with at least the minimum number of tokens", true, "INT", "Filtering");
        argOptions.addOption('P', "applyPreprocessor", "Applies the DocumentPreprocessor to the documents", false, null, "Filtering");
        argOptions.addOption('v', "verbose", "Print verbose output about article cleaning", false, null, "Optional");
        argOptions.addOption('V', "veryVerbose", "Print lots of verbose output about article cleaning", false, null, "Optional");
        argOptions.parseOptions(strArr);
        if (argOptions.numPositionalArgs() != 2) {
            System.out.println("usage java [OPTIONS] <wikifile> <output-file>\n" + argOptions.prettyPrint());
            return;
        }

        // "verbose" takes precedence when both verbosity flags are given.
        Level level = null;
        if (argOptions.hasOption("verbose")) {
            level = Level.FINE;
        } else if (argOptions.hasOption("veryVerbose")) {
            level = Level.FINER;
        }
        if (level != null) {
            LoggerUtil.setLevel(level);
        }

        EnumSet<CleanerOption> cleanerOptions = EnumSet.noneOf(CleanerOption.class);
        if (argOptions.hasOption("includeTitles")) {
            cleanerOptions.add(CleanerOption.INCLUDE_TITLES);
        }
        if (argOptions.hasOption("includeCaptions")) {
            cleanerOptions.add(CleanerOption.INCLUDE_CAPTIONS);
        }
        if (argOptions.hasOption("includeLinkText")) {
            cleanerOptions.add(CleanerOption.INCLUDE_LINK_TEXT);
        }
        if (argOptions.hasOption("tokenFilter")) {
            Properties properties = new Properties();
            properties.setProperty(IteratorFactory.TOKEN_FILTER_PROPERTY, argOptions.getStringOption("tokenFilter"));
            IteratorFactory.setProperties(properties);
            cleanerOptions.add(CleanerOption.FILTER_TOKENS);
        }
        if (argOptions.hasOption("applyPreprocessor")) {
            cleanerOptions.add(CleanerOption.USE_PREPROCESSOR);
        }
        int minTokens = argOptions.hasOption("minTokens") ? argOptions.getIntOption("minTokens") : 0;

        WikipediaCleaner cleaner = null;
        try {
            DocumentBufferedQueue queue = new DocumentBufferedQueue(argOptions.getPositionalArg(0));
            cleaner = new WikipediaCleaner(argOptions.getPositionalArg(1), cleanerOptions, minTokens);
            while (queue.hasNext()) {
                WikiDoc doc = queue.next();
                if (doc == null) {
                    // next() yields null when its wait times out; passing that
                    // on would only fail with a NullPointerException.
                    LOGGER.warning("no further document became available; stopping");
                    break;
                }
                cleaner.processDocument(doc);
            }
        } catch (InterruptedException e) {
            // Preserve the interrupt for whoever is running this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (cleaner != null && cleaner.processedArticleWriter != null) {
                cleaner.processedArticleWriter.close();
            }
        }
    }

    /**
     * Deletes every closed <code>{{ ... }}</code> span from the buffer,
     * removing spans nested inside one first so that the outer span is paired
     * with its own closing braces.
     *
     * <p>Stops at the first opening pair that has no closing pair after it.
     *
     * @param sb article text; modified in place
     */
    private void removeDoubleBraceMarkup(StringBuilder sb) {
        for (int open = sb.indexOf("{{"); open >= 0; open = sb.indexOf("{{", open)) {
            int close = sb.indexOf("}}", open);
            int nested = sb.indexOf("{{", open + 1);
            // Strip inner spans until nothing opens between here and the close.
            while (nested > open && nested < close) {
                removeEmbeddedBrace(sb, nested);
                close = sb.indexOf("}}", open);
                nested = sb.indexOf("{{", open + 1);
            }
            if (close < 0) {
                return;
            }
            sb.delete(open, close + 2);
        }
    }

    /**
     * Deletes the <code>{{ ... }}</code> span that opens at the given index,
     * recursively deleting any spans nested inside it first.
     *
     * <p>Leaves the buffer untouched when no closing pair follows the index.
     *
     * @param sb article text; modified in place
     * @param i  index of the opening braces of the span to delete
     */
    private void removeEmbeddedBrace(StringBuilder sb, int i) {
        int searchFrom = i + 1;
        int close = sb.indexOf("}}", i);
        for (int inner = sb.indexOf("{{", searchFrom);
                inner > i && inner < close;
                inner = sb.indexOf("{{", searchFrom)) {
            removeEmbeddedBrace(sb, inner);
            // The deletion shifted the text, so locate the close again.
            close = sb.indexOf("}}", i);
        }
        if (close >= 0) {
            sb.delete(i, close + 2);
        }
    }

    /**
     * Deletes every {@code <!-- ... -->} span from the buffer, stopping at the
     * first comment opener that is never closed.
     *
     * @param sb article text; modified in place
     */
    private void removeHtmlComments(StringBuilder sb) {
        for (int start = sb.indexOf("<!--"); start >= 0; start = sb.indexOf("<!--", start)) {
            int end = sb.indexOf("-->", start);
            if (end <= start) {
                return;
            }
            sb.delete(start, end + 3);
        }
    }

    /**
     * Deletes every <code>{| ... |}</code> span from the buffer, stopping at
     * the first table opener that is never closed.
     *
     * @param sb article text; modified in place
     */
    private void removeTables(StringBuilder sb) {
        int tableStart = sb.indexOf("{|");
        while (true) {
            if (tableStart < 0) {
                return;
            }
            int tableEnd = sb.indexOf("|}", tableStart);
            if (tableEnd <= tableStart) {
                return;
            }
            sb.delete(tableStart, tableEnd + 2);
            tableStart = sb.indexOf("{|", tableStart);
        }
    }

    /**
     * Cleans one page and, if it qualifies, writes it as a single line to the
     * output file.
     *
     * <p>Pages whose title is rejected by {@code isArticleLink} and pages
     * whose text contains {@code #REDIRECT} are skipped. The remaining text is
     * stripped of tables, double-brace markup, links and HTML comments,
     * optionally preprocessed and token-filtered, and written only when it has
     * at least the configured minimum number of tokens.
     *
     * @param wikiDoc page to clean; its text buffer is modified in place
     */
    public void processDocument(WikiDoc wikiDoc) {
        String title = StringUtils.unescapeHTML(wikiDoc.name).trim().toLowerCase();
        if (!isArticleLink(title)) {
            LOGGER.fine("skipping non-article document: " + title);
            return;
        }
        // Only the text is checked: the title has just been lower-cased, so an
        // upper-case "#REDIRECT" could never be found in it.
        if (wikiDoc.text.indexOf("#REDIRECT") >= 0) {
            LOGGER.fine("skipping redirect: " + title);
            return;
        }
        LOGGER.log(Level.FINE, "Procesing article {0} with {1} characters", new Object[]{title, Integer.valueOf(wikiDoc.text.length())});

        StringBuilder article = wikiDoc.text;
        LOGGER.finer("extracting raw article text");
        extractArticle(article);
        LOGGER.finer("removing tables");
        removeTables(article);
        LOGGER.finer("removing {{text}} from article");
        removeDoubleBraceMarkup(article);
        LOGGER.finer("removing [[wiki-link]] from article");
        removeWikiLinkMarkup(article, title);
        LOGGER.finer("removing [external-link] from article");
        removeExternalLinkMarkup(article);
        // Unescaping can introduce comment markers, so comments go last.
        LOGGER.finer("unescaping HTML");
        StringUtils.unescapeHTML(article);
        LOGGER.finer("removing HTML comments");
        removeHtmlComments(article);

        String cleaned = article.toString();
        if (this.options.contains(CleanerOption.USE_PREPROCESSOR)) {
            LOGGER.finer("applying preprocessor");
            cleaned = new DocumentPreprocessor().process(cleaned);
        }
        if (this.options.contains(CleanerOption.FILTER_TOKENS)) {
            LOGGER.finer("filtering tokens");
            cleaned = filterTokens(cleaned);
        }

        int tokenCount = getTokenCount(cleaned);
        if (tokenCount < this.minTokensPerArticle) {
            LOGGER.log(Level.FINE, "Document {0} contained only {1} tokens and was not printed", new Object[]{title, Integer.valueOf(tokenCount)});
            return;
        }
        if (this.options.contains(CleanerOption.INCLUDE_TITLES)) {
            this.processedArticleWriter.print(title);
            // A plain space: the decompiler had substituted an unrelated
            // library constant that merely shares this value.
            this.processedArticleWriter.print(" ");
        }
        this.processedArticleWriter.println(cleaned);
        this.processedArticleWriter.flush();
    }

    /**
     * Removes single-bracket {@code [ ... ]} link markup from the buffer.
     *
     * <p>With {@code INCLUDE_LINK_TEXT} set, each link is replaced by the part
     * starting at its first space (the space itself is kept), or by everything
     * between the brackets when it contains no space. Otherwise the whole link
     * is deleted. Stops at the first opening bracket that is never closed.
     *
     * @param sb article text; modified in place
     */
    public void removeExternalLinkMarkup(StringBuilder sb) {
        boolean keepLinkText = this.options.contains(CleanerOption.INCLUDE_LINK_TEXT);
        int open = sb.indexOf("[");
        while (open >= 0) {
            int close = sb.indexOf("]", open);
            if (close < 0) {
                return;
            }
            if (keepLinkText) {
                // A plain space: the decompiler had substituted an unrelated
                // library constant that merely shares this value.
                int textStart = sb.indexOf(" ", open);
                if (textStart < 0 || textStart >= close) {
                    textStart = open + 1;
                }
                sb.replace(open, close + 1, sb.substring(textStart, close));
            } else {
                sb.delete(open, close + 1);
            }
            open = sb.indexOf("[", open);
        }
    }

    /**
     * Removes double-bracket {@code [[ ... ]]} link markup from the buffer.
     *
     * <p>With {@code INCLUDE_LINK_TEXT} set, a link whose target passes
     * {@code isArticleLink} is replaced by its display text: the part after
     * {@code |} when the link has one, otherwise the target itself. Every
     * other link is deleted. Stops at the first opening pair that is never
     * closed.
     *
     * @param sb  article text; modified in place
     * @param str title of the article being cleaned
     */
    public void removeWikiLinkMarkup(StringBuilder sb, String str) {
        boolean keepLinkText = this.options.contains(CleanerOption.INCLUDE_LINK_TEXT);
        int open = sb.indexOf("[[");
        while (open >= 0) {
            int close = sb.indexOf("]]", open);
            if (close < 0) {
                return;
            }
            int targetStart = open + 2;
            if (keepLinkText && isArticleLink(sb.substring(targetStart, close), str)) {
                int pipe = sb.indexOf("|", open);
                // Only a pipe inside this link separates target from text.
                int textStart = (pipe >= 0 && pipe < close) ? pipe + 1 : targetStart;
                sb.replace(open, close + 2, sb.substring(textStart, close));
            } else {
                // Exactly one of replace/delete per link: running the delete
                // after a replace would reuse a stale end index and a start
                // index that may already be -1.
                sb.delete(open, close + 2);
            }
            open = sb.indexOf("[[", open);
        }
    }
}
