/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.bigdata;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class FrequencyIndexCreator {
    // NOTE(review): this file is decompiler output, so several of the constants below were
    // inlined at their use sites and are no longer referenced by name in the method bodies.

    /** Lower bound for the year column; indexLinesFromGoogleFile skips lines with a year below 1910 (literal there). */
    private static final int MIN_YEAR = 1910;
    /** Input file name pattern for "corpus mode"; capture group 1 becomes the index directory name. */
    private static final String NAME_REGEX1 = "googlebooks-[a-z]{3}-all-[1-5]gram-20120701-(.*?).gz";
    /** Input file name pattern for "Hive mode"; capture group 1 becomes the index directory name. */
    private static final String NAME_REGEX2 = "[a-z0-9]+-[a-z0-9]+-[a-z0-9]+-[a-z0-9]+-[a-z0-9]+[_-](.*?).gz";
    /** Name pattern for input directories ("Hive/Text mode"); such a directory is read via its "&lt;name&gt;-output.csv.gz" file. */
    private static final String NAME_REGEX3 = "([_a-z0-9]{1,2}|other|pos|punctuation|_(ADJ|ADP|ADV|CONJ|DET|NOUN|NUM|PRON|PRT|VERB)_)";
    /** 16384; the same value is passed (as a literal) as buffer size to the gzip stream and the reader in indexLinesFromGoogleFile. */
    private static final int BUFFER_SIZE = 16384;
    /** Name of the marker file whose presence inside an index directory means the index was fully written. */
    private static final String LT_COMPLETE_MARKER = "languagetool_index_complete";
    // NOTE(review): not referenced by the visible code; POS-tagged entries are filtered
    // unconditionally and main prints "Ignore POS tags: true" - presumably both were guarded by this flag.
    private static final boolean IGNORE_POS = true;
    /** Sum of the lengths of the input files handled so far; only used to print progress percentages. */
    private final AtomicLong bytesProcessed = new AtomicLong(0L);
    /** Output format chosen at construction time. */
    private final Mode mode;
    /** Running sum of the counts passed to LuceneDataWriter.addDoc; handed to addTotalTokenCountDoc after each input file. */
    private long totalTokenCount;
    /** Number of input files for which indexing was started; only used in the "n of m" progress message. */
    private long inputFileCount;

    /**
     * Creates an indexer that writes its output in the given format.
     *
     * @param mode the output format; stored as-is without validation
     */
    public FrequencyIndexCreator(Mode mode) {
        // the explicit qualifier is required here because the parameter shadows the field
        this.mode = mode;
    }

    /**
     * Indexes every entry of {@code inputDir} into one shared writer rooted at
     * {@code indexBaseDir} and finally marks {@code indexBaseDir} as complete.
     *
     * <p>The entries are processed with a parallel stream, so {@link #index} is called
     * concurrently from several threads with the same writer instance.
     *
     * @param inputDir directory containing the input files/directories
     * @param indexBaseDir directory that receives the output and the completion marker
     * @throws RuntimeException if {@code inputDir} does not exist or its entries cannot be listed
     * @throws Exception if creating or closing the writer fails, or the marker cannot be written
     */
    private void run(File inputDir, File indexBaseDir) throws Exception {
        if (!inputDir.exists()) {
            throw new RuntimeException("Not found: " + inputDir);
        }
        // listFiles() returns null (it does not throw) when the path is not a directory
        // or an I/O error occurs; fail with a clear message instead of a NullPointerException.
        File[] entries = inputDir.listFiles();
        if (entries == null) {
            throw new RuntimeException("Not a directory or not readable: " + inputDir);
        }
        List<File> files = Arrays.asList(entries);
        long totalBytes = files.stream().mapToLong(File::length).sum();
        System.out.println("Total input bytes: " + totalBytes);
        try (DataWriter dw = this.mode == Mode.PlainText ? new TextDataWriter(indexBaseDir) : new LuceneDataWriter(indexBaseDir)) {
            files.parallelStream().forEach(file -> this.index(file, indexBaseDir, totalBytes, files.size(), dw));
            this.markIndexAsComplete(indexBaseDir);
        }
    }

    /**
     * Indexes a single input entry, choosing the input format from the entry's name.
     *
     * <ul>
     *   <li>names containing an upper-case tag between underscores are skipped as POS tag files;</li>
     *   <li>names matching {@code NAME_REGEX1} are read in corpus mode (counts aggregated over years);</li>
     *   <li>names matching {@code NAME_REGEX2} are read in Hive mode (no aggregation);</li>
     *   <li>directories whose name matches {@code NAME_REGEX3} are read in Hive mode from the
     *       contained {@code <name>-output.csv.gz} file;</li>
     *   <li>everything else is skipped.</li>
     * </ul>
     *
     * <p>If the target index directory already carries the completion marker the entry is skipped.
     * This method is invoked concurrently from a parallel stream.
     *
     * @param file input file or directory
     * @param indexBaseDir parent directory of the per-input index directories
     * @param totalBytes summed size of all inputs, used for progress output only
     * @param inputFiles number of input entries, used for progress output only
     * @param globalDataWriter shared writer to use; if {@code null}, a writer for the
     *        per-input index directory is created, closed and the directory marked complete
     * @throws RuntimeException wrapping any exception raised while indexing
     */
    private void index(File file, File indexBaseDir, long totalBytes, int inputFiles, DataWriter globalDataWriter) {
        System.out.println(file);
        String name = file.getName();
        if (name.matches(".*_[A-Z]+_.*")) {
            System.out.println("Skipping POS tag file " + name);
            return;
        }
        File indexDir;
        boolean hiveMode;
        if (name.matches(NAME_REGEX1)) {
            indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX1, "$1"));
            hiveMode = false;
            System.out.println("Running in corpus mode (i.e. aggregation of years)");
        } else if (name.matches(NAME_REGEX2)) {
            indexDir = new File(indexBaseDir, name.replaceAll(NAME_REGEX2, "$1"));
            hiveMode = true;
            System.out.println("Running in Hive mode (i.e. no aggregation of years)");
        } else if (name.matches(NAME_REGEX3) && file.isDirectory()) {
            file = new File(file, file.getName() + "-output.csv.gz");
            // The directory name is used unchanged: a name accepted by NAME_REGEX3 can never
            // contain a NAME_REGEX1 match, so the former replaceAll(NAME_REGEX1, ...) was a no-op.
            indexDir = new File(indexBaseDir, name);
            hiveMode = true;
            System.out.println("Running in Hive/Text mode (i.e. no aggregation of years)");
        } else {
            System.out.println("Skipping " + name + " - doesn't match regex " + NAME_REGEX1 + ", " + NAME_REGEX2 + ", or " + NAME_REGEX3);
            return;
        }
        if (indexDir.exists() && indexDir.isDirectory()) {
            if (this.isIndexComplete(indexDir)) {
                System.out.println("Skipping " + name + " - index dir '" + indexDir + "' already exists and is complete");
                this.bytesProcessed.addAndGet(file.length());
                return;
            }
            System.out.println("Not skipping " + name + " - index dir '" + indexDir + "' already exists but is not complete");
        }
        // inputFileCount is a plain long that several stream worker threads increment;
        // guard the read-modify-write so no increment is lost.
        long fileNumber;
        synchronized (this) {
            fileNumber = ++this.inputFileCount;
        }
        System.out.println("Index dir: " + indexDir + " - " + fileNumber + " of " + inputFiles);
        try {
            if (globalDataWriter != null) {
                this.indexLinesFromGoogleFile(globalDataWriter, file, totalBytes, hiveMode);
            } else {
                try (DataWriter dw = this.mode == Mode.PlainText ? new TextDataWriter(indexDir) : new LuceneDataWriter(indexDir)) {
                    this.indexLinesFromGoogleFile(dw, file, totalBytes, hiveMode);
                }
                // only reached when the writer was filled and closed without an exception
                this.markIndexAsComplete(indexDir);
            }
        } catch (Exception e) {
            throw new RuntimeException("Could not index " + file, e);
        }
        this.bytesProcessed.addAndGet(file.length());
    }

    /**
     * Writes the completion marker file into {@code directory}; its content is the
     * current date in {@link Date#toString()} format.
     *
     * @param directory index directory to mark
     * @throws IOException if the marker file cannot be created or written
     */
    private void markIndexAsComplete(File directory) throws IOException {
        File marker = new File(directory, LT_COMPLETE_MARKER);
        try (FileWriter out = new FileWriter(marker)) {
            String timestamp = new Date().toString();
            out.write(timestamp);
        }
    }

    /**
     * Tells whether {@code directory} contains the completion marker file.
     *
     * @param directory index directory to inspect
     * @return {@code true} if the marker file exists
     */
    private boolean isIndexComplete(File directory) {
        File marker = new File(directory, LT_COMPLETE_MARKER);
        return marker.exists();
    }

    /**
     * Reads a gzip-compressed, UTF-8 encoded, tab-separated file line by line and passes
     * the n-grams to {@code writer}; afterwards asks the writer to record the current
     * total token count.
     *
     * <p>Lines whose first column is recognised by {@link #isRealPosTag} are ignored.
     * In hive mode column 2 is the count and every line yields one document. Otherwise
     * column 2 is a year (lines with a year before 1910 are ignored), column 3 is the
     * count, and the counts of consecutive lines with the same first column are summed
     * into one document.
     *
     * @param writer destination for the n-gram documents
     * @param inputFile gzip-compressed input file
     * @param totalBytes summed size of all inputs, used for progress output only
     * @param hiveMode {@code true} if the input is already aggregated (no year column)
     * @throws IOException if reading the input or writing a document fails
     * @throws NumberFormatException if a year or count column is not a number
     */
    private void indexLinesFromGoogleFile(DataWriter writer, File inputFile, long totalBytes, boolean hiveMode) throws IOException {
        float progress = (float)this.bytesProcessed.get() / (float)totalBytes * 100.0f;
        // The path is a format argument, not part of the format string: a '%' in a file
        // name must not be interpreted as a format specifier.
        System.out.printf("==== Working on %s (%.2f%%) ====\n", inputFile, progress);
        try (InputStream fileStream = new FileInputStream(inputFile);
             InputStream gzipStream = new GZIPInputStream(fileStream, 16384);  // 16384 == BUFFER_SIZE
             InputStreamReader decoder = new InputStreamReader(gzipStream, StandardCharsets.UTF_8);
             BufferedReader buffered = new BufferedReader(decoder, 16384)) {
            String line;
            int i = 0;
            long docCount = 0L;
            long lineCount = 0L;
            String prevText = null;
            long startTime = System.nanoTime() / 1000L;
            while ((line = buffered.readLine()) != null) {
                ++lineCount;
                String[] parts = line.split("\t");
                String text = parts[0];
                if (this.isRealPosTag(text)) {
                    continue;
                }
                if (hiveMode) {
                    if (parts.length <= 1) {
                        System.err.println("Could not index: " + line);
                        continue;
                    }
                    long count = Long.parseLong(parts[1]);
                    writer.addDoc(text, count);
                    if (++i % 500000 == 0) {
                        this.printStats(i, inputFile, count, lineCount, text, startTime, totalBytes);
                    }
                } else {
                    // Same treatment as in hive mode: report and skip lines that lack a column
                    // instead of aborting the whole file with an ArrayIndexOutOfBoundsException.
                    if (parts.length <= 2) {
                        System.err.println("Could not index: " + line);
                        continue;
                    }
                    int year = Integer.parseInt(parts[1]);
                    if (year < 1910) {  // 1910 == MIN_YEAR
                        continue;
                    }
                    long occurrences = Long.parseLong(parts[2]);
                    if (prevText == null || prevText.equals(text)) {
                        // still the same n-gram: keep summing over the years
                        docCount += occurrences;
                    } else {
                        // a new n-gram starts: flush the finished one
                        writer.addDoc(prevText, docCount);
                        if (++i % 5000 == 0) {
                            this.printStats(i, inputFile, docCount, lineCount, prevText, startTime, totalBytes);
                        }
                        docCount = occurrences;
                    }
                }
                prevText = text;
            }
            // In corpus mode an n-gram is only written when its successor shows up, so the
            // last aggregated n-gram of the file is still pending here and must be flushed.
            if (!hiveMode && prevText != null) {
                writer.addDoc(prevText, docCount);
                ++i;
            }
            this.printStats(i, inputFile, docCount, lineCount, prevText, startTime, totalBytes);
        }
        writer.addTotalTokenCountDoc(this.totalTokenCount);
    }

    /**
     * Decides whether an n-gram is treated as carrying a part-of-speech tag.
     *
     * <p>Any text containing an underscore counts as tagged, unless its first underscore
     * begins the marker {@code _START_} or {@code _END_}.
     *
     * @param text first column of an input line
     * @return {@code true} if the text contains an underscore that does not start one of the two markers
     */
    private boolean isRealPosTag(String text) {
        int underscoreAt = text.indexOf('_');
        if (underscoreAt < 0) {
            return false;
        }
        // startsWith(prefix, offset) is false when the prefix would run past the end of the text
        boolean sentenceMarker = text.startsWith("_START_", underscoreAt)
                || text.startsWith("_END_", underscoreAt);
        return !sentenceMarker;
    }

    /**
     * Prints one progress line: overall percentage, input file name, documents written,
     * lines read, the given n-gram with its count, and the average time per document.
     *
     * @param i number of documents written so far for this input file
     * @param inputFile file currently being read
     * @param docCount count belonging to {@code prevText}
     * @param lineCount number of lines read so far from {@code inputFile}
     * @param prevText n-gram shown in the progress line (may be {@code null})
     * @param startTimeMicros {@code System.nanoTime() / 1000} taken when reading started
     * @param totalBytes summed size of all inputs
     */
    private void printStats(int i, File inputFile, long docCount, long lineCount, String prevText, long startTimeMicros, long totalBytes) {
        long elapsedMicros = System.nanoTime() / 1000L - startTimeMicros;
        // Divide in floating point: integer division would truncate the average before it is
        // formatted. Math.max guards against a division by zero when nothing was written yet.
        double microsPerDoc = (double)elapsedMicros / Math.max(1, i);
        NumberFormat format = NumberFormat.getNumberInstance(Locale.US);
        float progress = (float)this.bytesProcessed.get() / (float)totalBytes * 100.0f;
        System.out.printf("%.2f%% input:%s doc:%s line:%s ngram:%s occ:%s (%.0f\u00b5s/doc)\n",
                progress, inputFile.getName(), format.format(i), format.format(lineCount),
                prevText, format.format(docCount), microsPerDoc);
    }

    /**
     * Command-line entry point.
     *
     * <p>Expects exactly three arguments: the output mode ({@code text} or {@code lucene}),
     * the input directory and the output directory. With any other argument count a usage
     * message is printed and the JVM exits with status 1.
     *
     * @param args command-line arguments
     * @throws RuntimeException if the mode argument is neither {@code text} nor {@code lucene}
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            String toolName = FrequencyIndexCreator.class.getSimpleName();
            System.out.println("Usage: " + toolName + " <text|lucene> <inputDir> <outputDir>");
            System.out.println("    <text|lucene> 'text' will write plain text files, 'lucene' will write Lucene indexes");
            System.out.println("    <inputDir> is the Google ngram data, optionally already aggregated by Hive (lucene mode),");
            System.out.println("               please see https://dev.languagetool.org/finding-errors-using-n-gram-data");
            System.exit(1);
        }
        String modeArg = args[0];
        Mode mode;
        switch (modeArg) {
            case "text":
                mode = Mode.PlainText;
                break;
            case "lucene":
                mode = Mode.Lucene;
                break;
            default:
                throw new RuntimeException("Unknown mode: " + modeArg);
        }
        FrequencyIndexCreator creator = new FrequencyIndexCreator(mode);
        System.out.println("Mode: " + mode);
        System.out.println("Minimum year: 1910");
        System.out.println("Ignore POS tags: true");
        File inputDir = new File(args[1]);
        File outputDir = new File(args[2]);
        creator.run(inputDir, outputDir);
    }

    private static enum Mode {
        PlainText,
        Lucene;

    }

    /**
     * Writes n-grams as tab-separated lines ({@code text<TAB>count}) to the file
     * {@code <dirName>-output.csv} inside the given directory.
     *
     * <p>The file is written as UTF-8, matching the UTF-8 decoding used when the input
     * files are read, instead of depending on the platform default charset. All output
     * goes through a single buffered writer.
     */
    static class TextDataWriter extends DataWriter {
        private final BufferedWriter writer;

        /**
         * Opens the output file, creating {@code indexDir} first if it does not exist.
         *
         * @param indexDir directory that receives the output file
         * @throws RuntimeException if the directory does not exist and cannot be created
         * @throws IOException if the output file cannot be opened
         */
        TextDataWriter(File indexDir) throws IOException {
            if (indexDir.exists()) {
                System.out.println("Using existing dir: " + indexDir.getAbsolutePath());
            } else if (!indexDir.mkdir()) {
                throw new RuntimeException("Could not create: " + indexDir.getAbsolutePath());
            }
            File output = new File(indexDir, indexDir.getName() + "-output.csv");
            this.writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(output), StandardCharsets.UTF_8));
        }

        /** Appends one {@code text<TAB>count} line. */
        @Override
        void addDoc(String text, long count) throws IOException {
            // one write call per line so that lines from concurrent callers are not interleaved
            this.writer.write(text + "\t" + count + "\n");
        }

        /** The total token count is not part of the text output; it is only reported on stderr. */
        @Override
        void addTotalTokenCountDoc(long totalTokenCount) throws IOException {
            System.err.println("Note: not writing totalTokenCount (" + totalTokenCount + ") in file mode");
        }

        /** Flushes buffered lines and closes the output file. */
        @Override
        public void close() throws Exception {
            // closing the buffered writer flushes it and closes the underlying stream
            this.writer.close();
        }
    }

    /**
     * Writes n-grams into a Lucene index: one document per n-gram with an indexed,
     * unstored {@code ngram} field and a stored {@code count} field.
     *
     * <p>This is an inner class because it adds every written count to the enclosing
     * creator's {@code totalTokenCount}.
     */
    class LuceneDataWriter extends DataWriter {
        IndexWriter writer;
        /** Kept so it can be closed; closing the IndexWriter does not close its Directory. */
        private final Directory directory;

        /**
         * Opens {@code indexDir} as a new index, replacing any index already present there
         * ({@code OpenMode.CREATE}).
         *
         * @param indexDir directory that holds the index
         * @throws IOException if the directory or the index writer cannot be opened
         */
        LuceneDataWriter(File indexDir) throws IOException {
            IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
            config.setUseCompoundFile(false);
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            this.directory = FSDirectory.open(indexDir.toPath());
            try {
                this.writer = new IndexWriter(this.directory, config);
            } catch (IOException | RuntimeException e) {
                // do not leak the already opened directory when the writer cannot be created
                try {
                    this.directory.close();
                } catch (IOException closeFailure) {
                    e.addSuppressed(closeFailure);
                }
                throw e;
            }
        }

        /**
         * Adds one n-gram document and adds {@code count} to the enclosing creator's total.
         * N-grams longer than 1000 characters are reported on stderr and skipped.
         */
        @Override
        void addDoc(String text, long count) throws IOException {
            if (text.length() > 1000) {
                System.err.println("Ignoring doc, ngram is > 1000 chars: " + text.substring(0, 50) + "...");
                return;
            }
            Document doc = new Document();
            doc.add(new Field("ngram", text, StringField.TYPE_NOT_STORED));
            FieldType countType = new FieldType();
            countType.setStored(true);
            doc.add(new Field("count", String.valueOf(count), countType));
            // One writer instance is shared by the parallel stream in run(), so the
            // read-modify-write on the plain long field must not interleave between threads.
            synchronized (FrequencyIndexCreator.this) {
                FrequencyIndexCreator.this.totalTokenCount += count;
            }
            this.writer.addDocument(doc);
        }

        /** Adds a document whose only field is the stored {@code totalTokenCount}. */
        @Override
        void addTotalTokenCountDoc(long totalTokenCount) throws IOException {
            FieldType fieldType = new FieldType();
            fieldType.setIndexOptions(IndexOptions.DOCS);
            fieldType.setStored(true);
            Document doc = new Document();
            doc.add(new Field("totalTokenCount", String.valueOf(totalTokenCount), fieldType));
            this.writer.addDocument(doc);
        }

        /** Closes the index writer and then the directory, even if closing the writer fails. */
        @Override
        public void close() throws Exception {
            try {
                if (this.writer != null) {
                    this.writer.close();
                }
            } finally {
                this.directory.close();
            }
        }
    }

    /**
     * Destination for n-gram counts. Implementations are closed via
     * {@link AutoCloseable#close()} once all input has been written.
     */
    abstract static class DataWriter implements AutoCloseable {

        /**
         * Records one n-gram together with its count.
         *
         * @param text the n-gram
         * @param count the count belonging to {@code text}
         * @throws IOException if writing fails
         */
        abstract void addDoc(String text, long count) throws IOException;

        /**
         * Records the total token count.
         *
         * @param totalTokenCount the total to record
         * @throws IOException if writing fails
         */
        abstract void addTotalTokenCountDoc(long totalTokenCount) throws IOException;
    }
}

