/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.eval.app.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.FileAttribute;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.tika.eval.app.tools.SlowCompositeReaderWrapper;
import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.utils.ProcessUtils;

public class TopCommonTokenCounter {
    /** Name of the single Lucene field that every input line is indexed under. */
    private static final String FIELD = "f";

    /**
     * Tokens written verbatim (one per line, without counts) at the top of the
     * output file, ahead of the counted terms. Left reassignable because the
     * field is package-visible.
     */
    static Set<String> INCLUDE_LIST = new HashSet<String>(Arrays.asList("___url___", "___email___"));

    /**
     * Terms that are never admitted to the top-N queue, however frequent they are.
     * NOTE(review): these look like HTML tag/attribute names - confirm intent.
     * Left reassignable because the field is package-visible.
     */
    static Set<String> SKIP_LIST = new HashSet<String>(Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname", "lang", "style", "script", "strong", "blockquote", "form", "iframe", "section", "colspan", "rowspan"));

    /** License header written as the first lines of the output file; every line is a '#' comment. */
    private static final String LICENSE = "# Licensed to the Apache Software Foundation (ASF) under one or more\n# contributor license agreements.  See the NOTICE file distributed with\n# this work for additional information regarding copyright ownership.\n# The ASF licenses this file to You under the Apache License, Version 2.0\n# (the \"License\"); you may not use this file except in compliance with\n# the License.  You may obtain a copy of the License at\n#\n#     http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n";

    /** Maximum number of terms kept in the priority queue and written to the output file. */
    private static final int TOP_N = 30000;

    /** Terms whose document frequency is below this value are ignored; a value below 0 disables the filter. */
    private static final int MIN_DOC_FREQ = 10;

    /**
     * Command-line entry point.
     *
     * <p>{@code args[0]} is the common-tokens file to create; every following
     * argument is an input file (each passed through
     * {@code ProcessUtils.unescapeCommandLine} before being turned into a path).
     * If the output file already exists, a message is printed to stderr and
     * nothing is processed.
     *
     * @param args output file followed by the input files
     * @throws IllegalArgumentException if no output file argument is given
     * @throws Exception if reading the inputs, indexing, or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException on args[0].
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException("Usage: TopCommonTokenCounter <common_tokens_file> [<input_file> ...]");
        }
        Path commonTokensFile = Paths.get(args[0]);
        List<Path> inputFiles = new ArrayList<>();
        for (int i = 1; i < args.length; ++i) {
            inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i])));
        }
        if (Files.exists(commonTokensFile)) {
            System.err.println(commonTokensFile.getFileName().toString() + " exists. I'm skipping this.");
            return;
        }
        TopCommonTokenCounter counter = new TopCommonTokenCounter();
        counter.execute(commonTokensFile, inputFiles);
    }

    /**
     * Writes the common-tokens file: the license header, four {@code #}-prefixed
     * corpus statistics lines, a column header, the {@code INCLUDE_LIST} tokens,
     * and then one tab-separated {@code token/df/tf} row per queue entry in the
     * order returned by {@code queue.getArray()}.
     *
     * <p>If {@code path} is already a regular file, a message is printed to
     * stderr and nothing is written. Calling {@code getArray()} pops every entry,
     * so the queue is empty afterwards.
     *
     * @param path              output file; missing parent directories are created
     * @param totalDocs         value written after {@code #DOC_COUNT}
     * @param sumDocFreqs       value written after {@code #SUM_DOC_FREQS}
     * @param sumTotalTermFreqs value written after {@code #SUM_TERM_FREQS}
     * @param uniqueTerms       value written after {@code #UNIQUE_TERMS}
     * @param queue             the collected top terms
     * @throws IOException if the directories or the file cannot be created or written
     */
    private static void writeTopN(Path path, long totalDocs, long sumDocFreqs, long sumTotalTermFreqs, long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException {
        if (Files.isRegularFile(path)) {
            System.err.println("File " + path.getFileName() + " already exists. Skipping.");
            return;
        }
        // A bare file name such as "common_tokens.txt" has no parent component;
        // getParent() is null then and there is nothing to create.
        Path parent = path.getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            StringBuilder sb = new StringBuilder();
            writer.write(LICENSE);
            writer.write("#DOC_COUNT\t" + totalDocs + "\n");
            writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
            writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
            writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
            writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
            // Include-list tokens are written without counts and, unlike the other
            // lines, end with the platform line separator (behavior kept as-is).
            for (String t : INCLUDE_LIST) {
                writer.write(t);
                writer.newLine();
            }
            for (TokenDFTF tp : queue.getArray()) {
                writer.write(TopCommonTokenCounter.getRow(sb, tp) + "\n");
            }
            // No explicit flush: closing the writer at the end of the try block flushes it.
        }
    }

    /**
     * Formats one output row as {@code token<TAB>df<TAB>tf}, with the token
     * passed through {@code clean} first.
     *
     * @param sb scratch buffer; its previous content is discarded
     * @param tp the term to format
     * @return the formatted row, without a trailing line break
     */
    private static String getRow(StringBuilder sb, TokenDFTF tp) {
        // Reuse the caller's buffer instead of allocating one per row.
        sb.setLength(0);
        return sb.append(TopCommonTokenCounter.clean(tp.token))
                .append("\t")
                .append(tp.df)
                .append("\t")
                .append(tp.tf)
                .toString();
    }

    /**
     * Collapses every run of whitespace in {@code s} to a single space and trims
     * the result.
     *
     * @param s the text to normalize; may be {@code null}
     * @return the normalized text, or the empty string when {@code s} is {@code null}
     */
    private static String clean(String s) {
        return s == null ? "" : s.replaceAll("\\s+", " ").trim();
    }

    /**
     * Builds a temporary Lucene index with one document per input line, collects
     * the highest-document-frequency terms of the indexed field, and writes them
     * to {@code commonTokensFile}.
     *
     * <p>Terms with a document frequency below {@code MIN_DOC_FREQ} and terms in
     * {@code SKIP_LIST} are never queued; at most {@code TOP_N} terms are kept.
     * The temporary index directory is deleted before the output file is written,
     * whether or not indexing succeeded.
     *
     * @param commonTokensFile output file, also named in the progress messages
     * @param inputFiles       files to index; a name ending in {@code .gz} is read as gzip
     * @throws Exception if reading, indexing, or writing fails
     */
    private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
        TokenDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
        long totalDocs = -1L;
        long sumDocFreqs = -1L;
        long sumTotalTermFreqs = -1L;
        // Starts at 0 (not -1) so that one increment per term yields the true count.
        long uniqueTerms = 0L;
        try (FSDirectory directory = FSDirectory.open(luceneDir)) {
            AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
            Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
                this.indexInputFiles(writer, inputFiles, commonTokensFile);
                writer.commit();
                writer.flush();
            }
            try (DirectoryReader reader = DirectoryReader.open(directory)) {
                LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
                totalDocs = wrappedReader.getDocCount(FIELD);
                sumDocFreqs = wrappedReader.getSumDocFreq(FIELD);
                sumTotalTermFreqs = wrappedReader.getSumTotalTermFreq(FIELD);
                Terms terms = wrappedReader.terms(FIELD);
                // terms may be null when nothing was indexed for the field
                // (no input files, or only empty ones); there is nothing to enumerate then.
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator();
                    for (BytesRef bytesRef = termsEnum.next(); bytesRef != null; bytesRef = termsEnum.next()) {
                        ++uniqueTerms;
                        int df = termsEnum.docFreq();
                        long tf = termsEnum.totalTermFreq();
                        if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                            continue;
                        }
                        // Once the queue is full, a term below the current minimum df
                        // cannot get in; skip it before paying for the UTF-8 decode.
                        TokenDFTF lowest = queue.top();
                        if (lowest != null && queue.size() >= TOP_N && df < lowest.df) {
                            continue;
                        }
                        String t = bytesRef.utf8ToString();
                        if (!SKIP_LIST.contains(t)) {
                            queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                        }
                    }
                }
            }
        } finally {
            FileUtils.deleteDirectory(luceneDir.toFile());
        }
        TopCommonTokenCounter.writeTopN(commonTokensFile, totalDocs, sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
    }

    /**
     * Adds one document per line of every input file to {@code writer}.
     * Documents are buffered and handed to the writer whenever the buffered
     * lines exceed 1,000,000 characters in total; the remainder is added at the end.
     *
     * @param writer           open index writer that receives the documents
     * @param inputFiles       files to read, in order
     * @param commonTokensFile only used in the progress messages
     * @throws IOException if a file cannot be read or the index cannot be written
     */
    private void indexInputFiles(IndexWriter writer, List<Path> inputFiles, Path commonTokensFile) throws IOException {
        final int maxLen = 1000000;
        int len = 0;
        List<Document> docs = new ArrayList<>();
        for (Path inputFile : inputFiles) {
            // For "-sentences.txt" files everything up to and including the first tab is dropped.
            // NOTE(review): presumably a leading sentence id in Leipzig corpus files - confirm.
            boolean isLeipzig = inputFile.getFileName().toString().contains("-sentences.txt");
            int lines = 0;
            try (BufferedReader reader = this.getReader(inputFile)) {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    if (isLeipzig) {
                        int tab = line.indexOf("\t");
                        if (tab > -1) {
                            line = line.substring(tab + 1);
                        }
                    }
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    docs.add(document);
                    len += line.length();
                    if (len > maxLen) {
                        writer.addDocuments(docs);
                        docs.clear();
                        len = 0;
                    }
                    if (++lines % 100000 == 0) {
                        System.out.println("processed " + lines + " for " + inputFile.getFileName() + " :: " + commonTokensFile.toAbsolutePath());
                    }
                }
            }
        }
        if (!docs.isEmpty()) {
            writer.addDocuments(docs);
        }
    }

    /**
     * Opens {@code inputFile} as UTF-8 text, decompressing it first when the
     * path ends with {@code .gz}.
     *
     * @param inputFile the file to read
     * @return a buffered reader the caller must close
     * @throws IOException if the file cannot be opened or the gzip stream cannot be initialized
     */
    private BufferedReader getReader(Path inputFile) throws IOException {
        InputStream is = Files.newInputStream(inputFile);
        try {
            if (inputFile.toString().endsWith(".gz")) {
                is = new GzipCompressorInputStream(is);
            }
            return new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            // Wrapping failed (e.g. a ".gz" file that is not gzip data): close the
            // stream opened so far so the file handle is not leaked.
            try {
                is.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Base class for bounded priority queues of {@link TokenDFTF} entries;
     * subclasses supply the ordering.
     */
    private static abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {

        /**
         * @param maxSize capacity handed to the underlying priority queue
         */
        AbstractTokenTFDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        /**
         * Pops every entry into an array, filling it from the last slot towards
         * the first, so the first entry popped ends up last in the result.
         * The queue is empty when this returns.
         *
         * @return the drained entries, in reverse pop order
         */
        public TokenDFTF[] getArray() {
            TokenDFTF[] drained = new TokenDFTF[size()];
            int slot = drained.length - 1;
            for (TokenDFTF next = pop(); next != null && slot > -1; next = pop()) {
                drained[slot--] = next;
            }
            return drained;
        }
    }

    /**
     * Immutable value holder for a token together with its document frequency
     * ({@code df}) and total term frequency ({@code tf}).
     */
    private static class TokenDFTF {
        final String token;
        final int df;
        final long tf;

        /**
         * @param token the term text
         * @param df    document frequency of the term
         * @param tf    total term frequency of the term
         */
        public TokenDFTF(String token, int df, long tf) {
            this.token = token;
            this.df = df;
            this.tf = tf;
        }

        /** @return the total term frequency */
        public long getTF() {
            return tf;
        }

        /** @return the document frequency */
        public int getDF() {
            return df;
        }

        /** @return the term text */
        public String getToken() {
            return token;
        }

        /** Two instances are equal when they have the same class, token, df and tf. */
        @Override
        public boolean equals(Object o) {
            if (o == this) {
                return true;
            }
            if (o == null || o.getClass() != getClass()) {
                return false;
            }
            TokenDFTF that = (TokenDFTF) o;
            return df == that.df && tf == that.tf && Objects.equals(token, that.token);
        }

        /** Combines token (0 when null), df and tf with the usual multiplier of 31. */
        @Override
        public int hashCode() {
            int hash = Objects.hashCode(token);
            hash = 31 * hash + df;
            // Long.hashCode folds the upper 32 bits into the lower 32 bits.
            return 31 * hash + Long.hashCode(tf);
        }

        @Override
        public String toString() {
            return new StringBuilder("TokenDFTF{token='")
                    .append(token)
                    .append("', df=")
                    .append(df)
                    .append(", tf=")
                    .append(tf)
                    .append("}")
                    .toString();
        }
    }

    /**
     * Priority queue that orders {@link TokenDFTF} entries by document frequency.
     * Draining is inherited from {@link AbstractTokenTFDFPriorityQueue#getArray()};
     * the former override here was a line-for-line copy of that method.
     */
    private static class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {

        /**
         * @param maxSize capacity handed to the underlying priority queue
         */
        TokenDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        /**
         * Orders by document frequency: the entry with the lower {@code df} is
         * "less". When the frequencies are equal, the entry whose token compares
         * greater (per {@link String#compareTo}) is "less".
         */
        @Override
        protected boolean lessThan(TokenDFTF arg0, TokenDFTF arg1) {
            if (arg0.df != arg1.df) {
                return arg0.df < arg1.df;
            }
            return arg1.token.compareTo(arg0.token) < 0;
        }
    }
}

