/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.dev.bigdata;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Set;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.language.AmericanEnglish;
import org.languagetool.rules.ConfusionSetLoader;

/**
 * Command-line tool that walks all terms of the {@code ngram} field of a Lucene
 * index and counts how many of them contain at least one word that is a key of
 * the confusion pairs loaded from {@code /<lang>/confusion_sets.txt}.
 *
 * <p>Terms containing such a word are reported as "needed", all others as
 * "not needed". Progress and the final counts are written to standard output.
 */
final class NeededNGramCounter {
    /** Short code of the language whose confusion set file is read. */
    private static final String LANG = "en";

    /** Name of the index field whose terms are inspected. */
    private static final String NGRAM_FIELD = "ngram";

    /** A progress line is printed every this many terms. */
    private static final int PROGRESS_INTERVAL = 500000;

    private NeededNGramCounter() {
    }

    /**
     * Runs the count.
     *
     * @param args exactly one element: the directory of the ngram Lucene index
     * @throws IOException if the confusion set resource or the index cannot be read,
     *     or if the index has no {@code ngram} field
     */
    public static void main(String[] args) throws IOException {
        if (args.length != 1) {
            System.out.println("Usage: " + NeededNGramCounter.class.getSimpleName() + " <ngramIndexDir>");
            System.exit(1);
        }
        Language lang = Languages.getLanguageForShortCode(LANG);
        String path = "/" + lang.getShortCode() + "/confusion_sets.txt";
        // NOTE(review): the loader is always given AmericanEnglish, independent of LANG;
        // kept as-is, but confirm before changing LANG to another language.
        Set<String> ngrams;
        try (InputStream confSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path)) {
            ngrams = new ConfusionSetLoader(AmericanEnglish.getInstance()).loadConfusionPairs(confSetStream).keySet();
        }
        String ngramIndexDir = args[0];
        // long counters are cheap insurance against wrap-around on very large indexes
        long i = 0;
        long needed = 0;
        long notNeeded = 0;
        // Both the directory and the reader hold file handles, so close them when done.
        try (FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath());
             DirectoryReader reader = DirectoryReader.open(fsDir)) {
            Fields fields = MultiFields.getFields(reader);
            // Fields.terms() returns null for an unknown field; fields itself is checked
            // defensively as well so the failure is reported instead of surfacing as an NPE.
            Terms terms = fields == null ? null : fields.terms(NGRAM_FIELD);
            if (terms == null) {
                throw new IOException("No '" + NGRAM_FIELD + "' field found in index " + ngramIndexDir);
            }
            TermsEnum termsEnum = terms.iterator();
            BytesRef next;
            while ((next = termsEnum.next()) != null) {
                if (containsAnyWord(next.utf8ToString(), ngrams)) {
                    ++needed;
                } else {
                    ++notNeeded;
                }
                if (i % PROGRESS_INTERVAL == 0) {
                    System.out.println(i + "/" + terms.getDocCount());
                }
                ++i;
            }
        }
        System.out.println("language         : " + LANG);
        System.out.println("ngram index      : " + ngramIndexDir);
        System.out.println("needed ngrams    : " + needed);
        System.out.println("not needed ngrams: " + notNeeded);
    }

    /** Returns whether any space-separated token of {@code term} is contained in {@code words}. */
    private static boolean containsAnyWord(String term, Set<String> words) {
        for (String token : term.split(" ")) {
            if (words.contains(token)) {
                return true;
            }
        }
        return false;
    }
}

