/*
 * Decompiled with CFR 0.152.
 */
package ch.epfl.bbp.uima.filter;

import ch.epfl.bbp.io.SVReader;
import ch.epfl.bbp.uima.BlueUima;
import ch.epfl.bbp.uima.types.TooManyOOV;
import com.google.common.collect.Maps;
import de.julielab.jules.types.Token;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

@TypeCapability(inputs={"de.julielab.jules.types.Token"}, outputs={"ch.epfl.bbp.uima.types.TooManyOOV"})
public class TooMuchOOVFilterAnnotator
extends JCasAnnotator_ImplBase {
    private static final double RATIO_CUTOFF = 0.4;
    final String frequencyFile = BlueUima.BLUE_UTILS_ROOT + "src/main/resources/preprocessing/freq_10000.tsv";
    private Map<String, Integer> frequencyMap = Maps.newHashMap();
    static final Pattern UPPER_THEN_LOWER = Pattern.compile("[A-Z]\\p{L}+");

    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        try {
            for (List line : new SVReader.TSVReader(new File(this.frequencyFile), false)) {
                String token = (String)line.get(0);
                if (token.startsWith("#")) continue;
                int freq = Integer.parseInt((String)line.get(1));
                this.frequencyMap.put(token, freq);
            }
        }
        catch (IOException e) {
            throw new ResourceInitializationException((Throwable)e);
        }
    }

    public void process(JCas jCas) throws AnalysisEngineProcessException {
        Collection tokens = JCasUtil.select((JCas)jCas, Token.class);
        if (tokens != null) {
            double tokenCnt = tokens.size();
            double freqTokens = 0.0;
            for (Token t : tokens) {
                String txt = t.getCoveredText();
                if (!this.frequencyMap.containsKey(txt.toLowerCase()) && !TooMuchOOVFilterAnnotator.isOk(txt)) continue;
                freqTokens += 1.0;
            }
            double ratio = freqTokens / tokenCnt;
            if (ratio < 0.4) {
                new TooManyOOV(jCas).addToIndexes();
            }
        }
    }

    private static boolean isOk(String txt) {
        if (UPPER_THEN_LOWER.matcher(txt).matches()) {
            return true;
        }
        int ints = 0;
        for (char c : txt.toCharArray()) {
            if (!Character.isDigit(c)) continue;
            ++ints;
        }
        return ints * 2 > txt.length();
    }
}

