/*
 * Decompiled with CFR 0.152.
 */
package gate.twitter;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.LanguageAnalyser;
import gate.Resource;
import gate.SimpleAnnotation;
import gate.Utils;
import gate.annotation.AnnotationSetImpl;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.StringUtils;

@CreoleResource(name="Hashtag Tokenizer", icon="HashtagTokenizer", comment="Tokenizes Multi-Word Hashtags", helpURL="http://gate.ac.uk/userguide/sec:social:twitter:hashtag")
public class HashtagTokenizer
extends AbstractLanguageAnalyser {
    private static final long serialVersionUID = -7848183952807024913L;
    /**
     * Orders annotations by the length of the text span they cover, longest
     * first. Annotations covering spans of equal length compare as equal.
     */
    private static final Comparator<Annotation> lengthComparator = new Comparator<Annotation>() {

        @Override
        public int compare(Annotation a1, Annotation a2) {
            long l1 = a1.getEndNode().getOffset() - a1.getStartNode().getOffset();
            long l2 = a2.getEndNode().getOffset() - a2.getStartNode().getOffset();
            // Compare the two lengths directly instead of narrowing their
            // difference to an int, which could flip the sign on overflow.
            return Long.compare(l2, l1);
        }
    };
    // Gazetteer run over each document to find candidate words inside
    // hashtags; created (or re-initialised) in init() and deleted in cleanup().
    private LanguageAnalyser gaz;
    // Location of the gazetteer definition, passed to the gazetteer as "listsURL".
    private ResourceReference gazURL;
    // Name of the annotation set from which Hashtag, Token and lookup annotations are read.
    private String inputASName;
    // Name of the annotation set that receives the Token (and, in debug mode, HashtagToken) annotations.
    private String outputASName;
    // When true, execute() prints ambiguous segmentations, adds HashtagToken
    // annotations and keeps the HashtagLookup annotations in the input set.
    private Boolean debug = Boolean.FALSE;
    // Per-hashtag time budget in seconds; execute() only applies a limit when this is greater than 0.
    private Long timelimit;
    // Deadline (System.currentTimeMillis() based) for the hashtag currently
    // being processed; set in execute() and checked in search().
    private Long failTime;

    /**
     * Returns the current value of the debug flag.
     *
     * @return the debug flag as last set through {@link #setDebug(Boolean)}
     */
    public Boolean getDebug() {
        return debug;
    }

    /**
     * Returns the per-hashtag time limit.
     *
     * @return the configured limit in seconds
     */
    public Long getTimelimit() {
        return timelimit;
    }

    /**
     * Sets the maximum time to spend searching for a segmentation of a single
     * hashtag.
     *
     * <p>{@code execute()} only enforces a deadline when the value is greater
     * than zero, so zero as well as negative values disable the limit; the
     * parameter description states this explicitly.
     *
     * @param timelimit the limit in seconds; 0 or less means no limit
     */
    @CreoleParameter(defaultValue="5", comment="maximum number of seconds to spend processing each hashtag, set to 0 or less for no limit")
    @RunTime
    public void setTimelimit(Long timelimit) {
        this.timelimit = timelimit;
    }

    /**
     * Enables or disables debug mode.
     *
     * <p>The parameter is optional, so it may be supplied as {@code null};
     * that is stored as {@link Boolean#FALSE} so that code which unboxes the
     * flag never encounters a null value.
     *
     * @param debug the new debug flag; {@code null} is treated as {@code false}
     */
    @CreoleParameter(defaultValue="false")
    @RunTime
    @Optional
    public void setDebug(Boolean debug) {
        this.debug = debug != null ? debug : Boolean.FALSE;
    }

    /**
     * Returns the location of the gazetteer definition used by this resource.
     *
     * @return the reference last set through
     *         {@link #setGazetteerURL(ResourceReference)}
     */
    public ResourceReference getGazetteerURL() {
        return gazURL;
    }

    /**
     * Sets the location of the gazetteer definition. The value is handed to
     * the internal gazetteer as its {@code listsURL} parameter by
     * {@code init()}.
     *
     * @param gazURL reference to the gazetteer definition file
     */
    @CreoleParameter(defaultValue="resources/hashtag/gazetteer/lists.def")
    public void setGazetteerURL(ResourceReference gazURL) {
        this.gazURL = gazURL;
    }

    /**
     * Returns the name of the annotation set used as input.
     *
     * @return the input annotation set name
     */
    public String getInputASName() {
        return inputASName;
    }

    /**
     * Sets the name of the annotation set from which hashtags, tokens and
     * lookups are read.
     *
     * @param inputASName the input annotation set name
     */
    @Optional
    @RunTime
    @CreoleParameter
    public void setInputASName(String inputASName) {
        this.inputASName = inputASName;
    }

    /**
     * Returns the name of the annotation set used for output.
     *
     * @return the output annotation set name
     */
    public String getOutputASName() {
        return outputASName;
    }

    /**
     * Sets the name of the annotation set into which the new token
     * annotations are written.
     *
     * @param outputASName the output annotation set name
     */
    @Optional
    @RunTime
    @CreoleParameter
    public void setOutputASName(String outputASName) {
        this.outputASName = outputASName;
    }

    /**
     * Prepares the internal gazetteer.
     *
     * <p>On the first call a {@code gate.creole.gazetteer.DefaultGazetteer}
     * is created with the hidden attribute set on its features; on later
     * calls the existing gazetteer receives the current parameters and is
     * re-initialised. In both cases the gazetteer is configured to be case
     * insensitive, to report all matches (not only the longest) and to match
     * inside words.
     *
     * @return this resource
     * @throws ResourceInstantiationException if the gazetteer cannot be
     *         created or re-initialised
     */
    @Override
    public Resource init() throws ResourceInstantiationException {
        FeatureMap gazetteerParams = Factory.newFeatureMap();
        gazetteerParams.put("listsURL", gazURL);
        gazetteerParams.put("caseSensitive", Boolean.FALSE);
        gazetteerParams.put("longestMatchOnly", Boolean.FALSE);
        gazetteerParams.put("wholeWordsOnly", Boolean.FALSE);

        FeatureMap hiddenFeatures = Factory.newFeatureMap();
        Gate.setHiddenAttribute(hiddenFeatures, true);

        if (gaz != null) {
            // Re-use the gazetteer from a previous initialisation.
            gaz.setParameterValues(gazetteerParams);
            gaz.reInit();
            return this;
        }

        gaz = (LanguageAnalyser) Factory.createResource(
                "gate.creole.gazetteer.DefaultGazetteer",
                gazetteerParams,
                hiddenFeatures,
                "Hashtag Tokenizer Gazetteer");
        return this;
    }

    /**
     * Tokenizes every {@code Hashtag} annotation in the input annotation set.
     *
     * <p>The internal gazetteer is first run over the document. Then, for
     * each hashtag:
     * <ol>
     * <li>candidate pieces are collected from the annotations contained in
     * the hashtag ({@code HashtagLookup}, {@code Lookup}, number tokens and
     * underscore tokens);</li>
     * <li>the segmentations of the text after the {@code #} that use the
     * fewest pieces are searched for; if none is found and the text is a
     * mixed-case word it is split at case boundaries instead;</li>
     * <li>if a segmentation exists, the {@code Token} annotations contained
     * in the hashtag are removed from the input set and new {@code Token}
     * annotations (one for the {@code #} and one per piece) are added to the
     * output set; otherwise the existing tokens are left in place;</li>
     * <li>the hashtag receives a {@code tokenized} feature holding
     * {@code #} followed by the lower-cased pieces separated by U+200A.</li>
     * </ol>
     *
     * <p>In debug mode ambiguous segmentations are printed to standard
     * output and {@code HashtagToken} annotations are added as well. When not
     * in debug mode all {@code HashtagLookup} annotations are removed from
     * the input set at the end, even if processing failed.
     *
     * <p>A {@code null} time limit is treated as "no limit" and a
     * {@code null} debug flag as {@code false}.
     *
     * @throws ExecutionException if the gazetteer fails, an invalid offset is
     *         encountered, or processing is interrupted (reported as an
     *         {@link ExecutionInterruptedException})
     */
    @Override
    public void execute() throws ExecutionException {
        interrupted = false;
        AnnotationSet inputAS = document.getAnnotations(inputASName);
        AnnotationSet outputAS = document.getAnnotations(outputASName);
        // Take a null-safe snapshot so the flag is never unboxed from null.
        boolean debugging = Boolean.TRUE.equals(debug);
        long startTime = System.currentTimeMillis();
        fireStatusChanged("Tokenizing Hashtags: " + document.getName());
        fireProgressChanged(0);
        int count = 0;
        AnnotationSetImpl lookups = new AnnotationSetImpl(document);
        try {
            gaz.setParameterValue("annotationSetName", inputASName);
            gaz.setDocument(document);
            gaz.execute();
            AnnotationSet hashtags = inputAS.get("Hashtag");
            for (Annotation hashtag : hashtags) {
                Long tagStart = hashtag.getStartNode().getOffset();
                Long tagEnd = hashtag.getEndNode().getOffset();

                // Each hashtag gets its own deadline; a missing or
                // non-positive limit disables the deadline altogether.
                failTime = (timelimit != null && timelimit > 0L)
                        ? System.currentTimeMillis() + timelimit * 1000L
                        : Long.MAX_VALUE;

                collectCandidates(inputAS, hashtag, lookups);
                if (isInterrupted()) {
                    throw new ExecutionInterruptedException("The execution of the hashtag tokenizer has been abruptly interrupted!");
                }

                List<List<Annotation>> fewestTokens = findFewestTokens(lookups, hashtag);
                if (debugging && fewestTokens.size() > 1) {
                    System.out.println(Utils.stringFor(document, hashtag));
                    display(fewestTokens);
                }

                if (fewestTokens.isEmpty()) {
                    // Nothing found via the lookups: fall back to splitting a
                    // mixed-case tag at its case boundaries.
                    String hashtagText = Utils.stringFor(document, hashtag);
                    if (hashtagText.length() > 1) {
                        String tagText = hashtagText.substring(1);
                        if ("mixedCaps".equals(HashtagTokenizer.getTokenType(tagText)[1])) {
                            fewestTokens.add(splitCamelCase(lookups, tagStart + 1L, tagText));
                        }
                    }
                }

                StringBuilder normalizedHashtag = new StringBuilder();
                if (!fewestTokens.isEmpty()) {
                    // Replace the existing tokens inside the hashtag.
                    inputAS.removeAll(inputAS.get("Token").getContained(tagStart, tagEnd));
                    FeatureMap hashFeatures = Factory.newFeatureMap();
                    hashFeatures.put("string", "#");
                    hashFeatures.put("length", "1");
                    hashFeatures.put("kind", "punctuation");
                    outputAS.add(tagStart, Long.valueOf(tagStart + 1L), "Token", hashFeatures);

                    for (Annotation a : fewestTokens.get(preferredIndex(fewestTokens))) {
                        long startOffset = a.getStartNode().getOffset();
                        long endOffset = a.getEndNode().getOffset();
                        String length = Long.toString(endOffset - startOffset);
                        String string = Utils.stringFor(document, a);
                        if (normalizedHashtag.length() > 0) {
                            normalizedHashtag.append("\u200a");
                        }
                        normalizedHashtag.append(string.toLowerCase());
                        String[] tokenType = HashtagTokenizer.getTokenType(string);
                        String kind = tokenType[0];
                        String orth = tokenType[1];
                        outputAS.add(Long.valueOf(startOffset), Long.valueOf(endOffset), "Token",
                                tokenFeatures(string, length, kind, orth));
                        if (debugging) {
                            // A separate map, so the two annotations do not share features.
                            outputAS.add(Long.valueOf(startOffset), Long.valueOf(endOffset), "HashtagToken",
                                    tokenFeatures(string, length, kind, orth));
                        }
                    }
                } else {
                    // No segmentation: keep the existing tokens and only
                    // derive the normalised form from them.
                    List<Annotation> tokens = Utils.inDocumentOrder(
                            inputAS.get("Token").getContained(Long.valueOf(tagStart + 1L), tagEnd));
                    for (Annotation token : tokens) {
                        String string = Utils.stringFor(document, token);
                        if (normalizedHashtag.length() > 0) {
                            normalizedHashtag.append("\u200a");
                        }
                        normalizedHashtag.append(string.toLowerCase());
                        if (debugging) {
                            FeatureMap copy = Factory.newFeatureMap();
                            copy.putAll(token.getFeatures());
                            outputAS.add(token.getStartNode().getOffset(), token.getEndNode().getOffset(), "HashtagToken", copy);
                        }
                    }
                }
                hashtag.getFeatures().put("tokenized", normalizedHashtag.insert(0, "#").toString());
                fireProgressChanged(count++ * 100 / hashtags.size());
            }
            fireProcessFinished();
            fireStatusChanged("Hashtags in " + document.getName() + " tokenized in " + NumberFormat.getInstance().format((double)(System.currentTimeMillis() - startTime) / 1000.0) + " seconds!");
        }
        catch (InvalidOffsetException e) {
            throw new ExecutionException(e);
        }
        catch (ResourceInstantiationException e) {
            throw new ExecutionException(e);
        }
        finally {
            gaz.setDocument(null);
            if (!debugging) {
                inputAS.removeAll(inputAS.get("HashtagLookup"));
            }
        }
    }

    /**
     * Refills {@code lookups} with the candidate pieces for one hashtag: the
     * HashtagLookup and Lookup annotations, the Token annotations of kind
     * "number" and the Token annotations whose string is "_", all taken from
     * the annotations of {@code inputAS} contained within the hashtag.
     */
    private static void collectCandidates(AnnotationSet inputAS, Annotation hashtag, AnnotationSet lookups) {
        AnnotationSet contained = inputAS.getContained(hashtag.getStartNode().getOffset(), hashtag.getEndNode().getOffset());
        lookups.clear();
        lookups.addAll(contained.get("HashtagLookup"));
        lookups.addAll(contained.get("Lookup"));
        FeatureMap numberTokens = Factory.newFeatureMap();
        numberTokens.put("kind", "number");
        lookups.addAll(contained.get("Token", numberTokens));
        FeatureMap underscoreTokens = Factory.newFeatureMap();
        underscoreTokens.put("string", "_");
        lookups.addAll(contained.get("Token", underscoreTokens));
    }

    /**
     * Returns every segmentation of the hashtag text (excluding the leading
     * '#') that uses the smallest number of pieces, or an empty list when the
     * search finds none.
     */
    private List<List<Annotation>> findFewestTokens(AnnotationSet lookups, Annotation hashtag) throws InvalidOffsetException, ExecutionInterruptedException {
        List<List<Annotation>> fewestTokens = new ArrayList<List<Annotation>>();
        List<Annotation> start = sort(Utils.getAnnotationsAtOffset(lookups, Long.valueOf(hashtag.getStartNode().getOffset() + 1L)));
        for (Annotation a : start) {
            List<List<Annotation>> found = search(lookups, hashtag.getEndNode().getOffset(), a);
            if (found == null) {
                continue;
            }
            if (fewestTokens.isEmpty()) {
                fewestTokens.addAll(found);
                continue;
            }
            int foundSize = found.get(0).size();
            int bestSize = fewestTokens.get(0).size();
            if (foundSize == bestSize) {
                fewestTokens.addAll(found);
            } else if (foundSize < bestSize) {
                fewestTokens.clear();
                fewestTokens.addAll(found);
            }
        }
        return fewestTokens;
    }

    /**
     * Splits {@code tagText} at lower-to-upper case boundaries and after runs
     * of two or more capitals followed by a lower-case letter, adding one
     * CamelToken annotation per piece to {@code lookups}, starting at offset
     * {@code begin}.
     */
    private static List<Annotation> splitCamelCase(AnnotationSet lookups, long begin, String tagText) throws InvalidOffsetException {
        List<Annotation> pieces = new ArrayList<Annotation>();
        long pieceStart = begin;
        for (String token : tagText.split("((?<=[a-z])(?=[A-Z]))|((?<=[A-Z]{2,})(?=[a-z]))")) {
            long pieceEnd = pieceStart + token.length();
            pieces.add(lookups.get(lookups.add(Long.valueOf(pieceStart), Long.valueOf(pieceEnd), "CamelToken", Factory.newFeatureMap())));
            pieceStart = pieceEnd;
        }
        return pieces;
    }

    /**
     * Returns the index of the first segmentation that contains no
     * single-character piece, or 0 when every segmentation contains one.
     */
    private static int preferredIndex(List<List<Annotation>> candidates) {
        for (int i = 0; i < candidates.size(); ++i) {
            boolean hasSingleCharacterPiece = false;
            for (Annotation a : candidates.get(i)) {
                if (a.getEndNode().getOffset() - a.getStartNode().getOffset() == 1L) {
                    hasSingleCharacterPiece = true;
                    break;
                }
            }
            if (!hasSingleCharacterPiece) {
                return i;
            }
        }
        return 0;
    }

    /**
     * Builds a fresh feature map for a token annotation; the "orth" feature
     * is only set when {@code orth} is not null.
     */
    private static FeatureMap tokenFeatures(String string, String length, String kind, String orth) {
        FeatureMap features = Factory.newFeatureMap();
        features.put("string", string);
        features.put("length", length);
        features.put("kind", kind);
        if (orth != null) {
            features.put("orth", orth);
        }
        return features;
    }

    /**
     * Releases the internal gazetteer.
     *
     * <p>Does nothing when no gazetteer exists (for example because
     * {@code init()} never completed), and clears the reference afterwards so
     * that a repeated call does not try to delete the same resource twice.
     */
    @Override
    public void cleanup() {
        if (gaz != null) {
            Factory.deleteResource(gaz);
            gaz = null;
        }
    }

    /**
     * Classifies a piece of text.
     *
     * @param string the text to classify; {@code null} and the empty string
     *        are classified as a symbol
     * @return a two-element array: index 0 holds the kind ({@code "word"},
     *         {@code "number"}, {@code "punctuation"} or {@code "symbol"});
     *         index 1 holds the orthography for words ({@code "lowercase"},
     *         {@code "allCaps"}, {@code "upperInitial"} or
     *         {@code "mixedCaps"}) and is {@code null} for every other kind
     */
    private static String[] getTokenType(String string) {
        // Guard first: depending on the commons-lang version, isAlpha("") can
        // be true, which would send the empty string into substring(0, 1)
        // below; a null would fail in String.matches.
        if (string == null || string.isEmpty()) {
            return new String[]{"symbol", null};
        }

        String kind = "symbol";
        String orth = null;
        if (StringUtils.isAlpha(string)) {
            kind = "word";
            if (StringUtils.isAllLowerCase(string)) {
                orth = "lowercase";
            } else if (StringUtils.isAllUpperCase(string)) {
                orth = "allCaps";
            } else if (StringUtils.isAllUpperCase(string.substring(0, 1)) && StringUtils.isAllLowerCase(string.substring(1))) {
                orth = "upperInitial";
            } else {
                orth = "mixedCaps";
            }
        } else if (StringUtils.isNumeric(string)) {
            kind = "number";
        } else if (string.matches("(\\p{Punct})+")) {
            kind = "punctuation";
        }
        return new String[]{kind, orth};
    }

    /**
     * Prints each segmentation on its own line of standard output: three
     * spaces of indentation followed by the text of every piece, each piece
     * followed by a single space.
     *
     * @param found the segmentations to print
     */
    private void display(List<List<Annotation>> found) {
        for (List<Annotation> segmentation : found) {
            StringBuilder row = new StringBuilder("   ");
            for (Annotation piece : segmentation) {
                row.append(Utils.stringFor(document, piece)).append(" ");
            }
            row.append("\n");
            System.out.print(row.toString());
        }
    }

    /**
     * Recursively searches for the segmentations of the text between the
     * start of {@code token} and {@code endOffset} that begin with
     * {@code token} and use the fewest pieces.
     *
     * <p>Special cases:
     * <ul>
     * <li>a token whose text is a mixed-case word is rejected;</li>
     * <li>a token ending exactly at {@code endOffset} is a complete
     * segmentation on its own;</li>
     * <li>if more than one character remains after the token and all of the
     * remaining characters repeat the token's last character, a new
     * {@code HashtagLookup} annotation spanning from the token start to
     * {@code endOffset} is added to {@code lookups} and returned as the only
     * piece.</li>
     * </ul>
     *
     * @param lookups the candidate pieces; may gain a HashtagLookup annotation
     * @param endOffset the offset at which a segmentation must end
     * @param token the piece the segmentation must start with
     * @return the shortest segmentations (all of the same size), or
     *         {@code null} if there is none or the per-hashtag deadline has
     *         passed
     * @throws InvalidOffsetException if document content or a new annotation
     *         is requested with invalid offsets
     * @throws ExecutionInterruptedException if processing was interrupted
     */
    private List<List<Annotation>> search(AnnotationSet lookups, Long endOffset, Annotation token) throws InvalidOffsetException, ExecutionInterruptedException {
        if (this.isInterrupted()) {
            throw new ExecutionInterruptedException("The execution of the hashtag tokenizer has been abruptly interrupted!");
        }
        // A missing deadline is treated as "no deadline".
        if (this.failTime != null && System.currentTimeMillis() > this.failTime) {
            return null;
        }
        Document doc = lookups.getDocument();
        if ("mixedCaps".equals(HashtagTokenizer.getTokenType(Utils.stringFor(doc, token))[1])) {
            return null;
        }

        List<List<Annotation>> shortest = new ArrayList<List<Annotation>>();
        Long tokenEnd = token.getEndNode().getOffset();
        if (tokenEnd.equals(endOffset)) {
            List<Annotation> found = new ArrayList<Annotation>();
            found.add(token);
            shortest.add(found);
            return shortest;
        }

        if (endOffset - tokenEnd > 1L) {
            // The token's last character plus everything that follows it.
            String rest = doc.getContent().getContent(Long.valueOf(tokenEnd - 1L), endOffset).toString();
            if (isSingleCharacterRun(rest)) {
                Annotation newToken = lookups.get(lookups.add(token.getStartNode().getOffset(), endOffset, "HashtagLookup", Factory.newFeatureMap()));
                List<Annotation> stretched = new ArrayList<Annotation>();
                stretched.add(newToken);
                shortest.add(stretched);
                return shortest;
            }
        }

        List<Annotation> next = this.sort(Utils.getAnnotationsAtOffset(lookups, tokenEnd));
        if (next == null || next.isEmpty()) {
            return null;
        }
        for (Annotation candidate : next) {
            List<List<Annotation>> part = this.search(lookups, endOffset, candidate);
            if (part == null) {
                continue;
            }
            if (shortest.isEmpty()) {
                shortest.addAll(part);
                continue;
            }
            int partSize = part.get(0).size();
            int bestSize = shortest.get(0).size();
            if (partSize == bestSize) {
                shortest.addAll(part);
            } else if (partSize < bestSize) {
                shortest.clear();
                shortest.addAll(part);
            }
        }
        if (shortest.isEmpty()) {
            return null;
        }
        // Prefix every surviving continuation with the current token.
        for (List<Annotation> segmentation : shortest) {
            segmentation.add(0, token);
        }
        return shortest;
    }

    /**
     * Returns true when {@code text} has at least two characters and every
     * character equals the first one. Characters are compared directly rather
     * than through a regular expression built from the first character, so
     * characters that are regex metacharacters are handled like any other.
     */
    private static boolean isSingleCharacterRun(String text) {
        if (text.length() < 2) {
            return false;
        }
        char first = text.charAt(0);
        for (int i = 1; i < text.length(); i++) {
            if (text.charAt(i) != first) {
                return false;
            }
        }
        return true;
    }

    /**
     * Orders the given annotations from longest to shortest and keeps only
     * the first annotation of each distinct length.
     *
     * @param annotations the annotations to order; may be {@code null}
     * @return a new list, empty when {@code annotations} is {@code null} or
     *         empty
     */
    private List<Annotation> sort(AnnotationSet annotations) {
        List<Annotation> onePerLength = new ArrayList<Annotation>();
        if (annotations != null && !annotations.isEmpty()) {
            List<Annotation> byLength = new ArrayList<Annotation>(annotations);
            Collections.sort(byLength, lengthComparator);

            long previousLength = -1L;
            for (Annotation candidate : byLength) {
                long span = candidate.getEndNode().getOffset() - candidate.getStartNode().getOffset();
                // Equal lengths are adjacent after sorting, so comparing with
                // the previously kept length is enough to drop duplicates.
                if (previousLength == -1L || span != previousLength) {
                    onePerLength.add(candidate);
                    previousLength = span;
                }
            }
        }
        return onePerLength;
    }
}

