/*
 * Decompiled with CFR 0.152.
 */
package gate.creole.tokeniser;

import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ANNIEConstants;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ExecutionInterruptedException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.creole.tokeniser.DFSMState;
import gate.creole.tokeniser.FSMState;
import gate.creole.tokeniser.InvalidRuleException;
import gate.creole.tokeniser.TokeniserException;
import gate.creole.tokeniser.UnicodeType;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.AbstractCollection;
import java.util.AbstractSet;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;

@CreoleResource(name="GATE Unicode Tokeniser", comment="A customisable Unicode tokeniser.", helpURL="http://gate.ac.uk/userguide/sec:annie:tokeniser", icon="tokeniser")
public class SimpleTokeniser
extends AbstractLanguageAnalyser
implements ANNIEConstants {
    private static final long serialVersionUID = 1411111968361716069L;
    /** Parameter-name constant with the value "document". */
    public static final String SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";
    /** Parameter-name constant with the value "annotationSetName". */
    public static final String SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
    /** Parameter-name constant with the value "rulesURL". */
    public static final String SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";
    /** Parameter-name constant with the value "encoding". */
    public static final String SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";
    /** Name of the annotation set that receives the tokens; null or "" selects the document's default set. */
    protected String annotationSetName;
    /** Start state of the non-deterministic FSM built from the rules file by init(). */
    protected FSMState initialState;
    /**
     * States of the non-deterministic FSM; iterated by getFSMgml().
     * This class never adds to it directly - presumably FSMState registers itself (TODO confirm).
     */
    protected Set<FSMState> fsmStates = new HashSet<FSMState>();
    /** Start state of the deterministic FSM produced by eliminateVoidTransitions(); execute() starts here. */
    protected DFSMState dInitialState;
    /**
     * States of the deterministic FSM; iterated by getDFSMgml().
     * This class never adds to it directly - presumably DFSMState registers itself (TODO confirm).
     */
    protected Set<DFSMState> dfsmStates = new HashSet<DFSMState>();
    /** Token that separates a rule's left-hand side from its right-hand side; set to ">" in the static initialiser. */
    static String LHStoRHS;
    /** Tokens skipped while parsing a rule's left-hand side: space, tab and form feed. */
    protected static final Set<String> ignoreTokens;
    /** Maps the value of a java.lang.Character byte constant (as returned by Character.getType) to an internal type id. */
    protected static final Map<Integer, Integer> typeIds;
    /**
     * Upper bound (exclusive) used when looping over internal type ids.
     * Computed as the number of non-DIRECTIONALITY static fields of Character minus one.
     */
    protected static final int maxTypeId;
    /** Character constant names indexed by internal type id; trailing slots without a type are null. */
    protected static final List<String> typeMnemonics;
    /** Maps a Character constant name (e.g. "UPPERCASE_LETTER") to its internal type id. */
    protected static final Map<String, Integer> stringTypeIds;
    /** Set and returned by the rulesResourceName accessors; not read anywhere else in this class. */
    private String rulesResourceName;
    /** Location of the rules file read by init(). */
    private ResourceReference rulesURL;
    /** Character encoding passed to the reader that loads the rules file. */
    private String encoding;
    /** Scratch map used by eliminateVoidTransitions(): set of NFA states -> the deterministic state standing for it. */
    protected transient Map<Set<FSMState>, DFSMState> newStates = new HashMap<Set<FSMState>, DFSMState>();

    /**
     * Loads the tokeniser rules and builds the finite state machines.
     *
     * <p>The rules file is read line by line. A line ending in a backslash is a
     * continuation: the backslash is dropped and the following line is appended
     * to it. Each completed logical line is passed to {@link #parseRule(String)};
     * once all rules are read, {@link #eliminateVoidTransitions()} converts the
     * non-deterministic machine into the deterministic one.
     *
     * @return this resource
     * @throws ResourceInstantiationException if no rules URL was set, the rules
     *         cannot be read, or a rule is invalid
     */
    public Resource init() throws ResourceInstantiationException {
        if (this.rulesURL == null) {
            throw new ResourceInstantiationException("No URL provided for the rules!");
        }
        try (BufferedReader bRulesReader = new BufferedReader((Reader)new BomStrippingInputStreamReader(this.rulesURL.openStream(), this.encoding))) {
            this.initialState = new FSMState(this);
            StringBuilder toParse = new StringBuilder(1024);
            for (String line = bRulesReader.readLine(); line != null; line = bRulesReader.readLine()) {
                if (line.endsWith("\\")) {
                    // Continuation line: keep accumulating, minus the trailing backslash.
                    toParse.append(line, 0, line.length() - 1);
                } else {
                    toParse.append(line);
                    this.parseRule(toParse.toString());
                    toParse.setLength(0);
                }
            }
            // If the file ends on a continuation line, the accumulated text used to be
            // dropped silently; parse it so the rule is either used or reported as invalid.
            if (toParse.length() > 0) {
                this.parseRule(toParse.toString());
            }
            this.eliminateVoidTransitions();
        }
        catch (IOException ioe) {
            throw new ResourceInstantiationException((Exception)ioe);
        }
        catch (TokeniserException te) {
            throw new ResourceInstantiationException((Exception)te);
        }
        return this;
    }

    /** Clears the reference to the current document; called at the end of execute(). */
    public void reset() {
        this.document = null;
    }

    /**
     * Parses a single logical rule line and wires it into the non-deterministic FSM.
     *
     * <p>Lines that start with "#" or "//" are treated as comments and ignored.
     * Otherwise a fresh state is linked from the initial state by an empty
     * transition, the left-hand side is parsed up to the LHS/RHS separator, and
     * whatever text follows (if any) is stored as the right-hand side of the
     * state the left-hand side ended in.
     *
     * @param line the rule text, with continuation lines already joined
     * @throws TokeniserException if the left-hand side is malformed
     */
    void parseRule(String line) throws TokeniserException {
        boolean isComment = line.startsWith("#") || line.startsWith("//");
        if (isComment) {
            return;
        }
        StringTokenizer tokens = new StringTokenizer(line, "()+*|\" \t\f>", true);
        FSMState ruleStart = new FSMState(this);
        this.initialState.put(null, ruleStart);
        FSMState ruleEnd = this.parseLHS(ruleStart, tokens, LHStoRHS);
        // Switching the delimiter set to form feed makes the tokenizer hand back
        // the remainder of the line in one piece.
        String rhs = tokens.hasMoreTokens() ? tokens.nextToken("\f") : "";
        if (!rhs.isEmpty()) {
            ruleEnd.setRhs(rhs);
        }
    }

    /**
     * Parses (part of) a rule's left-hand side and extends the non-deterministic FSM.
     *
     * <p>Recognised constructs: a type name (bare or inside double quotes), a
     * parenthesised group (handled recursively), alternatives separated by "|",
     * and the postfix operators "+" (one or more) and "*" (zero or more).
     * Space, tab and form feed tokens are skipped.
     *
     * @param startState the state the parsed fragment starts from
     * @param st the tokenizer positioned at the fragment
     * @param until the token that terminates this fragment (the LHS/RHS separator, or ")" for a group)
     * @return the state reached after the fragment; {@code startState} itself if no tokens are left
     * @throws TokeniserException if a type name is unknown or the rule ends before {@code until} is seen
     */
    FSMState parseLHS(FSMState startState, StringTokenizer st, String until) throws TokeniserException {
        FSMState currentState = startState;
        boolean inAlternation = false;
        List<FSMState> alternativeEnds = new LinkedList<FSMState>();

        String token = SimpleTokeniser.skipIgnoreTokens(st);
        if (token == null) {
            return currentState;
        }
        while (!token.equals(until)) {
            FSMState newState;
            if ("(".equals(token)) {
                // Nested group: recurse until the matching ")".
                newState = this.parseLHS(currentState, st, ")");
            } else {
                // A type name, either quoted or given as a bare token.
                String sType = "\"".equals(token) ? this.parseQuotedString(st, "\"") : token;
                newState = new FSMState(this);
                Integer typeId = stringTypeIds.get(sType);
                if (typeId == null) {
                    throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
                }
                currentState.put(new UnicodeType(typeId), newState);
            }

            token = SimpleTokeniser.skipIgnoreTokens(st);
            if (token == null) {
                throw new InvalidRuleException("Tokeniser rule ended too soon!");
            }

            if ("|".equals(token)) {
                // Remember where this alternative ended; the next one starts again
                // from the unchanged currentState.
                inAlternation = true;
                alternativeEnds.add(newState);
                token = SimpleTokeniser.skipIgnoreTokens(st);
                if (token == null) {
                    throw new InvalidRuleException("Tokeniser rule ended too soon!");
                }
                continue;
            }

            if (inAlternation) {
                // Last alternative seen: join every alternative's end state into one.
                inAlternation = false;
                alternativeEnds.add(newState);
                newState = new FSMState(this);
                for (FSMState alternativeEnd : alternativeEnds) {
                    alternativeEnd.put(null, newState);
                }
                alternativeEnds.clear();
            }

            boolean oneOrMore = "+".equals(token);
            if (oneOrMore || "*".equals(token)) {
                if (!oneOrMore) {
                    // "*" additionally allows skipping the element altogether.
                    currentState.put(null, newState);
                }
                // Loop back so the element can repeat.
                newState.put(null, currentState);
                currentState = newState;
                newState = new FSMState(this);
                currentState.put(null, newState);
                token = SimpleTokeniser.skipIgnoreTokens(st);
                if (token == null) {
                    throw new InvalidRuleException("Tokeniser rule ended too soon!");
                }
            }
            currentState = newState;
        }
        return currentState;
    }

    /**
     * Concatenates tokens until the closing delimiter is met.
     *
     * @param st the tokenizer, positioned just after the opening quote
     * @param until the token that closes the quoted string
     * @return the text between the quotes, or {@code null} if the tokenizer is already exhausted
     * @throws TokeniserException if the input runs out before {@code until} is found
     */
    String parseQuotedString(StringTokenizer st, String until) throws TokeniserException {
        if (!st.hasMoreElements()) {
            return null;
        }
        StringBuilder quoted = new StringBuilder(1024);
        for (String piece = st.nextToken(); !piece.equals(until); piece = st.nextToken()) {
            quoted.append(piece);
            if (!st.hasMoreElements()) {
                throw new InvalidRuleException("Tokeniser rule ended too soon!");
            }
        }
        return quoted.toString();
    }

    /**
     * Returns the next token that is not listed in {@code ignoreTokens}.
     *
     * @param st the tokenizer to read from
     * @return the first non-ignorable token, or {@code null} if the tokenizer runs out
     */
    protected static String skipIgnoreTokens(StringTokenizer st) {
        while (st.hasMoreTokens()) {
            String candidate = st.nextToken();
            if (!ignoreTokens.contains(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Computes the closure of a set of states under empty (null-labelled) transitions.
     *
     * @param s the starting states; not modified
     * @return a new set holding {@code s} plus every state reachable from it through empty transitions only
     */
    private AbstractSet<FSMState> lambdaClosure(Set<FSMState> s) {
        HashSet<FSMState> closure = new HashSet<FSMState>(s);
        LinkedList<FSMState> pending = new LinkedList<FSMState>(s);
        while (!pending.isEmpty()) {
            Set<FSMState> reachable = pending.removeFirst().nextSet(null);
            if (reachable == null) {
                continue;
            }
            for (FSMState candidate : reachable) {
                // add() is true only for states not seen before; those still need expanding.
                if (closure.add(candidate)) {
                    pending.addFirst(candidate);
                }
            }
        }
        return closure;
    }

    /**
     * Converts the non-deterministic FSM (with empty transitions) rooted at
     * {@code initialState} into a deterministic one rooted at {@code dInitialState},
     * using subset construction.
     *
     * <p>Each deterministic state stands for a set of non-deterministic states;
     * {@code newStates} records that correspondence. Starting from the closure of
     * the initial state, every unprocessed set is expanded once per type id: the
     * closure of all states reachable on that type becomes the target, and a new
     * deterministic state is created for it the first time it is seen.
     *
     * @throws TokeniserException if building a token description for a final state fails
     */
    void eliminateVoidTransitions() throws TokeniserException {
        this.newStates.clear();
        LinkedList<Set<FSMState>> unmarkedDStates = new LinkedList<Set<FSMState>>();

        DFSMState dCurrentState = new DFSMState(this);
        Set<FSMState> sdCurrentState = new HashSet<FSMState>();
        sdCurrentState.add(this.initialState);
        sdCurrentState = this.lambdaClosure(sdCurrentState);
        this.newStates.put(sdCurrentState, dCurrentState);
        SimpleTokeniser.assignRhs(sdCurrentState, dCurrentState);
        unmarkedDStates.addFirst(sdCurrentState);
        this.dInitialState = dCurrentState;

        while (!unmarkedDStates.isEmpty()) {
            sdCurrentState = unmarkedDStates.removeFirst();
            for (int type = 0; type < maxTypeId; ++type) {
                // Collect everything reachable from the current set on this type.
                Set<FSMState> nextSet = new HashSet<FSMState>();
                for (FSMState innerState : sdCurrentState) {
                    Set<FSMState> tempSet = innerState.nextSet(type);
                    if (tempSet != null) {
                        nextSet.addAll(tempSet);
                    }
                }
                if (nextSet.isEmpty()) {
                    continue;
                }
                nextSet = this.lambdaClosure(nextSet);
                dCurrentState = this.newStates.get(nextSet);
                if (dCurrentState == null) {
                    // First time this set of states is seen: create its deterministic state
                    // and queue the set for expansion.
                    dCurrentState = new DFSMState(this);
                    unmarkedDStates.add(nextSet);
                    SimpleTokeniser.assignRhs(nextSet, dCurrentState);
                    this.newStates.put(nextSet, dCurrentState);
                }
                this.newStates.get(sdCurrentState).put(type, dCurrentState);
            }
        }
    }

    /**
     * Copies the right-hand side of the final states in {@code innerStates} onto
     * {@code dState}. When several final states carry different right-hand sides,
     * a warning is printed and the one encountered last in iteration order wins.
     * If at least one final state was found, the token description is built.
     */
    private static void assignRhs(Set<FSMState> innerStates, DFSMState dState) throws TokeniserException {
        Set<String> rhsClashSet = new HashSet<String>();
        boolean newRhs = false;
        for (FSMState innerState : innerStates) {
            if (!innerState.isFinal()) {
                continue;
            }
            String rhs = innerState.getRhs();
            rhsClashSet.add(rhs);
            dState.rhs = rhs;
            newRhs = true;
        }
        if (rhsClashSet.size() > 1) {
            Err.println("Warning, rule clash: " + rhsClashSet + "\nSelected last definition: " + dState.rhs);
        }
        if (newRhs) {
            dState.buildTokenDesc();
        }
    }

    /**
     * Renders the non-deterministic FSM as a GML graph description.
     *
     * <p>Every state in {@code fsmStates} becomes a node labelled with its index
     * (final states also show ",F" and their right-hand side); the edges are
     * whatever each state's {@code getEdgesGML()} returns.
     *
     * @return the GML text
     */
    public String getFSMgml() {
        StringBuilder nodeSection = new StringBuilder(1024);
        StringBuilder edgeSection = new StringBuilder(1024);
        for (FSMState state : this.fsmStates) {
            int id = state.getIndex();
            nodeSection.append("node[ id ").append(id).append(" label \"").append(id);
            if (state.isFinal()) {
                nodeSection.append(",F\\n" + state.getRhs());
            }
            nodeSection.append("\"  ]\n");
            edgeSection.append(state.getEdgesGML());
        }
        return "graph[ \ndirected 1\n" + nodeSection.toString() + edgeSection.toString() + "]\n";
    }

    /**
     * Renders the deterministic FSM as a GML graph description.
     *
     * <p>Every state in {@code dfsmStates} becomes a node labelled with its index
     * (final states also show ",F" and their right-hand side); the edges are
     * whatever each state's {@code getEdgesGML()} returns.
     *
     * @return the GML text
     */
    public String getDFSMgml() {
        StringBuilder nodeSection = new StringBuilder(1024);
        StringBuilder edgeSection = new StringBuilder(1024);
        for (DFSMState state : this.dfsmStates) {
            int id = state.getIndex();
            nodeSection.append("node[ id ").append(id).append(" label \"").append(id);
            if (state.isFinal()) {
                nodeSection.append(",F\\n" + state.getRhs());
            }
            nodeSection.append("\"  ]\n");
            edgeSection.append(state.getEdgesGML());
        }
        return "graph[ \ndirected 1\n" + nodeSection.toString() + edgeSection.toString() + "]\n";
    }

    /**
     * Tokenises the current document by running its text through the deterministic FSM.
     *
     * <p>The text is consumed one Unicode code point at a time. Each code point's
     * {@link Character#getType(int) general category} is mapped to an internal
     * type id and used to follow a transition. When no transition exists:
     * <ul>
     *   <li>if a final state was passed since the token started, an annotation
     *       described by that state is added for the text consumed so far;</li>
     *   <li>otherwise the single code point at the token start is annotated as
     *       "DEFAULT_TOKEN" with type "UNKNOWN" and scanning resumes right after it.</li>
     * </ul>
     * Progress is reported, and interruption checked, roughly every 256 characters.
     * When finished the document reference is cleared via {@link #reset()}.
     *
     * @throws ExecutionException if no document is set, or (as
     *         ExecutionInterruptedException) if the run was interrupted
     */
    public void execute() throws ExecutionException {
        this.interrupted = false;
        if (this.document == null) {
            throw new ExecutionException("No document to tokenise!");
        }
        AnnotationSet annotationSet = this.annotationSetName == null || this.annotationSetName.equals("") ? this.document.getAnnotations() : this.document.getAnnotations(this.annotationSetName);
        this.fireStatusChanged("Tokenising " + this.document.getName() + "...");
        String content = this.document.getContent().toString();
        int length = content.length();
        DFSMState graphPosition = this.dInitialState;
        int tokenStart = 0;
        DFSMState lastMatchingState = null;
        int charIdx = 0;
        int oldCharIdx = 0;
        while (charIdx < length) {
            int currentChar = content.codePointAt(charIdx);
            DFSMState nextState = graphPosition.next(typeIds.get(Character.getType(currentChar)));
            if (null != nextState) {
                graphPosition = nextState;
                if (graphPosition.isFinal()) {
                    lastMatchingState = graphPosition;
                }
                charIdx += Character.charCount(currentChar);
            } else {
                if (null == lastMatchingState) {
                    // Nothing matched: emit exactly one code point, measured at tokenStart.
                    // (Using the width of the code point at the failure position could
                    // split a surrogate pair when the two widths differ.)
                    charIdx = tokenStart + Character.charCount(content.codePointAt(tokenStart));
                    String tokenString = content.substring(tokenStart, charIdx);
                    FeatureMap newTokenFm = Factory.newFeatureMap();
                    newTokenFm.put("type", "UNKNOWN");
                    newTokenFm.put("string", tokenString);
                    newTokenFm.put("length", Integer.toString(tokenString.length()));
                    try {
                        annotationSet.add(Long.valueOf(tokenStart), Long.valueOf(charIdx), "DEFAULT_TOKEN", newTokenFm);
                    }
                    catch (InvalidOffsetException ioe) {
                        // Offsets come straight from the content, so this is not expected;
                        // kept as best-effort reporting rather than aborting the run.
                        ioe.printStackTrace(Err.getPrintWriter());
                    }
                } else {
                    // NOTE(review): the token spans up to the failure position, not up to
                    // where lastMatchingState was reached - behaviour kept as in the original.
                    SimpleTokeniser.addMatchedToken(annotationSet, content, tokenStart, charIdx, lastMatchingState);
                }
                lastMatchingState = null;
                graphPosition = this.dInitialState;
                tokenStart = charIdx;
            }
            if (charIdx - oldCharIdx > 256) {
                // long arithmetic: 100 * charIdx overflows int for very large documents
                this.fireProgressChanged((int)(100L * charIdx / length));
                oldCharIdx = charIdx;
                if (this.isInterrupted()) {
                    throw new ExecutionInterruptedException();
                }
            }
        }
        // Flush the token in progress when the text ends.
        // NOTE(review): a trailing partial match that never reached a final state is
        // not annotated - behaviour kept as in the original.
        if (null != lastMatchingState) {
            SimpleTokeniser.addMatchedToken(annotationSet, content, tokenStart, charIdx, lastMatchingState);
        }
        this.reset();
        this.fireProcessFinished();
        this.fireStatusChanged("Tokenisation complete!");
    }

    /**
     * Adds the annotation for content[start, end) as described by a final state:
     * row 0 of the token description supplies the annotation type, the remaining
     * rows are name/value feature pairs; "string" and "length" are always set.
     */
    private static void addMatchedToken(AnnotationSet annotationSet, String content, int start, int end, DFSMState matched) {
        String tokenString = content.substring(start, end);
        FeatureMap features = Factory.newFeatureMap();
        features.put("string", tokenString);
        features.put("length", Integer.toString(tokenString.length()));
        String[][] tokenDesc = matched.getTokenDesc();
        for (int i = 1; i < tokenDesc.length; ++i) {
            features.put(tokenDesc[i][0], tokenDesc[i][1]);
        }
        try {
            annotationSet.add(Long.valueOf(start), Long.valueOf(end), tokenDesc[0][0], features);
        }
        catch (InvalidOffsetException ioe) {
            // Same message as before, but keep the original exception as the cause.
            GateRuntimeException wrapped = new GateRuntimeException(ioe.toString());
            wrapped.initCause(ioe);
            throw wrapped;
        }
    }

    /**
     * Sets the location of the rules file that init() reads.
     *
     * @param newRulesURL reference to the rules file
     */
    @CreoleParameter(defaultValue="resources/tokeniser/DefaultTokeniser.rules", comment="The URL to the rules file", suffixes="rules")
    public void setRulesURL(ResourceReference newRulesURL) {
        this.rulesURL = newRulesURL;
    }

    /**
     * Sets the location of the rules file from a plain URL by wrapping it in a
     * ResourceReference.
     *
     * @param newRulesURL URL of the rules file
     * @throws RuntimeException if the URL cannot be converted to a ResourceReference
     * @deprecated use {@link #setRulesURL(ResourceReference)} instead
     */
    @Deprecated
    public void setRulesURL(URL newRulesURL) {
        try {
            this.setRulesURL(new ResourceReference(newRulesURL));
        }
        catch (URISyntaxException e) {
            throw new RuntimeException("Error converting URL to ResourceReference", e);
        }
    }

    /** @return the reference to the rules file, or null if none has been set */
    public ResourceReference getRulesURL() {
        return this.rulesURL;
    }

    /**
     * Sets the name of the annotation set that receives the generated annotations.
     * execute() falls back to the document's default set when this is null or empty.
     *
     * @param newAnnotationSetName the annotation set name; may be null
     */
    @RunTime
    @Optional
    @CreoleParameter(comment="The annotation set to be used for the generated annotations")
    public void setAnnotationSetName(String newAnnotationSetName) {
        this.annotationSetName = newAnnotationSetName;
    }

    /** @return the configured annotation set name; may be null */
    public String getAnnotationSetName() {
        return this.annotationSetName;
    }

    /**
     * Stores a rules resource name. The stored value is only returned by
     * getRulesResourceName(); nothing else in this class reads it.
     *
     * @param newRulesResourceName the name to store
     */
    public void setRulesResourceName(String newRulesResourceName) {
        this.rulesResourceName = newRulesResourceName;
    }

    /** @return the stored rules resource name; may be null */
    public String getRulesResourceName() {
        return this.rulesResourceName;
    }

    /**
     * Sets the character encoding used by init() to read the rules file.
     *
     * @param newEncoding the encoding name
     */
    @CreoleParameter(defaultValue="UTF-8", comment="The encoding used for reading the definitions")
    public void setEncoding(String newEncoding) {
        this.encoding = newEncoding;
    }

    /** @return the encoding used to read the rules file; may be null if never set */
    public String getEncoding() {
        return this.encoding;
    }

    static {
        Field[] characterClassFields;
        LHStoRHS = ">";
        try {
            characterClassFields = Class.forName("java.lang.Character").getFields();
        }
        catch (ClassNotFoundException cnfe) {
            throw new GateRuntimeException("Could not find the java.lang.Character class!");
        }
        LinkedList<Field> staticFields = new LinkedList<Field>();
        for (int i = 0; i < characterClassFields.length; ++i) {
            if (!Modifier.isStatic(characterClassFields[i].getModifiers()) || characterClassFields[i].getName().indexOf("DIRECTIONALITY") != -1) continue;
            staticFields.add(characterClassFields[i]);
        }
        HashMap<Integer, Integer> tempTypeIds = new HashMap<Integer, Integer>();
        maxTypeId = staticFields.size() - 1;
        String[] mnemonics = new String[maxTypeId + 1];
        HashMap<String, Integer> tempStringTypeIds = new HashMap<String, Integer>();
        Iterator staticFieldsIter = staticFields.iterator();
        int currentId = 0;
        try {
            while (staticFieldsIter.hasNext()) {
                Field currentField = (Field)staticFieldsIter.next();
                if (!currentField.getType().toString().equals("byte")) continue;
                String fieldName = currentField.getName();
                tempTypeIds.put(currentField.getInt(null), currentId);
                mnemonics[currentId] = fieldName;
                tempStringTypeIds.put(fieldName, currentId);
                ++currentId;
            }
        }
        catch (Exception e) {
            throw new GateRuntimeException(e.toString());
        }
        typeIds = Collections.unmodifiableMap(tempTypeIds);
        stringTypeIds = Collections.unmodifiableMap(tempStringTypeIds);
        HashSet<String> toIgnore = new HashSet<String>();
        toIgnore.add(" ");
        toIgnore.add("\t");
        toIgnore.add("\f");
        ignoreTokens = Collections.unmodifiableSet(toIgnore);
        typeMnemonics = Collections.unmodifiableList(Arrays.asList(mnemonics));
    }
}

