/*
 * Decompiled with CFR 0.152.
 */
package org.deeplearning4j.text.tokenization.tokenizer;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.NavigableMap;
import java.util.NoSuchElementException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.deeplearning4j.text.tokenization.tokenizer.Tokenizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Tokenizer that first splits the input with {@link #splitPattern} and then breaks every
 * resulting word into word pieces by repeatedly taking the longest vocabulary entry that is a
 * prefix of the remaining text. Every piece after the first one of a word is looked up (and
 * emitted) with a leading {@code "##"} marker.
 *
 * <p>The whole input is tokenized eagerly in the constructor; the instance then iterates over
 * the precomputed list.
 */
public class BertWordPieceTokenizer implements Tokenizer {
    private static final Logger log = LoggerFactory.getLogger(BertWordPieceTokenizer.class);

    /**
     * Splits on runs of whitespace and on the zero-width boundaries directly before and after
     * every punctuation character, so each punctuation character becomes its own word.
     */
    public static final Pattern splitPattern = Pattern.compile("(\\p{javaWhitespace}|((?<=\\p{Punct})|(?=\\p{Punct})))+");

    /** Marker prepended to the remainder of a word once its first piece has been emitted. */
    private static final String CONTINUATION_PREFIX = "##";

    private final List<String> tokens;
    private TokenPreProcess tokenPreProcess;
    private final AtomicInteger cursor = new AtomicInteger(0);

    /**
     * Tokenizes {@code tokens} against {@code vocab}.
     *
     * @param tokens        the text to tokenize
     * @param vocab         vocabulary keyed by word piece; it must have an explicit comparator
     *                      that does not order {@code "a"} before {@code "b"} (reverse order),
     *                      because the longest-prefix search relies on that ordering
     * @param lowerCaseOnly when {@code true}, the text is lower-cased before it is split
     * @throws IllegalArgumentException if {@code vocab} has no comparator or its comparator
     *                                  orders {@code "a"} before {@code "b"}
     * @throws NoSuchElementException   if some part of the text cannot be matched by any
     *                                  vocabulary entry
     */
    public BertWordPieceTokenizer(String tokens, NavigableMap<String, Integer> vocab, boolean lowerCaseOnly) {
        if (vocab.comparator() == null || vocab.comparator().compare("a", "b") < 0) {
            throw new IllegalArgumentException("Vocab must use reverse sort order!");
        }
        this.tokens = this.tokenize(vocab, tokens, lowerCaseOnly);
    }

    /** @return {@code true} while the cursor has not passed the last token */
    @Override
    public boolean hasMoreTokens() {
        return this.cursor.get() < this.tokens.size();
    }

    /** @return the total number of tokens, independent of the cursor position */
    @Override
    public int countTokens() {
        return this.tokens.size();
    }

    /**
     * Returns the token at the cursor and advances the cursor by one. If a pre-processor has
     * been set, it is applied to the token before it is returned. Callers should check
     * {@link #hasMoreTokens()} first; the lookup is not bounds-checked here.
     */
    @Override
    public String nextToken() {
        String base = this.tokens.get(this.cursor.getAndIncrement());
        if (this.tokenPreProcess != null) {
            base = this.tokenPreProcess.preProcess(base);
        }
        return base;
    }

    /**
     * Returns all tokens, ignoring the cursor.
     *
     * @return a new list with the pre-processor applied to every token when a pre-processor is
     *         set; otherwise the tokenizer's own backing list (not a copy)
     */
    @Override
    public List<String> getTokens() {
        if (this.tokenPreProcess != null) {
            List<String> result = new ArrayList<String>(this.tokens.size());
            for (String token : this.tokens) {
                result.add(this.tokenPreProcess.preProcess(token));
            }
            return result;
        }
        return this.tokens;
    }

    /** Sets the pre-processor applied by {@link #nextToken()} and {@link #getTokens()}. */
    @Override
    public void setTokenPreProcessor(TokenPreProcess tokenPreProcessor) {
        this.tokenPreProcess = tokenPreProcessor;
    }

    /**
     * Splits the text into words and every word into word pieces.
     *
     * @throws NoSuchElementException if a word (or the rest of a word) has no matching
     *                                vocabulary entry, or the only match would not consume any
     *                                character of the word
     */
    private List<String> tokenize(NavigableMap<String, Integer> vocab, String toTokenzie, boolean lowerCaseOnly) {
        List<String> output = new ArrayList<String>();
        // Locale.ROOT keeps lower-casing independent of the JVM default locale
        // (e.g. a Turkish locale would otherwise map 'I' to dotless 'ı').
        String fullString = lowerCaseOnly ? toTokenzie.toLowerCase(Locale.ROOT) : toTokenzie;
        for (String word : splitPattern.split(fullString)) {
            String candidate = word;
            boolean continuation = false;
            // A bare "##" means the previous piece consumed the rest of the word.
            while (candidate.length() > 0 && !CONTINUATION_PREFIX.equals(candidate)) {
                String longestSubstring = findLongestSubstring(vocab, candidate);
                // Characters of the actual word covered by this piece (the marker does not count).
                int consumed = longestSubstring.length() - (continuation ? CONTINUATION_PREFIX.length() : 0);
                if (consumed <= 0) {
                    // Without this guard the candidate would never shrink and the loop would
                    // keep appending pieces until memory is exhausted.
                    throw new NoSuchElementException("Invalid token/character: \"" + candidate
                            + "\" - the longest matching vocabulary entry \"" + longestSubstring
                            + "\" does not consume any character of the word");
                }
                output.add(longestSubstring);
                candidate = CONTINUATION_PREFIX + candidate.substring(longestSubstring.length());
                continuation = true;
            }
        }
        return output;
    }

    /**
     * Finds the longest key of {@code vocab} that is a prefix of {@code candidate}.
     *
     * <p>Relies on {@code vocab} being in reverse order: the tail map starting at a string then
     * begins with the greatest key that is not greater than that string, which is the string
     * itself whenever it is a key.
     *
     * @param vocab     reverse-ordered vocabulary
     * @param candidate the text whose longest vocabulary prefix is wanted
     * @return the longest vocabulary key that {@code candidate} starts with
     * @throws NoSuchElementException if no vocabulary key is a prefix of {@code candidate}
     */
    protected static String findLongestSubstring(NavigableMap<String, Integer> vocab, String candidate) {
        NavigableMap<String, Integer> tailMap = vocab.tailMap(candidate, true);
        String longestSubstring = firstKeyOrFail(tailMap, candidate);
        int subStringLength = Math.min(candidate.length(), longestSubstring.length());
        while (!candidate.startsWith(longestSubstring)) {
            // Retry with the next shorter prefix of the candidate.
            --subStringLength;
            tailMap = tailMap.tailMap(candidate.substring(0, subStringLength), true);
            longestSubstring = firstKeyOrFail(tailMap, candidate);
        }
        return longestSubstring;
    }

    /** Returns the first key of {@code remaining}, or fails with a message naming {@code candidate}. */
    private static String firstKeyOrFail(NavigableMap<String, Integer> remaining, String candidate) {
        if (remaining.isEmpty()) {
            throw new NoSuchElementException("Invalid token/character: \"" + candidate
                    + "\" - no vocabulary entry is a prefix of it");
        }
        return remaining.firstKey();
    }
}

