package org.deeplearning4j.text.tokenization.tokenizer.preprocessor;

import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.ints.IntSet;
import java.text.Normalizer;
import java.util.List;
import java.util.Map;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;

/* loaded from: input_file:org/deeplearning4j/text/tokenization/tokenizer/preprocessor/BertWordPiecePreProcessor.class */
/**
 * Token pre-processor that cleans up text before BERT WordPiece tokenization.
 *
 * <p>Depending on configuration, {@link #preProcess(String)} will:
 * <ul>
 *   <li>decompose the input (NFD) and drop non-spacing marks (accent stripping),</li>
 *   <li>drop NUL, U+FFFD and control/format characters,</li>
 *   <li>lower-case every remaining code point,</li>
 *   <li>replace whitespace with a single ASCII space,</li>
 *   <li>drop code points that do not occur in any vocabulary key,</li>
 *   <li>surround CJK ideographs with spaces.</li>
 * </ul>
 */
public class BertWordPiecePreProcessor implements TokenPreProcess {
    /** Unicode replacement character (U+FFFD); always removed from the input. */
    public static final char REPLACEMENT_CHAR = '\uFFFD';

    /** Whether code points are lower-cased via {@link Character#toLowerCase(int)}. */
    protected final boolean lowerCase;
    /** Whether the input is NFD-normalized and non-spacing marks are removed. */
    protected final boolean stripAccents;
    /** Code points found in the vocabulary keys, or {@code null} to disable filtering. */
    protected final IntSet charSet;

    /**
     * Creates a pre-processor that neither lower-cases nor strips accents and that does not
     * filter characters against a vocabulary.
     */
    public BertWordPiecePreProcessor() {
        this(false, false, null);
    }

    /**
     * Creates a pre-processor.
     *
     * @param lowerCase    if {@code true}, every retained code point is lower-cased
     * @param stripAccents if {@code true}, the input is NFD-normalized and non-spacing marks
     *                     are dropped
     * @param vocab        map whose keys supply the set of allowed code points (the values are
     *                     not read); {@code null} disables character filtering
     */
    public BertWordPiecePreProcessor(boolean lowerCase, boolean stripAccents, Map<String, Integer> vocab) {
        this.lowerCase = lowerCase;
        this.stripAccents = stripAccents;
        if (vocab == null) {
            this.charSet = null;
            return;
        }
        this.charSet = new IntOpenHashSet();
        for (String entry : vocab.keySet()) {
            // Walk the key by code point (not by char) so supplementary characters are
            // recorded as a single entry. The loop is bounded by the key length; the previous
            // form never terminated once the key had been fully consumed.
            int offset = 0;
            while (offset < entry.length()) {
                int cp = entry.codePointAt(offset);
                offset += Character.charCount(cp);
                this.charSet.add(cp);
            }
        }
    }

    /**
     * Cleans the given text according to this pre-processor's configuration.
     *
     * @param token the text to clean; must not be {@code null}
     * @return the cleaned text
     */
    @Override // org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess
    public String preProcess(String token) {
        if (this.stripAccents) {
            // Decompose so that accents become separate non-spacing marks that can be dropped.
            token = Normalizer.normalize(token, Normalizer.Form.NFD);
        }
        StringBuilder sb = new StringBuilder();
        int offset = 0;
        while (offset < token.length()) {
            int cp = token.codePointAt(offset);
            offset += Character.charCount(cp);

            // Remove NUL, the replacement character, control characters and (optionally) accents.
            if (cp == 0 || cp == REPLACEMENT_CHAR || isControlCharacter(cp)
                    || (this.stripAccents && Character.getType(cp) == Character.NON_SPACING_MARK)) {
                continue;
            }

            if (this.lowerCase) {
                cp = Character.toLowerCase(cp);
            }

            // Any whitespace is emitted as a plain space, regardless of the vocabulary filter.
            if (isWhiteSpace(cp)) {
                sb.append(' ');
                continue;
            }

            // Skip code points that never occur in the vocabulary keys.
            if (this.charSet != null && !this.charSet.contains(cp)) {
                continue;
            }

            if (isChineseCharacter(cp)) {
                // Pad CJK ideographs with spaces on both sides.
                sb.append(' ');
                sb.appendCodePoint(cp);
                sb.append(' ');
            } else {
                sb.appendCodePoint(cp);
            }
        }
        return sb.toString();
    }

    /**
     * Tests whether a code point is a control or format character. Tab, line feed and carriage
     * return are explicitly excluded.
     *
     * @param cp the code point to test
     * @return {@code true} if the code point's general category is {@code CONTROL} or
     *         {@code FORMAT} and it is not tab, LF or CR
     */
    public static boolean isControlCharacter(int cp) {
        if (cp == '\t' || cp == '\n' || cp == '\r') {
            return false;
        }
        int type = Character.getType(cp);
        return type == Character.CONTROL || type == Character.FORMAT;
    }

    /**
     * Tests whether a code point counts as whitespace: tab, line feed, carriage return, or any
     * code point whose general category is {@code SPACE_SEPARATOR}.
     *
     * @param cp the code point to test
     * @return {@code true} if the code point is treated as whitespace
     */
    public static boolean isWhiteSpace(int cp) {
        return cp == '\t' || cp == '\n' || cp == '\r'
                || Character.getType(cp) == Character.SPACE_SEPARATOR;
    }

    /**
     * Tests whether a code point lies in one of the CJK ideograph ranges checked below.
     *
     * @param cp the code point to test
     * @return {@code true} if the code point falls inside any of the listed ranges
     */
    public static boolean isChineseCharacter(int cp) {
        return (cp >= 0x4E00 && cp <= 0x9FFF)        // CJK Unified Ideographs
                || (cp >= 0x3400 && cp <= 0x4DBF)    // Extension A
                || (cp >= 0x20000 && cp <= 0x2A6DF)  // Extension B
                || (cp >= 0x2A700 && cp <= 0x2B73F)  // Extension C
                || (cp >= 0x2B740 && cp <= 0x2B81F)  // Extension D
                || (cp >= 0x2B820 && cp <= 0x2CEAF)  // Extension E
                || (cp >= 0xF900 && cp <= 0xFAFF)    // CJK Compatibility Ideographs
                || (cp >= 0x2F800 && cp <= 0x2FA1F); // CJK Compatibility Ideographs Supplement
    }

    /**
     * Joins a list of tokens back into a single string.
     *
     * <p>Tokens starting with {@code "##"} are appended to the preceding text with the prefix
     * removed. Every other token is preceded by a single space, except when no such token has
     * been appended yet or when the token is exactly {@code "."}.
     *
     * @param tokens the tokens to join
     * @return the reconstructed text
     */
    public static String reconstructFromTokens(List<String> tokens) {
        StringBuilder sb = new StringBuilder();
        // Stays true until the first non-"##" token has been written, suppressing its leading space.
        boolean first = true;
        for (String token : tokens) {
            if (token.startsWith("##")) {
                sb.append(token.substring(2));
            } else {
                if (!first && !".".equals(token)) {
                    sb.append(" ");
                }
                sb.append(token);
                first = false;
            }
        }
        return sb.toString();
    }
}
