package com.robrua.nlp.bert;

import com.google.common.collect.ImmutableSet;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Stream;

/* loaded from: input_file:com/robrua/nlp/bert/BasicTokenizer.class */
/**
 * Tokenizer that cleans raw text, isolates CJK ideographs, optionally lower-cases and strips
 * accents, and splits punctuation into separate tokens.
 *
 * <p>The pipeline applied to every input string is:
 * <ol>
 *   <li>{@link #cleanText(String)}: drop NUL, U+FFFD and control characters, and turn every
 *       whitespace character into a plain space;</li>
 *   <li>{@link #tokenizeChineseCharacters(String)}: surround each CJK ideograph with spaces;</li>
 *   <li>{@code Tokenizer.whitespaceTokenize}: split into words (defined in the superclass, not
 *       visible here; presumably splits on runs of whitespace);</li>
 *   <li>{@link #stripAndSplit(String)}: per word, optionally lower-case and strip accents, then
 *       separate punctuation; the result is whitespace-tokenized once more.</li>
 * </ol>
 *
 * <p>NOTE(review): the package name suggests this mirrors the "basic tokenizer" of the BERT
 * reference implementation; confirm against that implementation before changing the
 * character classes below.
 */
public class BasicTokenizer extends Tokenizer {
    /** Unicode general categories that are treated as control characters. */
    private static final Set<Integer> CONTROL_CATEGORIES = ImmutableSet.of(
            (int) Character.CONTROL,
            (int) Character.FORMAT,
            (int) Character.PRIVATE_USE,
            (int) Character.SURROGATE,
            (int) Character.UNASSIGNED);

    /** Unicode general categories that are treated as punctuation. */
    private static final Set<Integer> PUNCTUATION_CATEGORIES = ImmutableSet.of(
            (int) Character.CONNECTOR_PUNCTUATION,
            (int) Character.DASH_PUNCTUATION,
            (int) Character.END_PUNCTUATION,
            (int) Character.FINAL_QUOTE_PUNCTUATION,
            (int) Character.INITIAL_QUOTE_PUNCTUATION,
            (int) Character.OTHER_PUNCTUATION,
            (int) Character.START_PUNCTUATION);

    /** Tab, line feed and carriage return: control characters that must be kept (as whitespace). */
    private static final Set<Integer> SAFE_CONTROL_CHARACTERS = ImmutableSet.of(
            (int) '\t', (int) '\n', (int) '\r');

    /** Code points that are always removed: NUL and the Unicode replacement character U+FFFD. */
    private static final Set<Integer> STRIP_CHARACTERS = ImmutableSet.of(0, 0xFFFD);

    /** Code points that count as whitespace regardless of their Unicode category. */
    private static final Set<Integer> WHITESPACE_CHARACTERS = ImmutableSet.of(
            (int) ' ', (int) '\t', (int) '\n', (int) '\r');

    /** Whether tokens are lower-cased and stripped of accents before punctuation splitting. */
    private final boolean doLowerCase;

    /**
     * Creates a tokenizer.
     *
     * @param doLowerCase if {@code true}, every token is lower-cased and has its combining
     *     accent marks removed
     */
    public BasicTokenizer(boolean doLowerCase) {
        this.doLowerCase = doLowerCase;
    }

    /**
     * Removes unwanted code points and normalizes whitespace.
     *
     * @param str the raw text
     * @return {@code str} without NUL, U+FFFD and control characters, and with every whitespace
     *     character replaced by a single ASCII space
     */
    private static String cleanText(String str) {
        return str.codePoints()
                .filter(cp -> !STRIP_CHARACTERS.contains(cp) && !isControl(cp))
                .map(cp -> isWhitespace(cp) ? ' ' : cp)
                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
                .toString();
    }

    /**
     * Tells whether a code point lies in one of the CJK ideograph blocks.
     *
     * @param codePoint the Unicode code point to test
     * @return {@code true} if the code point is inside one of the ranges listed below
     */
    private static boolean isChineseCharacter(int codePoint) {
        return (codePoint >= 0x4E00 && codePoint <= 0x9FFF)
                || (codePoint >= 0x3400 && codePoint <= 0x4DBF)
                || (codePoint >= 0x20000 && codePoint <= 0x2A6DF)
                || (codePoint >= 0x2A700 && codePoint <= 0x2B73F)
                || (codePoint >= 0x2B740 && codePoint <= 0x2B81F)
                || (codePoint >= 0x2B820 && codePoint <= 0x2CEAF)
                || (codePoint >= 0xF900 && codePoint <= 0xFAFF)
                || (codePoint >= 0x2F800 && codePoint <= 0x2FA1F);
    }

    /**
     * Tells whether a code point is a control character that should be dropped.
     *
     * @param codePoint the Unicode code point to test
     * @return {@code true} if its general category is in {@link #CONTROL_CATEGORIES} and it is
     *     not tab, line feed or carriage return
     */
    private static boolean isControl(int codePoint) {
        return !SAFE_CONTROL_CHARACTERS.contains(codePoint)
                && CONTROL_CATEGORIES.contains(Character.getType(codePoint));
    }

    /**
     * Tells whether a code point is punctuation.
     *
     * <p>Every non-alphanumeric printable ASCII character counts as punctuation, including
     * symbols such as {@code $}, {@code ^} and {@code `} whose Unicode category is not a
     * punctuation category.
     *
     * @param codePoint the Unicode code point to test
     * @return {@code true} for ASCII symbols and for any code point whose general category is in
     *     {@link #PUNCTUATION_CATEGORIES}
     */
    private static boolean isPunctuation(int codePoint) {
        boolean asciiSymbol = (codePoint >= '!' && codePoint <= '/')
                || (codePoint >= ':' && codePoint <= '@')
                || (codePoint >= '[' && codePoint <= '`')
                || (codePoint >= '{' && codePoint <= '~');
        return asciiSymbol || PUNCTUATION_CATEGORIES.contains(Character.getType(codePoint));
    }

    /**
     * Tells whether a code point is whitespace.
     *
     * @param codePoint the Unicode code point to test
     * @return {@code true} for space, tab, line feed, carriage return and any code point in the
     *     Unicode space-separator category
     */
    private static boolean isWhitespace(int codePoint) {
        return WHITESPACE_CHARACTERS.contains(codePoint)
                || Character.getType(codePoint) == Character.SPACE_SEPARATOR;
    }

    /**
     * Splits a string so that every punctuation code point becomes its own piece.
     *
     * @param str the text to split
     * @return the pieces in order of appearance; runs of non-punctuation are kept together and
     *     no empty piece is ever produced
     */
    private static Stream<String> splitOnPunctuation(String str) {
        Stream.Builder<String> pieces = Stream.builder();
        StringBuilder word = new StringBuilder();
        str.codePoints().forEachOrdered(cp -> {
            if (!isPunctuation(cp)) {
                word.appendCodePoint(cp);
                return;
            }
            // Flush the pending word only if there is one, so leading or consecutive
            // punctuation does not yield empty-string pieces.
            if (word.length() > 0) {
                pieces.accept(word.toString());
                word.setLength(0);
            }
            pieces.accept(String.valueOf(Character.toChars(cp)));
        });
        if (word.length() > 0) {
            pieces.accept(word.toString());
        }
        return pieces.build();
    }

    /**
     * Removes accents by decomposing the text (NFD) and dropping non-spacing marks.
     *
     * @param str the text to process
     * @return the NFD-normalized text without code points of category non-spacing mark
     */
    private static String stripAccents(String str) {
        return Normalizer.normalize(str, Normalizer.Form.NFD)
                .codePoints()
                .filter(cp -> Character.getType(cp) != Character.NON_SPACING_MARK)
                .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
                .toString();
    }

    /**
     * Surrounds every CJK ideograph with spaces so that each one becomes a separate token.
     *
     * @param str the text to process
     * @return the text with a space inserted before and after each CJK ideograph
     */
    private static String tokenizeChineseCharacters(String str) {
        StringBuilder spaced = new StringBuilder();
        str.codePoints().forEachOrdered(cp -> {
            if (isChineseCharacter(cp)) {
                spaced.append(' ').appendCodePoint(cp).append(' ');
            } else {
                spaced.appendCodePoint(cp);
            }
        });
        return spaced.toString();
    }

    /**
     * Normalizes a single word and separates its punctuation with spaces.
     *
     * @param str one whitespace-delimited word
     * @return the word, lower-cased and accent-stripped when {@link #doLowerCase} is set, with
     *     its punctuation pieces joined by single spaces
     */
    private String stripAndSplit(String str) {
        String word = str;
        if (this.doLowerCase) {
            // Locale.ROOT keeps the result independent of the JVM default locale
            // (e.g. Turkish would otherwise map 'I' to dotless 'ı').
            word = stripAccents(word.toLowerCase(Locale.ROOT));
        }
        return String.join(" ", splitOnPunctuation(word).toArray(String[]::new));
    }

    /**
     * Runs the full pipeline on one string; shared by both public {@code tokenize} overloads.
     *
     * @param str the raw text
     * @return the tokens of {@code str}
     */
    private String[] tokenizeSingle(String str) {
        return whitespaceTokenize(tokenizeChineseCharacters(cleanText(str)))
                .map(this::stripAndSplit)
                .flatMap(Tokenizer::whitespaceTokenize)
                .toArray(String[]::new);
    }

    /**
     * Tokenizes several strings independently.
     *
     * @param strArr the raw texts
     * @return one token array per input, in input order
     */
    @Override // com.robrua.nlp.bert.Tokenizer
    public String[][] tokenize(String... strArr) {
        // The outer array must be a String[][]; a String[] generator cannot hold String[] elements.
        return Arrays.stream(strArr)
                .map(this::tokenizeSingle)
                .toArray(String[][]::new);
    }

    /**
     * Tokenizes a single string.
     *
     * @param str the raw text
     * @return the tokens of {@code str}
     */
    @Override // com.robrua.nlp.bert.Tokenizer
    public String[] tokenize(String str) {
        return tokenizeSingle(str);
    }
}
