package com.robrua.nlp.bert;

import java.util.Arrays;
import java.util.Map;
import java.util.stream.Stream;

/* loaded from: input_file:com/robrua/nlp/bert/WordpieceTokenizer.class */
/**
 * Tokenizer that breaks each whitespace-delimited word into sub-word pieces found in a
 * vocabulary, using greedy longest-match-first segmentation.
 *
 * <p>The first piece of a word is looked up as-is; every following piece is looked up with the
 * {@code "##"} continuation prefix. A word longer than the configured character limit, or a word
 * that cannot be segmented completely into vocabulary entries, is replaced by the unknown token.
 *
 * <p>Only the key set of the vocabulary is consulted here; the mapped integer values are not
 * used by this class.
 */
public class WordpieceTokenizer extends Tokenizer {
    private static final int DEFAULT_MAX_CHARACTERS_PER_WORD = 200;
    private static final String DEFAULT_UNKNOWN_TOKEN = "[UNK]";
    private static final String CONTINUATION_PREFIX = "##";

    private final int maxCharactersPerWord;
    private final String unknownToken;
    private final Map<String, Integer> vocabulary;

    /**
     * Creates a tokenizer that uses {@code "[UNK]"} as the unknown token and a limit of 200
     * characters per word.
     *
     * @param map the vocabulary; its keys are the known word pieces
     */
    public WordpieceTokenizer(Map<String, Integer> map) {
        this(map, DEFAULT_UNKNOWN_TOKEN, DEFAULT_MAX_CHARACTERS_PER_WORD);
    }

    /**
     * Creates a tokenizer with an explicit unknown token and word-length limit.
     *
     * @param map the vocabulary; its keys are the known word pieces
     * @param str the token emitted for words that are too long or cannot be segmented
     * @param i the maximum number of {@code char}s a word may have before it is replaced by the
     *     unknown token
     */
    public WordpieceTokenizer(Map<String, Integer> map, String str, int i) {
        this.vocabulary = map;
        this.unknownToken = str;
        this.maxCharactersPerWord = i;
    }

    /**
     * Segments a single word into vocabulary pieces.
     *
     * @param token one word, without surrounding whitespace
     * @return the pieces of the word in order; a single unknown token if the word exceeds the
     *     length limit or cannot be segmented completely; an empty stream for an empty word
     */
    private Stream<String> splitToken(String token) {
        final char[] characters = token.toCharArray();
        if (characters.length > maxCharactersPerWord) {
            return Stream.of(unknownToken);
        }

        final Stream.Builder<String> pieces = Stream.builder();
        int start = 0;
        while (start < characters.length) {
            // Try the longest remaining substring first and shrink from the right until a
            // vocabulary entry is found.
            int end = characters.length;
            String matched = null;
            while (start < end) {
                final String text = String.valueOf(characters, start, end - start);
                final String candidate = start > 0 ? CONTINUATION_PREFIX + text : text;
                if (vocabulary.containsKey(candidate)) {
                    matched = candidate;
                    break;
                }
                end--;
            }
            if (matched == null) {
                // No piece starts at this position: the whole word is unknown. The pieces
                // collected so far are discarded so the word maps to exactly one unknown token
                // instead of a partial segmentation followed by the unknown token.
                return Stream.of(unknownToken);
            }
            pieces.accept(matched);
            start = end;
        }
        return pieces.build();
    }

    /**
     * Splits one sequence into words with the inherited {@code whitespaceTokenize} helper and
     * segments every word into pieces.
     */
    private String[] tokenizeSequence(String sequence) {
        return whitespaceTokenize(sequence)
                .flatMap(this::splitToken)
                .toArray(String[]::new);
    }

    /**
     * Tokenizes a single sequence.
     *
     * @param str the text to tokenize
     * @return the word pieces of all words in {@code str}, in order
     */
    @Override // com.robrua.nlp.bert.Tokenizer
    public String[] tokenize(String str) {
        return tokenizeSequence(str);
    }

    /**
     * Tokenizes several sequences independently.
     *
     * @param strArr the texts to tokenize
     * @return one row of word pieces per input sequence, in input order
     */
    @Override // com.robrua.nlp.bert.Tokenizer
    public String[][] tokenize(String... strArr) {
        return Arrays.stream(strArr)
                .map(this::tokenizeSequence)
                // The rows are String[] values, so the result array must be a String[][].
                .toArray(String[][]::new);
    }
}
