/*
 * Decompiled with CFR 0.152.
 */
package com.knuddels.jtokkit;

import com.knuddels.jtokkit.Cl100kParser;
import com.knuddels.jtokkit.GptBytePairEncoding;
import com.knuddels.jtokkit.api.Encoding;
import com.knuddels.jtokkit.api.GptBytePairEncodingParams;
import com.knuddels.jtokkit.api.IntArrayList;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
 * Static factory for the built-in {@link Encoding} instances.
 *
 * <p>Each factory method loads a rank table from a {@code .tiktoken} classpath resource,
 * pairs it with the special-token table for that encoding, and wraps both in a
 * {@link GptBytePairEncodingParams}. The class is not instantiable.
 */
class EncodingFactory {
    private static final Map<String, Integer> SPECIAL_TOKENS_O200K_BASE;
    private static final Map<String, Integer> SPECIAL_TOKENS_CL100K_BASE;
    private static final Map<String, Integer> SPECIAL_TOKENS_X50K_BASE;
    private static final Map<String, Integer> SPECIAL_TOKENS_P50K_EDIT;
    private static final String ENDOFTEXT = "<|endoftext|>";
    private static final String FIM_PREFIX = "<|fim_prefix|>";
    private static final String FIM_MIDDLE = "<|fim_middle|>";
    private static final String FIM_SUFFIX = "<|fim_suffix|>";
    private static final String ENDOFPROMPT = "<|endofprompt|>";

    private EncodingFactory() {
    }

    /**
     * Creates the {@code r50k_base} encoding from its bundled rank file.
     *
     * @return a new encoding instance
     */
    static Encoding r50kBase() {
        return EncodingFactory.from50kParameters("r50k_base", "/com/knuddels/jtokkit/r50k_base.tiktoken", SPECIAL_TOKENS_X50K_BASE);
    }

    /**
     * Creates the {@code p50k_base} encoding from its bundled rank file.
     *
     * @return a new encoding instance
     */
    static Encoding p50kBase() {
        return EncodingFactory.from50kParameters("p50k_base", "/com/knuddels/jtokkit/p50k_base.tiktoken", SPECIAL_TOKENS_X50K_BASE);
    }

    /**
     * Creates the {@code p50k_edit} encoding. It reads the same rank file as
     * {@link #p50kBase()} and differs only in its special-token table.
     *
     * @return a new encoding instance
     */
    static Encoding p50kEdit() {
        return EncodingFactory.from50kParameters("p50k_edit", "/com/knuddels/jtokkit/p50k_base.tiktoken", SPECIAL_TOKENS_P50K_EDIT);
    }

    /**
     * Creates the {@code cl100k_base} encoding.
     *
     * <p>No regex is supplied ({@code null}); the returned subclass splits text with
     * {@link Cl100kParser} instead.
     *
     * @return a new encoding instance
     */
    static Encoding cl100kBase() {
        Map<byte[], Integer> mergeableRanks = EncodingFactory.loadMergeableRanks("/com/knuddels/jtokkit/cl100k_base.tiktoken");
        GptBytePairEncodingParams params = new GptBytePairEncodingParams("cl100k_base", null, mergeableRanks, SPECIAL_TOKENS_CL100K_BASE);
        return new Cl100kGptBytePairEncoding(params);
    }

    /**
     * Creates the {@code o200k_base} encoding. Its split pattern is the alternation
     * of the sub-patterns below, joined with {@code |} in the listed order.
     *
     * @return a new encoding instance
     */
    static Encoding o200kBase() {
        Map<byte[], Integer> mergeableRanks = EncodingFactory.loadMergeableRanks("/com/knuddels/jtokkit/o200k_base.tiktoken");
        String patternString = String.join("|",
                "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
                "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
                "\\p{N}{1,3}",
                " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
                "\\s*[\\r\\n]+",
                "\\s+(?!\\S)",
                "\\s+");
        Pattern regex = EncodingFactory.compileRegex(patternString, false);
        GptBytePairEncodingParams params = new GptBytePairEncodingParams("o200k_base", regex, mergeableRanks, SPECIAL_TOKENS_O200K_BASE);
        return EncodingFactory.fromParameters(params);
    }

    /**
     * Wraps the given parameters in a plain {@link GptBytePairEncoding}.
     *
     * @param parameters the encoding name, split pattern, rank table and special tokens
     * @return a new encoding instance
     */
    static Encoding fromParameters(GptBytePairEncodingParams parameters) {
        return new GptBytePairEncoding(parameters);
    }

    /**
     * Shared construction path for the 50k-family encodings, which all use the same
     * split pattern and differ only in name, rank file and special tokens.
     *
     * @param name          encoding name handed to the parameters object
     * @param fileName      classpath resource holding the rank table
     * @param specialTokens special-token table for this encoding
     * @return a new encoding instance
     */
    private static Encoding from50kParameters(String name, String fileName, Map<String, Integer> specialTokens) {
        Pattern regex = EncodingFactory.compileRegex("'(?:[sdmt]|ll|ve|re)| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+", false);
        Map<byte[], Integer> mergeableRanks = EncodingFactory.loadMergeableRanks(fileName);
        GptBytePairEncodingParams params = new GptBytePairEncodingParams(name, regex, mergeableRanks, specialTokens);
        return EncodingFactory.fromParameters(params);
    }

    /**
     * Compiles {@code patternString}, preferring {@link Pattern#UNICODE_CHARACTER_CLASS}.
     *
     * <p>If the first compilation throws {@link IllegalArgumentException}, the pattern
     * is compiled again without that flag. A pattern that is invalid in itself will
     * therefore still fail on the second attempt.
     *
     * @param patternString   the regular expression source
     * @param caseInsensitive whether to add {@link Pattern#CASE_INSENSITIVE}
     * @return the compiled pattern
     */
    static Pattern compileRegex(String patternString, boolean caseInsensitive) {
        int caseFlag = caseInsensitive ? Pattern.CASE_INSENSITIVE : 0;
        try {
            return Pattern.compile(patternString, Pattern.UNICODE_CHARACTER_CLASS | caseFlag);
        }
        catch (IllegalArgumentException exception) {
            // Fallback: retry with only the case flag, dropping UNICODE_CHARACTER_CLASS.
            return Pattern.compile(patternString, caseFlag);
        }
    }

    /**
     * Loads a rank table from a classpath resource.
     *
     * <p>Each line must have the form {@code <base64-token> <rank>}: everything before
     * the first space is Base64-decoded into the token bytes, and the remainder is
     * parsed as a decimal integer rank. Entries keep file order.
     *
     * <p>Note that the keys are {@code byte[]} instances, which use identity-based
     * {@code equals}/{@code hashCode}; the returned map is meant to be iterated, not
     * queried with freshly built arrays.
     *
     * @param fileName classpath resource name, resolved relative to this class
     * @return the token-to-rank table in file order
     * @throws IllegalStateException    if the resource is missing, cannot be read, or
     *                                  contains a line without a space separator
     * @throws IllegalArgumentException if a token is not valid Base64 or a rank is not
     *                                  a valid integer (the latter as
     *                                  {@link NumberFormatException})
     */
    static Map<byte[], Integer> loadMergeableRanks(String fileName) {
        try (InputStream in = EncodingFactory.class.getResourceAsStream(fileName)) {
            if (in == null) {
                throw new IllegalStateException("Could not find " + fileName + " in resources");
            }
            Map<byte[], Integer> mergeableRanks = new LinkedHashMap<>();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    int firstSpaceIndex = line.indexOf(' ');
                    // Explicit check rather than an assert: assertions are normally disabled
                    // at runtime, and substring(0, -1) would fail with an unhelpful message.
                    if (firstSpaceIndex == -1) {
                        throw new IllegalStateException("Invalid line in " + fileName + ": " + line);
                    }
                    byte[] token = Base64.getDecoder().decode(line.substring(0, firstSpaceIndex));
                    int rank = Integer.parseInt(line.substring(firstSpaceIndex + 1));
                    mergeableRanks.put(token, rank);
                }
            }
            return mergeableRanks;
        }
        catch (IOException e) {
            throw new IllegalStateException("Could not load " + fileName + " from resources", e);
        }
    }

    // Populate the special-token tables; each is published as an unmodifiable view.
    static {
        Map<String, Integer> x50kBase = new HashMap<>();
        x50kBase.put(ENDOFTEXT, 50256);
        SPECIAL_TOKENS_X50K_BASE = Collections.unmodifiableMap(x50kBase);

        Map<String, Integer> p50kEdit = new HashMap<>();
        p50kEdit.put(ENDOFTEXT, 50256);
        p50kEdit.put(FIM_PREFIX, 50281);
        p50kEdit.put(FIM_MIDDLE, 50282);
        p50kEdit.put(FIM_SUFFIX, 50283);
        SPECIAL_TOKENS_P50K_EDIT = Collections.unmodifiableMap(p50kEdit);

        Map<String, Integer> cl100kBase = new HashMap<>();
        cl100kBase.put(ENDOFTEXT, 100257);
        cl100kBase.put(FIM_PREFIX, 100258);
        cl100kBase.put(FIM_MIDDLE, 100259);
        cl100kBase.put(FIM_SUFFIX, 100260);
        cl100kBase.put(ENDOFPROMPT, 100276);
        SPECIAL_TOKENS_CL100K_BASE = Collections.unmodifiableMap(cl100kBase);

        Map<String, Integer> o200kBase = new HashMap<>();
        o200kBase.put(ENDOFTEXT, 199999);
        o200kBase.put(ENDOFPROMPT, 200018);
        SPECIAL_TOKENS_O200K_BASE = Collections.unmodifiableMap(o200kBase);
    }

    /**
     * {@code cl100k_base} variant that splits input with {@link Cl100kParser} rather
     * than a regex.
     */
    private static class Cl100kGptBytePairEncoding
    extends GptBytePairEncoding {
        Cl100kGptBytePairEncoding(GptBytePairEncodingParams params) {
            super(params);
        }

        /**
         * Feeds each piece produced by {@link Cl100kParser#split} to the encoder and
         * sums the counts it reports.
         *
         * @param text          the text to encode
         * @param maxTokenCount limit forwarded to the encoder; the split callback
         *                      returns {@code true} once the running total reaches it
         *                      (presumably signalling the parser to stop; confirm in
         *                      {@code Cl100kParser})
         * @param keepEncodings forwarded unchanged to the encoder
         * @param out           list forwarded to the encoder to receive tokens
         * @return the accumulated count reported by the encoder
         */
        @Override
        int encodeOrdinaryInternal(String text, int maxTokenCount, boolean keepEncodings, IntArrayList out) {
            // Single-element array so the lambda can mutate the running total.
            int[] tokenCount = new int[]{0};
            // Scratch list handed to the encoder on every call.
            IntArrayList ranks = new IntArrayList();
            Cl100kParser.split(text, utf8BytesList -> {
                tokenCount[0] += this.encoder.addTokensAndGetCount(maxTokenCount, keepEncodings, utf8BytesList.toArray(), out, ranks);
                return tokenCount[0] >= maxTokenCount;
            });
            return tokenCount[0];
        }
    }
}

