package org.languagetool.tokenizers.fr;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.fr.FrenchTagger;
import org.languagetool.tokenizers.WordTokenizer;

/* loaded from: input_file:org/languagetool/tokenizers/fr/FrenchWordTokenizer.class */
/**
 * Word tokenizer for French text.
 *
 * <p>Tokenization runs in three stages:
 * <ol>
 *   <li>Sequences that must survive the generic split (apostrophes and hyphens between
 *       letters, decimal separators, spaces used as digit-group separators) are replaced by
 *       {@code xxFR_...xx} placeholders made only of word characters.</li>
 *   <li>The text is cut into runs of word characters and single non-word characters.</li>
 *   <li>Placeholders are restored in every token, and the token is split further with the
 *       French-specific {@link #patterns} (elisions such as {@code l'}, clitic pronouns such
 *       as {@code -t-il}) and, for remaining hyphenated words, a tagger lookup.</li>
 * </ol>
 */
public class FrenchWordTokenizer extends WordTokenizer {

    /** Case-insensitive matching that also folds non-ASCII letters (numeric value 66). */
    private static final int FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /**
     * Body of the regex character class describing what counts as part of a word.
     * Combining marks (U+0300-U+036F), the diaeresis (U+00A8), super/subscripts
     * (U+2070-U+209F), the replacement character (U+FFFD), the soft hyphen (U+00AD) and the
     * not sign (U+00AC) are written as escapes so the class does not depend on file encoding.
     */
    private static final String wordCharacters =
        "§©@€£\\$_\\p{L}\\d\\-\u0300-\u036F\u00A8\u2070-\u209F°%‰‱&\uFFFD\u00AD\u00AC";

    // Not referenced inside this class; kept unchanged.
    private final String frTokenizingChars = super.getTokenizingCharacters() + "-";

    /** Matches either a maximal run of word characters or one single non-word character. */
    private static final Pattern tokenizerPattern =
        Pattern.compile("[" + wordCharacters + "]+|[^" + wordCharacters + "]");

    private static final Pattern SOFT_HYPHEN = Pattern.compile("\u00ad");
    private static final Pattern CURLY_QUOTE = Pattern.compile("’");

    // Placeholder matchers used to undo the protection step once tokens have been cut.
    private static final Pattern APOS_TYPEW_PLACEHOLDER = Pattern.compile("xxFR_APOS_TYPEWxx");
    private static final Pattern APOS_TYPOG_PLACEHOLDER = Pattern.compile("xxFR_APOS_TYPOGxx");
    private static final Pattern HYPHEN_PLACEHOLDER = Pattern.compile("xxFR_HYPHENxx");
    private static final Pattern DECIMAL_POINT_PLACEHOLDER = Pattern.compile("xxFR_DECIMALPOINTxx");
    private static final Pattern DECIMAL_COMMA_PLACEHOLDER = Pattern.compile("xxFR_DECIMALCOMMAxx");
    private static final Pattern SPACE_PLACEHOLDER = Pattern.compile("xxFR_SPACExx");

    // Matchers for the sequences that get protected before the generic split.
    private static final Pattern TYPEWRITER_APOSTROPHE = Pattern.compile("([\\p{L}])'([\\p{L}1\"‘“«])", FLAGS);
    private static final Pattern TYPOGRAPHIC_APOSTROPHE = Pattern.compile("([\\p{L}])’([\\p{L}1\"‘“«])", FLAGS);
    private static final Pattern NEARBY_HYPHENS = Pattern.compile("([\\p{L}])-([\\p{L}])-([\\p{L}])", FLAGS);
    private static final Pattern HYPHENS = Pattern.compile("([\\p{L}])-([\\p{L}\\d])", FLAGS);
    private static final Pattern DECIMAL_POINT = Pattern.compile("([\\d])\\.([\\d])", FLAGS);
    private static final Pattern DECIMAL_COMMA = Pattern.compile("([\\d]),([\\d])", FLAGS);
    private static final Pattern SPACE_DIGITS0 = Pattern.compile("([\\d]{4}) ", FLAGS);
    private static final Pattern SPACE_DIGITS = Pattern.compile("([\\d]) ([\\d][\\d][\\d])\\b", FLAGS);
    private static final Pattern SPACE_DIGITS2 = Pattern.compile("([\\d]) ([\\d][\\d][\\d]) ([\\d][\\d][\\d])\\b", FLAGS);
    /** Temporary marker that is turned back into a plain space before the generic split. */
    private static final Pattern SPACE0 = Pattern.compile("xxFR_SPACE0xx");

    /** Lower-case hyphenated words that are always kept as a single token. */
    private static final List<String> doNotSplit = Arrays.asList("mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre", "ph-metres", "anti-ivg", "anti-uv", "anti-vih", "al-qaïda", "c'est-à-dire", "add-on", "add-ons", "rendez-vous", "garde-à-vous", "chez-eux", "chez-moi", "chez-nous", "chez-soi", "chez-toi", "chez-vous", "m'as-tu-vu");

    /** Capturing group matching an elided word followed by either apostrophe form. */
    private static final String ELISION_PREFIX =
        "([cç]['’]|j['’]|n['’]|m['’]|t['’]|s['’]|l['’]|d['’]|qu['’]|jusqu['’]|lorsqu['’]|puisqu['’]|quoiqu['’])";

    /** Capturing group matching one hyphen-attached clitic pronoun. */
    private static final String PRONOUN_SUFFIX =
        "(-ce|-elle|-t-elle|-elles|-t-elles|-en|-il|-t-il|-ils|-t-ils|-je|-la|-le|-les|-leur|-lui|-moi|-nous|-on|-t-on|-toi|-tu|-vous|-vs|-y)";

    static final int maxPatterns = 7;

    /**
     * Split patterns, tried in order; the first one that matches a token decides the split,
     * each capturing group becoming one piece. Order matters: the do-not-split list comes
     * first, the most generic pronoun pattern last.
     */
    static final Pattern[] patterns = {
        // Fixed expressions kept whole.
        Pattern.compile("^(c['’]te?|m['’]as-tu-vu|c['’]est-à-dire|add-on|add-ons|rendez-vous|garde-à-vous|chez-eux|chez-moi|chez-nous|chez-soi|chez-toi|chez-vous)$", FLAGS),
        // Elision + word + clitic pronoun.
        Pattern.compile("^" + ELISION_PREFIX + "([^\\-]*)" + PRONOUN_SUFFIX + "$", FLAGS),
        // Elision + rest of the word.
        Pattern.compile("^" + ELISION_PREFIX + "([^'’\\-].*)$", FLAGS),
        // Word + two clitic pronouns.
        Pattern.compile("^([^\\-\\d]+)(-ce|-t-elle|-t-elles|-elle|-elles|-en|-il|-t-il|-ils|-t-ils|-je|-la|-le|-les|-leur|-lui|-moi|-nous|-on|-t-on|-toi|-tu|-vous|-vs|-y)" + PRONOUN_SUFFIX + "$", FLAGS),
        // Word + "-t"/"-m" + elided "en"/"y".
        Pattern.compile("^([^\\-]*)(-t|-m)(['’]en|['’]y)$", FLAGS),
        // Anything + euphonic "-t-" pronoun.
        Pattern.compile("^(.*)(-t-elle|-t-elles|-t-il|-t-ils|-t-on)$", FLAGS),
        // Anything + one clitic pronoun.
        Pattern.compile("^(.*)" + PRONOUN_SUFFIX + "$", FLAGS),
    };

    /**
     * Splits French text into tokens.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return the tokens, after being passed through {@code joinEMailsAndUrls}
     */
    public List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<>();
        Matcher rawTokens = tokenizerPattern.matcher(protectSpecialSequences(str));
        while (rawTokens.find()) {
            String raw = rawTokens.group();
            if (!tokens.isEmpty() && isVariationSelector(raw)) {
                // A variation selector modifies the preceding character: glue it to the previous token.
                int last = tokens.size() - 1;
                tokens.set(last, tokens.get(last) + raw);
                continue;
            }

            String word = restorePlaceholders(raw);

            // Leading hyphens become tokens of their own, emitted before the word.
            while (word.length() > 1 && word.startsWith("-")) {
                tokens.add("-");
                word = word.substring(1);
            }
            // Trailing hyphens are counted now and emitted after the word.
            int trailingHyphens = 0;
            while (word.length() > 1 && word.endsWith("-")) {
                word = word.substring(0, word.length() - 1);
                trailingHyphens++;
            }

            Matcher split = firstMatchingPattern(word);
            if (split == null) {
                tokens.addAll(wordsToAdd(word));
            } else {
                for (int group = 1; group <= split.groupCount(); group++) {
                    tokens.addAll(wordsToAdd(split.group(group)));
                }
            }

            for (; trailingHyphens > 0; trailingHyphens--) {
                tokens.add("-");
            }
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * Replaces sequences that must not be broken by the generic split with word-character
     * placeholders. The order of the replacements is significant and mirrors the order in
     * which the placeholders were originally applied.
     */
    private static String protectSpecialSequences(String text) {
        // Normalize the Unicode hyphen (U+2010) and non-breaking hyphen (U+2011) to '-'.
        String s = text.replace('\u2010', '-').replace('\u2011', '-');
        s = TYPEWRITER_APOSTROPHE.matcher(s).replaceAll("$1xxFR_APOS_TYPEWxx$2");
        s = TYPOGRAPHIC_APOSTROPHE.matcher(s).replaceAll("$1xxFR_APOS_TYPOGxx$2");
        s = NEARBY_HYPHENS.matcher(s).replaceAll("$1xxFR_HYPHENxx$2xxFR_HYPHENxx$3");
        s = HYPHENS.matcher(s).replaceAll("$1xxFR_HYPHENxx$2");
        s = DECIMAL_POINT.matcher(s).replaceAll("$1xxFR_DECIMALPOINTxx$2");
        s = DECIMAL_COMMA.matcher(s).replaceAll("$1xxFR_DECIMALCOMMAxx$2");
        s = SPACE_DIGITS2.matcher(s).replaceAll("$1xxFR_SPACExx$2xxFR_SPACExx$3");
        // Four digits followed by a space are marked temporarily so that SPACE_DIGITS below
        // cannot treat that space as a thousands separator; the marker is undone right after.
        s = SPACE_DIGITS0.matcher(s).replaceAll("$1xxFR_SPACE0xx");
        s = SPACE_DIGITS.matcher(s).replaceAll("$1xxFR_SPACExx$2");
        return SPACE0.matcher(s).replaceAll(" ");
    }

    /** Turns every placeholder inside a token back into the character it stands for. */
    private static String restorePlaceholders(String token) {
        String s = APOS_TYPEW_PLACEHOLDER.matcher(token).replaceAll("'");
        s = APOS_TYPOG_PLACEHOLDER.matcher(s).replaceAll("’");
        s = HYPHEN_PLACEHOLDER.matcher(s).replaceAll("-");
        s = DECIMAL_POINT_PLACEHOLDER.matcher(s).replaceAll(".");
        s = DECIMAL_COMMA_PLACEHOLDER.matcher(s).replaceAll(",");
        return SPACE_PLACEHOLDER.matcher(s).replaceAll(" ");
    }

    /** Tells whether the token is a single variation selector (U+FE00 to U+FE0F). */
    private static boolean isVariationSelector(String token) {
        if (token.length() != 1) {
            return false;
        }
        int codePoint = token.codePointAt(0);
        return codePoint >= 0xFE00 && codePoint <= 0xFE0F;
    }

    /**
     * Returns a matcher positioned on the first of {@link #patterns} found in the word,
     * or {@code null} when none of them applies.
     */
    private static Matcher firstMatchingPattern(String word) {
        for (Pattern pattern : patterns) {
            Matcher candidate = pattern.matcher(word);
            if (candidate.find()) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Decides how one piece of a token is emitted.
     *
     * <p>Pieces without a hyphen are returned as is. Hyphenated pieces are kept whole when
     * the French tagger knows them or when they are listed in {@link #doNotSplit}; otherwise
     * they are cut at every hyphen, the hyphens themselves being returned as tokens.
     *
     * @param str the piece to examine
     * @return zero tokens for an empty piece, otherwise one or more tokens
     */
    private List<String> wordsToAdd(String str) {
        List<String> result = new ArrayList<>();
        // NOTE(review): the lock presumably protects a non-thread-safe tagger lookup; confirm.
        synchronized (this) {
            if (str.isEmpty()) {
                return result;
            }
            if (!str.contains("-")) {
                result.add(str);
                return result;
            }
            // The tagger is queried without soft hyphens and with a typewriter apostrophe.
            String lookupForm = CURLY_QUOTE.matcher(SOFT_HYPHEN.matcher(str).replaceAll("")).replaceAll("'");
            boolean knownWord = FrenchTagger.INSTANCE.tag(Arrays.asList(lookupForm)).get(0).isTagged();
            // Locale.ROOT keeps the lookup independent of the JVM default locale
            // (a Turkish default would lower-case 'I' to a dotless i and miss the entries).
            if (knownWord || doNotSplit.contains(str.toLowerCase(Locale.ROOT))) {
                result.add(str);
            } else {
                StringTokenizer parts = new StringTokenizer(str, "-", true);
                while (parts.hasMoreElements()) {
                    result.add(parts.nextToken());
                }
            }
        }
        return result;
    }
}
