package org.languagetool.tokenizers.en;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.en.EnglishTagger;
import org.languagetool.tokenizers.WordTokenizer;

/* loaded from: input_file:org/languagetool/tokenizers/en/EnglishWordTokenizer.class */
/**
 * Word tokenizer for English text.
 *
 * <p>The input is first split on the tokenizing characters inherited from
 * {@link WordTokenizer} plus the underscore. Raw tokens containing an apostrophe are
 * then matched against a list of contraction patterns and, on a match, broken into the
 * pattern's capture groups (e.g. a stem and a clitic such as {@code n't} or {@code 's}).
 * Every resulting piece goes through {@link #wordsToAdd(String)}, which detaches
 * leading/trailing hyphens and decides whether a word containing a hyphen or an
 * apostrophe is kept whole or split at its apostrophes.
 */
public class EnglishWordTokenizer extends WordTokenizer {
    /** Regex flags shared by all contraction patterns (numerically 66). */
    private static final int CONTRACTION_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Placeholder substituted for the typewriter apostrophe (') before the raw split. */
    private static final String TYPEWRITER_PLACEHOLDER = "\u0001\u0001APOSTYPEW\u0001\u0001";
    /** Placeholder substituted for the typographic apostrophe (’) before the raw split. */
    private static final String TYPOGRAPHIC_PLACEHOLDER = "\u0001\u0001APOSTYPOG\u0001\u0001";

    private static final Pattern SINGLE_QUOTE = Pattern.compile("'");
    private static final Pattern CURLY_QUOTE = Pattern.compile("’");
    private static final Pattern APOSTYPEW = Pattern.compile(TYPEWRITER_PLACEHOLDER);
    private static final Pattern APOSTYPOG = Pattern.compile(TYPOGRAPHIC_PLACEHOLDER);
    private static final Pattern SOFT_HYPHEN = Pattern.compile("\u00ad");

    /**
     * Contraction patterns, tried in this order; the first one that matches wins and
     * its capture groups become the pieces of the token.
     */
    private static final List<Pattern> CONTRACTION_PATTERNS = Arrays.asList(
            // Fixed forms kept as a single piece.
            Pattern.compile("^(fo['’]c['’]sle|rec['’][ds]|OK['’]d|cc['’][ds]|DJ['’][d]|[pd]m['’]d|rsvp['’]d)$", CONTRACTION_FLAGS),
            // Optional leading apostrophe + auxiliary stem + n't.
            Pattern.compile("^(['’]?)(are|is|were|was|do|does|did|have|has|had|wo|would|ca|could|sha|should|must|ai|ought|might|need|may|am|dare|das|dass|hai|used|use)(n['’]t)$", CONTRACTION_FLAGS),
            // Anything + clitic ('m, 're, 'll, 've, 'd, 's) + optional trailing apostrophe or hyphen.
            Pattern.compile("^(.+)(['’]m|['’]re|['’]ll|['’]ve|['’]d|['’]s)(['’-]?)$", CONTRACTION_FLAGS),
            // 't + was/were/is.
            Pattern.compile("^(['’]t)(was|were|is)$", CONTRACTION_FLAGS));

    /**
     * Words (compared case-insensitively) that are kept whole even when the tagger does
     * not know them.
     * NOTE(review): none of these contains an apostrophe, and the fallback split only
     * breaks at apostrophes, so with the current delimiters they would come out as a
     * single token either way.
     */
    private static final List<String> UNSPLIT_WORDS = Arrays.asList(
            "mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre",
            "ph-metres", "anti-ivg", "anti-uv", "anti-vih", "al-qaida");

    /** Delimiters for the raw split: the inherited tokenizing characters plus the underscore. */
    private final String enTokenizingChars = super.getTokenizingCharacters() + "_";

    /**
     * Splits the given text into tokens; delimiter characters are returned as tokens too.
     *
     * @param str the text to tokenize; must not be {@code null}
     * @return the tokens, after being passed through {@code joinEMailsAndUrls}
     */
    public List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<>();

        // Swap both apostrophe variants for placeholders before the raw split and
        // restore them per token afterwards (presumably because the inherited delimiter
        // set contains apostrophes -- confirm against WordTokenizer).
        String shielded = SINGLE_QUOTE.matcher(str).replaceAll(TYPEWRITER_PLACEHOLDER);
        shielded = CURLY_QUOTE.matcher(shielded).replaceAll(TYPOGRAPHIC_PLACEHOLDER);

        StringTokenizer rawTokens = new StringTokenizer(shielded, this.enTokenizingChars, true);
        while (rawTokens.hasMoreElements()) {
            String token = APOSTYPEW.matcher(rawTokens.nextToken()).replaceAll("'");
            token = APOSTYPOG.matcher(token).replaceAll("’");

            Matcher contraction = findContraction(token);
            if (contraction == null) {
                tokens.addAll(wordsToAdd(token));
            } else {
                // Each capture group of the matching pattern is one piece of the token.
                for (int group = 1; group <= contraction.groupCount(); group++) {
                    tokens.addAll(wordsToAdd(contraction.group(group)));
                }
            }
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * Returns the matcher of the first contraction pattern found in the token, or
     * {@code null} when the token has no apostrophe or no pattern matches.
     */
    private static Matcher findContraction(String token) {
        if (!token.contains("'") && !token.contains("’")) {
            return null;
        }
        for (Pattern pattern : CONTRACTION_PATTERNS) {
            Matcher candidate = pattern.matcher(token);
            if (candidate.find()) {
                return candidate;
            }
        }
        return null;
    }

    /** Tells whether the word equals, ignoring case, one of the {@link #UNSPLIT_WORDS}. */
    private static boolean isUnsplitWord(String word) {
        for (String known : UNSPLIT_WORDS) {
            if (word.equalsIgnoreCase(known)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Turns one piece of a token into the final list of tokens.
     *
     * <p>Leading and trailing hyphens each become a separate {@code "-"} token. What
     * remains is kept whole if it contains no hyphen or apostrophe, if the tagger tags
     * it, or if it is one of the listed exceptions; otherwise it is split at its
     * apostrophes, which are kept as tokens.
     *
     * @param str the piece to process; may be empty
     * @return the resulting tokens, possibly empty
     */
    private List<String> wordsToAdd(String str) {
        List<String> words = new ArrayList<>();
        int trailingHyphens = 0;
        // NOTE(review): the lock presumably protects the shared tagger instance, which
        // may not be thread-safe -- confirm before narrowing or removing it.
        synchronized (this) {
            if (!str.isEmpty()) {
                while (str.startsWith("-")) {
                    words.add("-");
                    str = str.substring(1);
                }
                // Trailing hyphens are only counted here and emitted after the word.
                while (str.endsWith("-")) {
                    str = str.substring(0, str.length() - 1);
                    trailingHyphens++;
                }
                if (!str.isEmpty()) {
                    boolean needsLookup = str.contains("-") || str.contains("'") || str.contains("’");
                    if (!needsLookup) {
                        words.add(str);
                    } else {
                        // The tagger is queried with soft hyphens removed and the
                        // typographic apostrophe replaced by the typewriter one.
                        String normalized = CURLY_QUOTE.matcher(SOFT_HYPHEN.matcher(str).replaceAll("")).replaceAll("'");
                        if (EnglishTagger.INSTANCE.tag(Arrays.asList(normalized)).get(0).isTagged()) {
                            words.add(str);
                        } else if (isUnsplitWord(str)) {
                            words.add(str);
                        } else {
                            // Unknown word: split at apostrophes only, keeping them as tokens.
                            StringTokenizer parts = new StringTokenizer(str, "’'", true);
                            while (parts.hasMoreElements()) {
                                words.add(parts.nextToken());
                            }
                        }
                    }
                }
            }
            while (trailingHyphens > 0) {
                words.add("-");
                trailingHyphens--;
            }
        }
        return words;
    }
}
