package org.languagetool.tokenizers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.tools.StringTools;

/* loaded from: input_file:org/languagetool/tokenizers/WordTokenizer.class */
/**
 * Splits text into word and separator tokens and then re-joins the pieces of
 * e-mail addresses and URLs so that each of them ends up as a single token.
 *
 * <p>Separator characters are returned as tokens of their own, so concatenating
 * the tokens of {@link #tokenize(String)} yields the original text again.
 */
public class WordTokenizer implements Tokenizer {

    /** Protocol names that, followed by {@code "://"}, start a URL. */
    private static final List<String> PROTOCOLS = Collections.unmodifiableList(Arrays.asList("http", "https", "ftp"));

    /**
     * Tokens made only of these characters may be part of a URL. Note that
     * {@code $-_} inside the class is a character <em>range</em> (U+0024 to U+005F),
     * so characters such as {@code &}, {@code :}, {@code =} and {@code @} are accepted
     * as well. {@link #urlEndsAt} depends on {@code ":"} matching here, otherwise a
     * URL would already end right after its protocol name.
     */
    private static final Pattern URL_CHARS = Pattern.compile("[a-zA-ZÄÖÜäöü0-9/%$-_.+!*'(),?#~]+");

    /** A single label of a host name as used by the protocol-less URL detection. */
    private static final Pattern DOMAIN_CHARS = Pattern.compile("[a-zA-Z0-9][a-zA-Z0-9-]+");

    /** A URL without protocol: two or three dot-separated labels followed by a slash. */
    private static final Pattern NO_PROTOCOL_URL = Pattern.compile("([a-zA-Z0-9][a-zA-Z0-9-]+\\.)?([a-zA-Z0-9][a-zA-Z0-9-]+)\\.([a-zA-Z0-9][a-zA-Z0-9-]+)/.*");

    /**
     * An e-mail address, optionally preceded by {@code @} (but not by {@code :@}),
     * whose host is either a bracketed IPv4 literal or a dotted domain name.
     */
    private static final Pattern E_MAIL = Pattern.compile("(?<!:)@?\\b[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\])|(([a-zA-Z\\-0-9]+\\.)+[a-zA-Z]{2,}))\\b");

    /**
     * All characters at which text is split. Invisible characters are written as
     * Unicode escapes so that they stay recognizable in the source file.
     */
    private static final String TOKENIZING_CHARACTERS =
            // Spaces, fillers, format controls and other invisible separators.
            "\u0020\u00a0\u115f\u1160\u1680"
            + "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f"
            + "\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f"
            + "\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f"
            + "\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb"
            // Punctuation, brackets, operators, quotation marks and dashes.
            + "¦‖∣|,.;()[]{}=*#∗+×·÷<>!?:~/\\\"'«»„”“‘’`´‛′›‹…¿¡‼⁇⁈⁉™®‽‒–—―─ㅡ"
            // Bullets, check marks, arrows and similar symbols.
            + "✓●○◆➢■□★❏➔↵❖▪❑•⮚≥→⇾⇉⇒⇨⇛"
            // Superscript characters.
            + "¹²³⁰ⁱ⁴⁵⁶⁷⁸⁹"
            // ASCII control whitespace.
            + "\t\n\r";

    /**
     * Returns the protocol names recognized at the start of a URL.
     *
     * @return an unmodifiable list of protocol names (without {@code "://"})
     */
    public static List<String> getProtocols() {
        return PROTOCOLS;
    }

    /**
     * Tells whether the given token looks like a URL.
     *
     * @param str the token to check
     * @return {@code true} if the token starts with {@code "www."}, starts with a known
     *         protocol followed by {@code "://"}, or is a protocol-less
     *         {@code domain.tld/...} URL
     */
    public static boolean isUrl(String str) {
        if (str.startsWith("www.")) {
            return true;
        }
        for (String protocol : PROTOCOLS) {
            if (str.startsWith(protocol + "://")) {
                return true;
            }
        }
        return NO_PROTOCOL_URL.matcher(str).matches();
    }

    /**
     * Tells whether the given token is, as a whole, an e-mail address.
     *
     * @param str the token to check
     * @return {@code true} if the complete token matches the e-mail pattern
     */
    public static boolean isEMail(String str) {
        return E_MAIL.matcher(str).matches();
    }

    /**
     * Splits the text at the characters of {@link #getTokenizingCharacters()},
     * keeping the separators as tokens, and joins e-mail addresses and URLs.
     *
     * @param str the text to tokenize
     * @return the tokens in text order
     */
    @Override // org.languagetool.tokenizers.Tokenizer
    public List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<>();
        // "true": delimiters are returned as tokens too, so no character of the text is lost.
        StringTokenizer splitter = new StringTokenizer(str, getTokenizingCharacters(), true);
        while (splitter.hasMoreTokens()) {
            tokens.add(splitter.nextToken());
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * Returns the characters at which text is split into tokens.
     *
     * @return the separator characters, each of them being a token boundary
     */
    public String getTokenizingCharacters() {
        return TOKENIZING_CHARACTERS;
    }

    /**
     * Joins tokens that belong to one e-mail address and then tokens that belong to one URL.
     *
     * @param list the tokens in text order
     * @return the tokens with addresses and URLs merged into single tokens
     */
    protected List<String> joinEMailsAndUrls(List<String> list) {
        return joinUrls(joinEMails(list));
    }

    /**
     * Merges the tokens that make up an e-mail address into one token per address.
     *
     * <p>The addresses are searched in the concatenation of all tokens. Tokens lying
     * completely inside an address are replaced by the address; a token that only
     * partly overlaps an address is split and the part outside the address is kept as
     * a token of its own. The concatenation of the returned tokens therefore always
     * equals the concatenation of the input tokens.
     *
     * @param list the tokens in text order
     * @return {@code list} itself if it contains no e-mail address, otherwise a new list
     */
    protected List<String> joinEMails(List<String> list) {
        StringBuilder joined = new StringBuilder();
        for (String token : list) {
            joined.append(token);
        }
        String text = joined.toString();
        // Cheap pre-check before running the regular expression.
        if (!text.contains("@")) {
            return list;
        }
        Matcher matcher = E_MAIL.matcher(text);
        boolean hasMatch = matcher.find();
        if (!hasMatch) {
            return list;
        }

        List<String> result = new ArrayList<>();
        int tokenIndex = 0; // token currently looked at
        int tokenStart = 0; // offset in text at which that token begins
        int emitted = 0;    // everything in text before this offset is already covered by an address
        while (true) {
            // Copy the text between the previous address (or the start) and the next address (or the end).
            int gapEnd = hasMatch ? matcher.start() : text.length();
            while (tokenIndex < list.size()) {
                String token = list.get(tokenIndex);
                int tokenEnd = tokenStart + token.length();
                int from = Math.max(emitted, tokenStart);
                int to = Math.min(gapEnd, tokenEnd);
                if (from == tokenStart && to == tokenEnd) {
                    // Token lies completely outside of any address: keep it unchanged.
                    result.add(token);
                } else if (from < to) {
                    // Token straddles an address boundary: keep only the part outside the address.
                    result.add(text.substring(from, to));
                }
                if (tokenEnd > gapEnd) {
                    // The token reaches into the next address; it is finished in the next round.
                    break;
                }
                tokenStart = tokenEnd;
                tokenIndex++;
            }
            if (!hasMatch) {
                break;
            }
            result.add(matcher.group());
            emitted = matcher.end();
            hasMatch = matcher.find();
        }
        return result;
    }

    /**
     * Merges the tokens that make up a URL into one token per URL.
     *
     * @param list the tokens in text order
     * @return a new list in which every detected URL is a single token
     */
    protected List<String> joinUrls(List<String> list) {
        List<String> result = new ArrayList<>();
        boolean insideUrl = false;
        StringBuilder url = new StringBuilder();
        // Token directly in front of the URL; seeing it again can terminate the URL (see urlEndsAt).
        String tokenBeforeUrl = null;
        for (int i = 0; i < list.size(); i++) {
            String token = list.get(i);
            if (!insideUrl && urlStartsAt(i, list)) {
                insideUrl = true;
                if (i > 0) {
                    tokenBeforeUrl = list.get(i - 1);
                }
                url.append(token);
            } else if (insideUrl && urlEndsAt(i, list, tokenBeforeUrl)) {
                insideUrl = false;
                tokenBeforeUrl = null;
                result.add(url.toString());
                url.setLength(0);
                // The terminating token itself is not part of the URL.
                result.add(token);
            } else if (insideUrl) {
                url.append(token);
            } else {
                result.add(token);
            }
        }
        // A URL that runs up to the end of the text has not been flushed yet.
        if (url.length() > 0) {
            result.add(url.toString());
        }
        return result;
    }

    /**
     * Tells whether a URL starts at the given token position.
     *
     * @param i    index of the token to check
     * @param list all tokens
     * @return {@code true} if the tokens from {@code i} on read {@code protocol://},
     *         {@code www.}, {@code domain.tld/} or {@code sub.domain.tld/}
     */
    private boolean urlStartsAt(int i, List<String> list) {
        String token = list.get(i);
        int size = list.size();
        // protocol : / /
        if (isProtocol(token) && size > i + 3
                && list.get(i + 1).equals(":") && list.get(i + 2).equals("/") && list.get(i + 3).equals("/")) {
            return true;
        }
        // www .
        if (size > i + 1 && token.equals("www") && list.get(i + 1).equals(".")) {
            return true;
        }
        // domain . tld /
        if (size > i + 3 && list.get(i + 1).equals(".") && list.get(i + 3).equals("/")
                && isDomainLabel(token) && isDomainLabel(list.get(i + 2))) {
            return true;
        }
        // sub . domain . tld /
        return size > i + 5 && list.get(i + 1).equals(".") && list.get(i + 3).equals(".") && list.get(i + 5).equals("/")
                && isDomainLabel(token) && isDomainLabel(list.get(i + 2)) && isDomainLabel(list.get(i + 4));
    }

    /** Tells whether the token consists only of characters allowed in a host name label. */
    private static boolean isDomainLabel(String token) {
        return DOMAIN_CHARS.matcher(token).matches();
    }

    /** Tells whether the token is one of the known protocol names. */
    private boolean isProtocol(String str) {
        return PROTOCOLS.contains(str);
    }

    /**
     * Tells whether the URL currently being collected ends in front of the given token.
     *
     * @param i    index of the token to check
     * @param list all tokens
     * @param str  the token directly in front of the URL, or {@code null} if the URL
     *             started at the first token; a URL may end where this token shows up
     *             again (presumably a closing delimiter that mirrors the opening one)
     * @return {@code true} if the token at {@code i} is not part of the URL anymore
     */
    private boolean urlEndsAt(int i, List<String> list, String str) {
        String token = list.get(i);
        if (StringTools.isWhitespace(token) || token.equals(")") || token.equals("]")) {
            return true;
        }
        boolean notUrlChars = !URL_CHARS.matcher(token).matches();
        if (list.size() <= i + 1) {
            // Last token of the text: a trailing dot or the repeated leading token is not part of the URL.
            return notUrlChars || token.equals(".") || token.equals(str);
        }
        String next = list.get(i + 1);
        // Punctuation (or the repeated leading token) ends the URL only if whitespace,
        // a quotation mark or a dot follows; inside a URL the same characters are kept.
        boolean nextClosesUrl = StringTools.isWhitespace(next)
                || StringUtils.equalsAny(next, "\"", "»", "«", "‘", "’", "“", "”", "'", ".");
        boolean tokenMayClose = StringUtils.equalsAny(token, ".", ",", ";", ":", "!", "?") || token.equals(str);
        return (nextClosesUrl && tokenMayClose) || notUrlChars;
    }
}
