package com.atlassian.adf.util;

import com.atlassian.adf.model.ex.AdfException.InvalidURI;
import com.atlassian.annotations.Internal;
import com.atlassian.annotations.VisibleForTesting;

import java.net.URI;
import java.net.URISyntaxException;
import java.util.Locale;
import java.util.Set;

import static com.atlassian.adf.util.Char.isAlnum;
import static com.atlassian.adf.util.Char.isAlpha;

/**
 * Utility class for working with URLs in a way that helps keep them from breaking either Java
 * or JavaScript code. The class originally derived from frontend code in
 * {@code editor-wikimarkup-transformer/src/parser/utils/url.ts}
 * and {@code adf-schema/src/utils/url.ts}, but has grown into something that is mostly unrelated
 * to them, now.
 */
@Internal
public class UrlUtil {
    /**
     * The length of the longest scheme listed in {@link #URN_VALID_SCHEMES} or
     * {@link #URL_VALID_SCHEMES} (currently {@code "jamfselfservice"}). {@link #isSafeUrl(String)}
     * gives up looking for the terminating {@code ':'} once it has seen more scheme characters than this.
     */
    static final int LONGEST_VALID_SCHEME = 15;

    /**
     * URI schemes that are considered valid without a following "//".
     * For example, "mailto" in this list means that a string starting with "mailto:", regardless
     * of case, will be accepted as valid.
     * Note that for those with encrypted variants, such as "sip" and "sips", both need to be listed.
     */
    // Note: Some of these, like "ssh", really should be requiring the URL format, but the front-end's regex
    // is only requiring a ":", not "://", so they are in this set.
    static final Set<String> URN_VALID_SCHEMES = Set.of(
            "callto",
            "cvs",
            "facetime",
            "feed",
            "git",
            "irc",
            "irc6",
            "itms",
            "magnet",
            "mailto",
            "mvn",
            "news",
            "nntp",
            "notes",
            "rdp",
            "skype",
            "sip",
            "sips",
            "slack",
            "sourcetree",
            "ssh",
            "svn",
            "tel",
            "telnet",
            "urn",
            "vnc",
            "whatsapp",
            "xmpp"
    );

    /**
     * URI schemes that are considered valid only with a following "//".
     * For example, "http" in this list means that a string starting with "http://", regardless
     * of the case of the scheme, will be accepted as valid, but one starting with just "http:" will not.
     * Note that for those with encrypted variants, such as "http" and "https", both need to be listed.
     */
    static final Set<String> URL_VALID_SCHEMES = Set.of(
            "dynamicsnav",
            "jamfselfservice",
            "file",
            "ftp",
            "ftps",
            "gopher",
            "hipchat",
            "http",
            "https",
            "integrity",
            "scp",
            "sftp",
            "smb"
    );

    private UrlUtil() {
        // static-only
    }

    /**
     * Sanitizes a URL for maximum compatibility between Java and Javascript.
     * <p>
     * In particular, the URI class in Java is stricter about which characters are permitted in a
     * URL than the JavaScript URL class is. For example, it will not permit a raw space or caret (^)
     * character in the URL, even in segments like the query where Javascript's {@code new URL(url).href}
     * would not bother to escape them. This performs those escapes that are required for correct parsing
     * while leaving everything else as-is.
     *
     * @param url the original URL
     * @return the cleaned URL, which is hopefully suitable for parsing
     * @see <a href="https://datatracker.ietf.org/doc/html/rfc3986#section-2">RFC 3986, Section 2</a>
     */
    public static String escapeSpecialChars(String url) {
        StringBuilder sb = new StringBuilder(url.length());

        // 'mark' is the start of the pending run of legal chars that has not been copied to 'sb' yet.
        int mark = 0;
        for (int i = 0; i < url.length(); ++i) {
            char c = url.charAt(i);
            if (isLegalChar(c)) continue;

            // Copy everything we've done so far and update our mark
            if (mark < i) sb.append(url, mark, i);
            mark = i + 1;
            appendEscaped(sb, c);
        }

        // Flush whatever legal chars trail the last escaped one (or the whole string if nothing was escaped)
        if (mark < url.length()) sb.append(url, mark, url.length());
        return sb.toString();
    }

    /**
     * Appends the percent-escaped UTF-8 encoding of a single {@code char} (UTF-16 code unit).
     * Each byte of the encoding is written as {@code '%'} followed by two hex digits.
     *
     * @param sb the buffer to append to
     * @param c the character to escape
     */
    @VisibleForTesting
    static void appendEscaped(StringBuilder sb, char c) {
        if (c < 0x80) {
            // 0xxxxxxx
            appendEscapedByte(sb, c);
        } else if (c < 0x800) {
            // 110xxxxx 10xxxxxx
            appendEscapedByte(sb, 0xC0 | (c >> 6));            // Lead byte of 2-byte UTF-8 seq
            appendEscapedByte(sb, 0x80 | (c & 0x3F));          // Ext byte
        } else {
            // 1110xxxx 10xxxxxx 10xxxxxx
            appendEscapedByte(sb, 0xE0 | (c >> 12));           // Lead byte of 3-byte UTF-8 seq
            appendEscapedByte(sb, 0x80 | ((c >> 6) & 0x3F));   // Ext byte
            appendEscapedByte(sb, 0x80 | (c & 0x3F));          // Ext byte
        }
        // No need to deal with 4-byte sequences, as that would require decoding surrogate pairs.
        // The URI class doesn't bother to do that, so we won't either.
    }

    /**
     * Appends {@code '%'} followed by the high and low nibbles of {@code byteVal} as hex digits.
     *
     * @param sb the buffer to append to
     * @param byteVal the byte value to write; callers only pass values in the range {@code 0x00..0xFF}
     */
    private static void appendEscapedByte(StringBuilder sb, int byteVal) {
        sb.append('%')
                .append(Char.hex(byteVal >> 4))
                .append(Char.hex(byteVal & 0x0F));
    }

    /**
     * Tests whether a URL is acceptable based on how it begins.
     * <p>
     * After trimming, the URL is accepted if it is any of the following:
     * <ul>
     *     <li>empty;</li>
     *     <li>something that starts with {@code '/'} or {@code '#'};</li>
     *     <li>something that starts with one of the {@link #URN_VALID_SCHEMES} followed by {@code ':'}; or</li>
     *     <li>something that starts with one of the {@link #URL_VALID_SCHEMES} followed by {@code "://"}.</li>
     * </ul>
     * Schemes are matched without regard to case. Anything else is rejected.
     *
     * @param url the URL to check; must not be {@code null}
     * @return {@code true} if the URL is accepted by the rules above; {@code false} otherwise
     */
    public static boolean isSafeUrl(String url) {
        String urlTrimmed = url.trim();
        if (urlTrimmed.isEmpty()) return true;

        char c = urlTrimmed.charAt(0);
        if (c == '/' || c == '#') return true;
        if (!isAlpha(c)) return false;

        // The ':' that terminates a scheme of the maximum length sits at index LONGEST_VALID_SCHEME,
        // so the scan must be allowed to examine that index; hence the "+ 1". Without it, the longest
        // whitelisted scheme could never be matched. Clamping to the string length keeps charAt in bounds.
        int stop = Integer.min(urlTrimmed.length(), LONGEST_VALID_SCHEME + 1);
        int i = 1;

        // Scan for the ':' that ends the scheme, bailing out if we hit a char that can't be part of one
        // or if we run past the longest scheme that we would accept anyway.
        while (true) {
            if (i >= stop) return false;
            c = urlTrimmed.charAt(i);
            if (c == ':') break;
            if (!isAlnum(c)) return false;
            ++i;
        }

        String scheme = urlTrimmed.substring(0, i).toLowerCase(Locale.ROOT);
        if (URN_VALID_SCHEMES.contains(scheme)) return true;

        // Everything else is only valid when it is both whitelisted and followed by "//"
        if (!URL_VALID_SCHEMES.contains(scheme)) return false;
        if (i + 2 >= urlTrimmed.length()) return false;
        return urlTrimmed.charAt(i + 1) == '/'
                && urlTrimmed.charAt(i + 2) == '/';
    }

    /**
     * Verifies that a URL can be parsed by {@link URI} and is accepted by {@link #isSafeUrl(String)}.
     * <p>
     * If the trimmed URL cannot be parsed as given, then a second attempt is made using the result
     * of {@link #escapeSpecialChars(String)}, provided that the escaping actually changed something.
     * Note that the escaped form is only used for the validation itself; the value that is returned
     * is always the trimmed original.
     *
     * @param url the URL to validate; must not be {@code null}
     * @param propertyName the name of the property that the URL belongs to, which is passed along to
     *                     the {@code InvalidURI} exception if the URL is rejected
     * @return the trimmed URL, or an empty string if the URL was empty after trimming
     * @throws InvalidURI if the URL cannot be parsed, even after escaping, or if it is not accepted
     *                    by {@link #isSafeUrl(String)}
     */
    public static String validateUrl(String url, String propertyName) {
        String urlTrimmed = url.trim();
        if (urlTrimmed.isEmpty()) return "";

        try {
            String parsed = new URI(urlTrimmed).toString();
            if (!isSafeUrl(parsed)) throw new InvalidURI(propertyName, urlTrimmed);
            return urlTrimmed;
        } catch (URISyntaxException e) {
            // Retry with the troublesome chars escaped, but only if that gives us something different to try
            String escaped = escapeSpecialChars(urlTrimmed);
            if (!escaped.equals(urlTrimmed)) {
                try {
                    String parsed = new URI(escaped).toString();
                    if (!isSafeUrl(parsed)) throw new InvalidURI(propertyName, urlTrimmed);
                    return urlTrimmed;
                } catch (URISyntaxException suppressed) {
                    // Report the original failure as the cause, but keep the second one around for diagnosis
                    e.addSuppressed(suppressed);
                }
            }
            throw new InvalidURI(propertyName, urlTrimmed, e);
        }
    }

    // These character classes are as specified by RFC 3986, except that we do not force escaping of non-ASCII chars.
    // See RFC 3986, Section 2.2: https://www.rfc-editor.org/rfc/rfc3986#section-2.2
    private static boolean isLegalChar(char c) {
        switch (c) {
            // Assume existing %-sequences are correct, should be preserved as-is, and that any literal
            // percent chars are already correctly escaped. That is, we are not going to touch these.
            case '%':

                // gen-delims - escaping them changes the meaning of the URL, so don't mess with them
            case ':':
            case '/':
            case '?':
            case '#':
            case '[':
            case ']':
            case '@':

                // sub-delims - escaping these could change the meaning of a subcomponent, such as the path or
                // query, when interpreted by whatever understands them, so don't mess with these, either
            case '!':
            case '$':
            case '&':
            case '\'':
            case '(':
            case ')':
            case '*':
            case '+':
            case ',':
            case ';':
            case '=':

                // unreserved - escaping of these chars is unnecessary and does not change the meaning of the URL
                // this includes alphanumeric chars, which we check below.
            case '-':
            case '.':
            case '_':
            case '~':

                // Any char we listed is permitted in some part(s) of the URL, so we aren't going to quote any
                // of them.
                return true;

            default:
                // Alphanumerics are also "unreserved" chars, so keep them, too
                if (isAlnum(c)) return true;

                // Anything else in the ASCII set is disallowed
                if (c < 128) return false;

                // Anything else in Unicode is not strictly allowed by the standard, but the URI class
                // will tolerate it if it is printable. Space and control chars have to be escaped.
                // The URI class does not know anything about surrogate pairs, so we don't need to
                // check for them, decode them, and do checks against the codepoint instead; this is
                // good enough.
                return !(Character.isSpaceChar(c) || Character.isISOControl(c));
        }
    }
}
