package org.apache.tika.parser.html;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.CharsetUtils;

/* loaded from: input_file:WEB-INF/lib/tika-parsers-1.27.jar:org/apache/tika/parser/html/HtmlEncodingDetector.class */
/**
 * Character encoding detector that looks for a charset declaration inside the
 * {@code <meta>} tags found in the first {@link #getMarkLimit() markLimit}
 * bytes of an HTML document.
 *
 * <p>The head of the stream is decoded as US-ASCII and scanned with regular
 * expressions. Declarations inside HTML comments are only considered when no
 * declaration is found outside of comments.
 */
public class HtmlEncodingDetector implements EncodingDetector {
    /** Default number of bytes read from the start of the stream. */
    private static final int DEFAULT_MARK_LIMIT = 8192;

    /** Class-path resource (relative to this class) listing charset names to ignore. */
    private static final String UNSUPPORTED_CHARSETS_RESOURCE = "StandardCharsets_unsupported_by_IANA.txt";

    /** Lower-cased charset names that must be skipped when found in a meta tag. */
    private static final Set<String> CHARSETS_UNSUPPORTED_BY_IANA = loadUnsupportedCharsets();

    /** Matches a {@code <meta ...} tag; group 1 is the attribute text of the tag. */
    private static final Pattern HTTP_META_PATTERN = Pattern.compile("(?is)<\\s*meta(?:/|\\s+)([^<>]+)");

    /** Matches {@code charset=<name>} inside the attribute text; group 1 is the name. */
    private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)");

    /**
     * Matches an HTML comment, closed or running to the end of the input.
     * DOTALL is required so that comments spanning several lines are matched too.
     */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("(?s)<!--.*?(-->|$)");

    private static final Charset ASCII = StandardCharsets.US_ASCII;

    @Field
    private int markLimit = DEFAULT_MARK_LIMIT;

    /**
     * Reads up to {@code markLimit} bytes from the stream, resets it, and looks
     * for a charset declared in a {@code <meta>} tag.
     *
     * @param inputStream stream to inspect; it is marked before reading and reset
     *                    afterwards. May be {@code null}.
     * @param metadata    not used by this detector
     * @return the declared charset, or {@code null} if the stream is {@code null}
     *         or no usable declaration was found
     * @throws IOException if reading from or resetting the stream fails
     */
    @Override
    public Charset detect(InputStream inputStream, Metadata metadata) throws IOException {
        if (inputStream == null) {
            return null;
        }
        // Read the limit once so the mark and the buffer always agree on the size.
        final int limit = this.markLimit;
        inputStream.mark(limit);
        byte[] buffer = new byte[limit];
        int total = 0;
        int read = inputStream.read(buffer);
        // A single read() may return fewer bytes than requested; keep reading
        // until the buffer is full or the stream ends.
        while (read != -1 && total < buffer.length) {
            total += read;
            read = inputStream.read(buffer, total, buffer.length - total);
        }
        inputStream.reset();

        String head = ASCII.decode(ByteBuffer.wrap(buffer, 0, total)).toString();
        // First pass ignores anything inside comments, so a commented-out meta
        // tag does not shadow the real declaration.
        String headWithoutComments = COMMENT_PATTERN.matcher(head).replaceAll(StringUtils.SPACE);
        Charset charset = findCharset(headWithoutComments);
        // Second pass on the raw text: better a declaration from a comment than none.
        return charset == null ? findCharset(head) : charset;
    }

    /**
     * Returns the first supported charset declared in a {@code <meta>} tag of
     * the given text.
     *
     * @param str text to scan
     * @return the first declared charset that is not in the unsupported list and
     *         that {@link CharsetUtils} can resolve, or {@code null} if there is none
     */
    private Charset findCharset(String str) {
        Matcher metaMatcher = HTTP_META_PATTERN.matcher(str);
        Matcher charsetMatcher = FLEXIBLE_CHARSET_ATTR_PATTERN.matcher("");
        while (metaMatcher.find()) {
            charsetMatcher.reset(metaMatcher.group(1));
            while (charsetMatcher.find()) {
                String name = charsetMatcher.group(1);
                if (CHARSETS_UNSUPPORTED_BY_IANA.contains(name.toLowerCase(Locale.US))) {
                    continue;
                }
                // "x-user-defined" is treated as windows-1252.
                if ("x-user-defined".equalsIgnoreCase(name)) {
                    name = "windows-1252";
                }
                if (CharsetUtils.isSupported(name)) {
                    try {
                        return CharsetUtils.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // The name could not be resolved after all; try the next candidate.
                    }
                }
            }
        }
        return null;
    }

    /**
     * Sets the number of bytes read from the start of the stream.
     *
     * @param i new mark limit in bytes
     */
    @Field
    public void setMarkLimit(int i) {
        this.markLimit = i;
    }

    /**
     * @return the number of bytes read from the start of the stream
     */
    public int getMarkLimit() {
        return this.markLimit;
    }

    /**
     * Loads the list of charset names to ignore from the class-path resource.
     * Lines starting with {@code #} and blank lines are skipped; every other
     * line is trimmed and lower-cased.
     *
     * @return an unmodifiable set of lower-cased charset names
     * @throws IllegalArgumentException if the resource is missing or cannot be read
     */
    private static Set<String> loadUnsupportedCharsets() {
        InputStream resource = HtmlEncodingDetector.class.getResourceAsStream(UNSUPPORTED_CHARSETS_RESOURCE);
        if (resource == null) {
            // getResourceAsStream signals a missing resource with null, not an exception.
            throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path");
        }
        Set<String> names = new HashSet<>();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(resource, StandardCharsets.UTF_8))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                if (line.startsWith("#")) {
                    continue;
                }
                String trimmed = line.trim();
                if (trimmed.length() > 0) {
                    names.add(trimmed.toLowerCase(Locale.US));
                }
            }
        } catch (IOException e) {
            throw new IllegalArgumentException("couldn't find StandardCharsets_unsupported_by_IANA.txt on the class path", e);
        }
        return Collections.unmodifiableSet(names);
    }
}
