package com.norconex.collector.core.crawler;

import com.norconex.collector.core.Collector;
import com.norconex.collector.core.CollectorException;
import com.norconex.collector.core.crawler.CrawlerConfig;
import com.norconex.collector.core.crawler.CrawlerEvent;
import com.norconex.collector.core.doc.CrawlDoc;
import com.norconex.collector.core.doc.CrawlDocInfo;
import com.norconex.collector.core.doc.CrawlDocInfoService;
import com.norconex.collector.core.doc.CrawlState;
import com.norconex.collector.core.monitor.CrawlerMonitor;
import com.norconex.collector.core.monitor.CrawlerMonitorJMX;
import com.norconex.collector.core.pipeline.importer.ImporterPipelineContext;
import com.norconex.collector.core.spoil.SpoiledReferenceStrategy;
import com.norconex.collector.core.spoil.impl.GenericSpoiledReferenceStrategizer;
import com.norconex.collector.core.store.DataStoreExporter;
import com.norconex.collector.core.store.DataStoreImporter;
import com.norconex.collector.core.store.IDataStoreEngine;
import com.norconex.committer.core3.CommitterContext;
import com.norconex.commons.lang.Sleeper;
import com.norconex.commons.lang.bean.BeanUtil;
import com.norconex.commons.lang.event.EventManager;
import com.norconex.commons.lang.file.FileUtil;
import com.norconex.commons.lang.io.CachedInputStream;
import com.norconex.commons.lang.io.CachedStreamFactory;
import com.norconex.importer.Importer;
import com.norconex.importer.doc.Doc;
import com.norconex.importer.response.ImporterResponse;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.attribute.FileAttribute;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.input.NullInputStream;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.mutable.MutableLong;
import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/norconex/collector/core/crawler/Crawler.class */
public abstract class Crawler {
    private static final Logger LOG = LoggerFactory.getLogger(Crawler.class);
    private static final int MINIMUM_DELAY = 1;
    private final CrawlerConfig config;
    private final Collector collector;
    private Importer importer;
    private final CrawlerCommitterService committers;
    private Path workDir;
    private Path tempDir;
    private Path downloadDir;
    private boolean stopped;
    private CrawlerMonitor monitor;
    private CrawlProgressLogger progressLogger;
    private IDataStoreEngine dataStoreEngine;
    private CrawlDocInfoService crawlDocInfoService;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/norconex/collector/core/crawler/Crawler$ProcessFlags.class */
    /**
     * Pair of switches describing how a batch of queued references is to be
     * processed. Both switches start off and are turned on through the
     * chainable {@link #delete()} and {@link #orphan()} methods.
     */
    public final class ProcessFlags {
        private boolean orphan;
        private boolean delete;

        private ProcessFlags() {
        }

        /**
         * Turns the "orphan" switch on.
         * @return this instance, to allow chaining
         */
        public ProcessFlags orphan() {
            orphan = true;
            return this;
        }

        /**
         * Turns the "delete" switch on.
         * @return this instance, to allow chaining
         */
        public ProcessFlags delete() {
            delete = true;
            return this;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/norconex/collector/core/crawler/Crawler$ProcessReferencesRunnable.class */
    /**
     * Worker run by each crawler thread. It repeatedly asks the enclosing
     * crawler to process the next queued reference until the crawler is
     * stopped, the maximum number of documents is reached, or the queue is
     * empty and fully initialized. The supplied latch is always counted down
     * when the worker ends, whether normally or not.
     */
    public final class ProcessReferencesRunnable implements Runnable {
        private final ProcessFlags flags;
        private final CountDownLatch latch;
        private final int threadIndex;

        private ProcessReferencesRunnable(CountDownLatch countDownLatch, ProcessFlags processFlags, int i) {
            this.latch = countDownLatch;
            this.flags = processFlags;
            this.threadIndex = i;
        }

        @Override // java.lang.Runnable
        public void run() {
            Thread.currentThread().setName(Crawler.this.getId() + "#" + this.threadIndex);
            Crawler.LOG.debug("Crawler thread #{} started.", Integer.valueOf(this.threadIndex));
            try {
                Crawler.this.getEventManager().fire(new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_RUN_THREAD_BEGIN, Crawler.this).subject(Thread.currentThread()).m14build());
                while (!Crawler.this.isStopped()) {
                    ReferenceProcessStatus status;
                    try {
                        status = Crawler.this.processNextReference(this.flags);
                    } catch (Exception e) {
                        Crawler.LOG.error("An error occured that could compromise the stability of the crawler. Stopping excution to avoid further issues...", e);
                        Crawler.this.stop();
                        // No status was obtained for this iteration: skip the
                        // status checks and let the loop condition observe
                        // the stop request.
                        continue;
                    }
                    if (status == ReferenceProcessStatus.MAX_REACHED) {
                        Crawler.this.stop();
                        break;
                    }
                    if (status == ReferenceProcessStatus.QUEUE_EMPTY) {
                        if (Crawler.this.isQueueInitialized()) {
                            break;
                        }
                        // Queue is empty but still being filled: wait and retry.
                        Crawler.LOG.info("References are still being queued. Waiting for new references...");
                        Sleeper.sleepSeconds(5);
                    }
                }
            } catch (Exception e2) {
                Crawler.LOG.error("Problem in thread execution.", e2);
            } finally {
                // Release the waiting coordinator first so that a failure
                // while firing the end event cannot leave it blocked.
                this.latch.countDown();
                Crawler.this.getEventManager().fire(new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_RUN_THREAD_END, Crawler.this).subject(Thread.currentThread()).m14build());
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:com/norconex/collector/core/crawler/Crawler$ReferenceProcessStatus.class */
    /**
     * Outcome of a single attempt at processing the next queued reference,
     * as returned by {@code processNextReference(ProcessFlags)}.
     */
    public enum ReferenceProcessStatus {
        /** The maximum number of documents was reached (not returned when deleting). */
        MAX_REACHED,
        /** Nothing could be polled, no reference is active, and the queue is empty. */
        QUEUE_EMPTY,
        /** A reference was processed, or the caller should simply try again. */
        OK
    }

    /**
     * Creates a crawler bound to the given configuration and owning collector.
     *
     * @param crawlerConfig crawler configuration
     * @param collector the collector this crawler belongs to
     * @throws NullPointerException if either argument is {@code null}
     */
    public Crawler(CrawlerConfig crawlerConfig, Collector collector) {
        Objects.requireNonNull(crawlerConfig, "'config' must not be null");
        // Previously the configuration was checked twice and the collector
        // never was, deferring the failure to the first collector access.
        Objects.requireNonNull(collector, "'collector' must not be null");
        this.config = crawlerConfig;
        this.collector = collector;
        this.committers = new CrawlerCommitterService(this);
    }

    /**
     * Gets the event manager, obtained from the owning collector.
     * @return the collector's event manager
     */
    public EventManager getEventManager() {
        return collector.getEventManager();
    }

    /**
     * Gets the crawler monitor. The monitor is created by {@link #start()}.
     * @return the monitor, or {@code null} if none was created yet
     */
    public CrawlerMonitor getMonitor() {
        return monitor;
    }

    /**
     * Gets the committer service created with this crawler.
     * @return the committer service
     */
    public CrawlerCommitterService getCommitterService() {
        return committers;
    }

    /**
     * Gets the crawler identifier, as defined in the crawler configuration.
     * @return the configured crawler id
     */
    public String getId() {
        return config.getId();
    }

    /**
     * Tells whether {@link #stop()} was invoked on this crawler.
     * @return {@code true} once a stop was requested
     */
    public boolean isStopped() {
        return stopped;
    }

    /**
     * Requests this crawler to stop: fires a
     * {@code CRAWLER_STOP_BEGIN} event, raises the stop flag checked by the
     * worker threads, then logs the request.
     */
    public void stop() {
        getEventManager().fire(
                new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_STOP_BEGIN, this)
                        .m14build());
        stopped = true;
        LOG.info("Stopping the crawler.");
    }

    /**
     * Gets the importer. The importer is created by {@link #start()}.
     * @return the importer, or {@code null} if none was created yet
     */
    public Importer getImporter() {
        return importer;
    }

    /**
     * Gets the cached stream factory, obtained from the owning collector.
     * @return the collector's stream factory
     */
    public CachedStreamFactory getStreamFactory() {
        return collector.getStreamFactory();
    }

    /**
     * Gets the configuration this crawler was created with.
     * @return the crawler configuration
     */
    public CrawlerConfig getCrawlerConfig() {
        return config;
    }

    /**
     * Gets the collector this crawler was created with.
     * @return the owning collector
     */
    public Collector getCollector() {
        return collector;
    }

    /**
     * Gets this crawler's working directory: a sub-directory of the
     * collector working directory named after the file-safe crawler id.
     * The directory is created on first call and the path is then reused.
     *
     * @return the crawler working directory
     * @throws CollectorException if the directory cannot be created
     */
    public Path getWorkDir() {
        if (workDir == null) {
            Path dir = collector.getWorkDir().resolve(FileUtil.toSafeFileName(getId()));
            try {
                Files.createDirectories(dir);
            } catch (IOException e) {
                throw new CollectorException("Could not create crawler working directory.", e);
            }
            // Only remember the path once the directory is known to exist.
            workDir = dir;
        }
        return workDir;
    }

    /**
     * Gets this crawler's temporary directory: a sub-directory of the
     * collector temporary directory named after the file-safe crawler id.
     * The directory is created on first call and the path is then reused.
     *
     * @return the crawler temporary directory
     * @throws CollectorException if the directory cannot be created
     */
    public Path getTempDir() {
        if (tempDir == null) {
            Path dir = collector.getTempDir().resolve(FileUtil.toSafeFileName(getId()));
            try {
                Files.createDirectories(dir);
            } catch (IOException e) {
                throw new CollectorException("Could not create crawler temp directory.", e);
            }
            // Only remember the path once the directory is known to exist.
            tempDir = dir;
        }
        return tempDir;
    }

    /**
     * Gets the download directory, which {@link #initCrawler()} sets to the
     * {@code downloads} sub-directory of the crawler working directory.
     * @return the download directory, or {@code null} if not yet initialized
     */
    public Path getDownloadDir() {
        return downloadDir;
    }

    /**
     * Runs this crawler: initializes it, creates the importer, monitor and
     * progress logger, optionally registers the JMX monitor (when the
     * {@code enableJMX} system property is {@code true}), then executes the
     * crawl.
     * <p>
     * Whether the crawl succeeds or fails, {@link #afterCrawlerExecution()}
     * is invoked exactly once, the JMX monitor is unregistered (if enabled)
     * and the execution summary is logged.
     *
     * @throws CollectorException if the crawler id is blank
     */
    public void start() {
        boolean resume = initCrawler();
        this.importer = new Importer(getCrawlerConfig().getImporterConfig(), getEventManager());
        this.monitor = new CrawlerMonitor(this);
        this.progressLogger = new CrawlProgressLogger(this.monitor, 30000L);
        this.progressLogger.startTracking();
        if (Boolean.getBoolean("enableJMX")) {
            CrawlerMonitorJMX.register(this);
        }
        try {
            getEventManager().fire(new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_RUN_BEGIN, this).m14build());
            logUsefulInfo();
            beforeCrawlerExecution(resume);
            if (StringUtils.isBlank(getCrawlerConfig().getId())) {
                throw new CollectorException("Crawler must be given a unique identifier (id).");
            }
            doExecute();
        } finally {
            // A single finally replaces the cleanup that used to be
            // duplicated in a catch-all handler, where a failing cleanup on
            // the success path caused the cleanup to run a second time.
            try {
                afterCrawlerExecution();
            } finally {
                // Runs even if afterCrawlerExecution() fails, so the JMX
                // registration made above is not left behind.
                if (Boolean.getBoolean("enableJMX")) {
                    CrawlerMonitorJMX.unregister(this);
                }
                LOG.info("Execution Summary:{}", this.progressLogger.getExecutionSummary());
            }
        }
    }

    /**
     * Logs whether JMX support is enabled, based on the {@code enableJMX}
     * system property.
     */
    private void logUsefulInfo() {
        String jmxMessage = Boolean.getBoolean("enableJMX")
                ? "JMX support enabled."
                : "JMX support disabled. To enable, set -DenableJMX=true system property as JVM argument.";
        LOG.info(jmxMessage);
    }

    /**
     * Initializes this crawler: names the current thread after the crawler
     * id, resolves the download directory, initializes the data store
     * engine, creates the document info service, initializes the committers
     * and opens the document info service. Init begin/end events are fired
     * around these steps.
     *
     * @return the value returned when opening the document info service
     * @throws CollectorException if the crawler id is blank
     */
    protected boolean initCrawler() {
        Thread.currentThread().setName(getId());
        getEventManager().fire(
                ((CrawlerEvent.Builder) new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_INIT_BEGIN, this)
                        .message("Initializing crawler \"" + getId() + "\"..."))
                        .m14build());

        if (StringUtils.isBlank(config.getId())) {
            throw new CollectorException("Crawler must be given a unique identifier (id).");
        }

        downloadDir = getWorkDir().resolve("downloads");

        dataStoreEngine = config.getDataStoreEngine();
        dataStoreEngine.init(this);
        crawlDocInfoService = new CrawlDocInfoService(this);

        committers.init(
                CommitterContext.builder()
                        .setEventManager(getEventManager())
                        .setWorkDir(getWorkDir().resolve("committer"))
                        .setStreamFactory(getStreamFactory())
                        .build());

        boolean opened = crawlDocInfoService.open();

        getEventManager().fire(
                ((CrawlerEvent.Builder) new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_INIT_END, this)
                        .message("Crawler \"" + getId() + "\" initialized successfully."))
                        .m14build());
        return opened;
    }

    /**
     * Gets the document info class used by this crawler. This base
     * implementation returns {@link CrawlDocInfo}; being protected and
     * non-final, subclasses may return a more specific type.
     * @return the document info type
     */
    protected Class<? extends CrawlDocInfo> getCrawlReferenceType() {
        return CrawlDocInfo.class;
    }

    /**
     * Gets the data store engine, which {@link #initCrawler()} takes from
     * the crawler configuration.
     * @return the data store engine, or {@code null} if not yet initialized
     */
    public IDataStoreEngine getDataStoreEngine() {
        return dataStoreEngine;
    }

    /**
     * Gets the document info service created by {@link #initCrawler()}.
     * @return the document info service, or {@code null} if not yet
     *         initialized
     */
    public CrawlDocInfoService getDocInfoService() {
        return crawlDocInfoService;
    }

    /**
     * Cleans this crawler's data: initializes the crawler, cleans the
     * committers, destroys the crawler resources, then deletes the crawler
     * temporary and working directories. Clean begin/end events are fired
     * around these steps.
     *
     * @throws CollectorException if an I/O error prevents the cleaning
     */
    public void clean() {
        initCrawler();
        getEventManager().fire(
                ((CrawlerEvent.Builder) new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_CLEAN_BEGIN, this)
                        .message("Cleaning cached crawler \"" + getId() + "\" data..."))
                        .m14build());
        try {
            committers.clean();
            // Resources are released before their directories are removed.
            destroyCrawler();
            FileUtils.deleteDirectory(getTempDir().toFile());
            FileUtils.deleteDirectory(getWorkDir().toFile());
            getEventManager().fire(
                    ((CrawlerEvent.Builder) new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_CLEAN_END, this)
                            .message("Done cleaning crawler \"" + getId() + "\"."))
                            .m14build());
        } catch (IOException e) {
            throw new CollectorException("Could not clean \"" + getId() + "\" crawler directory.", e);
        }
    }

    /**
     * Imports data store content from the given path into this crawler.
     * The crawler is initialized first and its resources are released
     * exactly once afterwards, whether the import succeeds or fails.
     *
     * @param path location of the data to import
     * @throws CollectorException if an I/O error occurs while importing
     */
    public void importDataStore(Path path) {
        initCrawler();
        try {
            DataStoreImporter.importDataStore(this, path);
        } catch (IOException e) {
            throw new CollectorException("Could not import data store.", e);
        } finally {
            // Previously a failing destroyCrawler() on the success path was
            // caught by a catch-all handler that invoked it a second time.
            destroyCrawler();
        }
    }

    /**
     * Exports this crawler's data store content. The crawler is initialized
     * first and its resources are released exactly once afterwards, whether
     * the export succeeds or fails.
     *
     * @param path location handed to the data store exporter
     * @return the path returned by the data store exporter
     * @throws CollectorException if an I/O error occurs while exporting
     */
    public Path exportDataStore(Path path) {
        initCrawler();
        try {
            return DataStoreExporter.exportDataStore(this, path);
        } catch (IOException e) {
            throw new CollectorException("Could not export data store.", e);
        } finally {
            // Previously a failing destroyCrawler() on the success path was
            // caught by a catch-all handler that invoked it a second time.
            destroyCrawler();
        }
    }

    /**
     * Releases the resources acquired by {@link #initCrawler()}: closes the
     * document info service, the data store engine and the committers, in
     * that order. Each close is attempted even if a previous one fails, so
     * one failure does not leave the remaining resources open.
     */
    protected void destroyCrawler() {
        try {
            this.crawlDocInfoService.close();
        } finally {
            try {
                this.dataStoreEngine.close();
            } finally {
                this.committers.close();
            }
        }
    }

    /**
     * Hook invoked by {@link #start()} right before the crawl is executed.
     * @param z the value returned by {@link #initCrawler()}, i.e. the result
     *     of opening the document info service (presumably whether a
     *     previous crawl is being resumed — TODO confirm)
     */
    protected abstract void beforeCrawlerExecution(boolean z);

    /**
     * Hook invoked by {@link #start()} after the crawl execution, both when
     * it completed normally and when it failed.
     */
    protected abstract void afterCrawlerExecution();

    /**
     * Executes the crawl: processes all queued references, handles orphans
     * unless the crawler was stopped, removes empty directories from the
     * download directory, then fires either a stop-end or a run-end event
     * depending on whether the crawler was stopped.
     */
    protected void doExecute() {
        LOG.info("Crawling references...");
        processReferences(new ProcessFlags());

        boolean interrupted = isStopped();
        if (!interrupted) {
            handleOrphans();
        }

        LOG.debug("Removing empty directories");
        FileUtil.deleteEmptyDirs(getDownloadDir().toFile());

        EventManager eventManager = getEventManager();
        // The stop flag is read again: orphan handling may have raised it.
        if (isStopped()) {
            eventManager.fire(new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_STOP_END, this).m14build());
        } else {
            eventManager.fire(new CrawlerEvent.Builder(CrawlerEvent.CRAWLER_RUN_END, this).m14build());
        }
        LOG.info("Crawler {}", isStopped() ? "stopped." : "completed.");
    }

    /**
     * Applies the configured orphans strategy: {@code PROCESS} reprocesses
     * cached orphans, {@code DELETE} deletes them, and any other value
     * (including {@code null}, treated as {@code IGNORE}) does nothing.
     */
    protected void handleOrphans() {
        CrawlerConfig.OrphansStrategy strategy = config.getOrphansStrategy();
        if (strategy == CrawlerConfig.OrphansStrategy.PROCESS) {
            reprocessCacheOrphans();
            return;
        }
        if (strategy == CrawlerConfig.OrphansStrategy.DELETE) {
            deleteCacheOrphans();
        }
        // null or IGNORE: nothing to do.
    }

    /**
     * Tells whether the configured maximum number of documents was reached.
     * A configured maximum of {@code -1} or less means no limit.
     * @return {@code true} if a limit is set and the monitor's processed
     *         count is equal to or greater than it
     */
    protected boolean isMaxDocuments() {
        int maxDocuments = getCrawlerConfig().getMaxDocuments();
        if (maxDocuments <= -1) {
            return false;
        }
        return monitor.getProcessedCount() >= (long) maxDocuments;
    }

    /**
     * Sends every cached document info through the queue pipeline and, if
     * there was at least one, processes the queue again with the "orphan"
     * flag set. Does nothing but log when the maximum number of documents
     * is already reached.
     */
    protected void reprocessCacheOrphans() {
        if (!isMaxDocuments()) {
            LOG.info("Reprocessing any cached/orphan references...");
            MutableLong orphanCount = new MutableLong();
            crawlDocInfoService.forEachCached((ref, docInfo) -> {
                executeQueuePipeline(docInfo);
                orphanCount.increment();
                return true;
            });
            if (orphanCount.longValue() > 0) {
                processReferences(new ProcessFlags().orphan());
            }
            LOG.info("Reprocessed {} cached/orphan references.", orphanCount);
        } else {
            LOG.info("Max documents reached. Not reprocessing orphans (if any).");
        }
    }

    /**
     * Runs the queue pipeline on the given document info. In this class it
     * is invoked by {@link #reprocessCacheOrphans()} for each cached entry.
     * @param crawlDocInfo the document info to submit
     */
    protected abstract void executeQueuePipeline(CrawlDocInfo crawlDocInfo);

    /**
     * Queues every cached document info and, if there was at least one,
     * processes the queue with the "delete" flag set.
     */
    protected void deleteCacheOrphans() {
        LOG.info("Deleting orphan references (if any)...");
        MutableLong orphanCount = new MutableLong();
        crawlDocInfoService.forEachCached((ref, docInfo) -> {
            crawlDocInfoService.queue(docInfo);
            orphanCount.increment();
            return true;
        });
        boolean anyQueued = orphanCount.longValue() > 0;
        if (anyQueued) {
            processReferences(new ProcessFlags().delete());
        }
        LOG.info("Deleted {} orphan references.", orphanCount);
    }

    /**
     * Processes queued references using as many worker threads as
     * configured, blocking until every worker has ended.
     * <p>
     * The thread pool is always shut down before this method returns, even
     * when the wait is interrupted or a worker cannot be submitted, so that
     * idle pool threads are not left behind.
     *
     * @param processFlags flags handed to every worker
     * @throws CollectorException if the calling thread is interrupted while
     *     waiting (the interrupt status is restored first)
     */
    protected void processReferences(ProcessFlags processFlags) {
        int numThreads = getCrawlerConfig().getNumThreads();
        CountDownLatch latch = new CountDownLatch(numThreads);
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        try {
            // Thread indices are 1-based.
            for (int threadIndex = 1; threadIndex <= numThreads; threadIndex++) {
                LOG.debug("Crawler thread #{} starting...", Integer.valueOf(threadIndex));
                pool.execute(new ProcessReferencesRunnable(latch, processFlags, threadIndex));
            }
            latch.await();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new CollectorException(e);
        } finally {
            // shutdown() lets running workers finish but releases the pool.
            pool.shutdown();
        }
    }

    /**
     * Polls the next reference from the queue and processes it.
     *
     * @param processFlags flags describing how references are processed;
     *     the maximum-documents check is skipped when deleting
     * @return {@code MAX_REACHED} if the maximum number of documents is
     *     reached, {@code QUEUE_EMPTY} if nothing was polled while no
     *     reference is active and the queue is empty, {@code OK} otherwise
     */
    protected ReferenceProcessStatus processNextReference(ProcessFlags processFlags) {
        if (!processFlags.delete && isMaxDocuments()) {
            LOG.info("Maximum documents reached: {}", Integer.valueOf(getCrawlerConfig().getMaxDocuments()));
            return ReferenceProcessStatus.MAX_REACHED;
        }
        Optional<CrawlDocInfo> queued = this.crawlDocInfoService.pollQueue();
        LOG.trace("Processing next reference from Queue: {}", queued);
        if (queued.isPresent()) {
            CrawlDocInfo docInfo = queued.get();
            StopWatch stopWatch = null;
            if (LOG.isDebugEnabled()) {
                stopWatch = new StopWatch();
                stopWatch.start();
            }
            processNextQueuedCrawlData(docInfo, processFlags);
            // Test the watch itself rather than the log level again: the
            // level may change while the reference is being processed, which
            // used to cause a NullPointerException here.
            if (stopWatch != null) {
                stopWatch.stop();
                LOG.debug("{} to process: {}", stopWatch, docInfo.getReference());
            }
            return ReferenceProcessStatus.OK;
        }
        long activeCount = this.crawlDocInfoService.getActiveCount();
        boolean isQueueEmpty = this.crawlDocInfoService.isQueueEmpty();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Number of references currently being processed: {}", Long.valueOf(activeCount));
            LOG.trace("Is reference queue empty? {}", Boolean.valueOf(isQueueEmpty));
        }
        if (activeCount == 0 && isQueueEmpty) {
            return ReferenceProcessStatus.QUEUE_EMPTY;
        }
        // Other references are still active or queued: pause briefly before
        // the caller polls again.
        Sleeper.sleepMillis(1L);
        return ReferenceProcessStatus.OK;
    }

    /**
     * Extension hook for initializing a crawl document; does nothing by
     * default. No call site is visible in the decompiled code of this class
     * (presumably invoked from {@code processNextQueuedCrawlData}, whose
     * body could not be decompiled — TODO confirm).
     * @param crawlDoc the crawl document
     */
    protected void initCrawlDoc(CrawlDoc crawlDoc) {
    }

    /* JADX WARN: Incorrect condition in loop: B:28:0x013b */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    /**
     * Processes a single document info polled from the queue. Invoked by
     * {@code processNextReference(ProcessFlags)}.
     * <p>
     * NOTE(review): the decompiler failed to reconstruct this method. As
     * written, the body unconditionally throws
     * {@link UnsupportedOperationException}, so every processed reference
     * fails. The actual logic must be restored from the original sources
     * or bytecode before this class can be used.
     *
     * @param r8 the queued document info to process
     * @param r9 the processing flags
     */
    private void processNextQueuedCrawlData(com.norconex.collector.core.doc.CrawlDocInfo r8, com.norconex.collector.core.crawler.Crawler.ProcessFlags r9) {
        /*
            Method dump skipped, instructions count: 350
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: com.norconex.collector.core.crawler.Crawler.processNextQueuedCrawlData(com.norconex.collector.core.doc.CrawlDocInfo, com.norconex.collector.core.crawler.Crawler$ProcessFlags):void");
    }

    /**
     * Handles an importer response for a document: on success, fires a
     * {@code DOCUMENT_IMPORTED} event and runs the committer pipeline;
     * otherwise marks the document as rejected and fires a
     * {@code REJECTED_IMPORT} event. The document is then finalized, after
     * which each nested response is handled recursively as a child document.
     *
     * @param importerResponse the importer response
     * @param crawlDoc the document the response is for
     */
    private void processImportResponse(ImporterResponse importerResponse, CrawlDoc crawlDoc) {
        CrawlDocInfo docInfo = crawlDoc.m18getDocInfo();

        String statusMessage = importerResponse.getImporterStatus().toString();
        int nestedCount = importerResponse.getNestedResponses().length;
        if (nestedCount > 0) {
            statusMessage = statusMessage + "(" + nestedCount + " nested responses.)";
        }

        if (!importerResponse.isSuccess()) {
            docInfo.setState(CrawlState.REJECTED);
            getEventManager().fire(
                    ((CrawlerEvent.Builder) new CrawlerEvent.Builder(CrawlerEvent.REJECTED_IMPORT, this)
                            .crawlDocInfo(docInfo)
                            .subject(importerResponse)
                            .message(statusMessage))
                            .m14build());
            LOG.debug("Importing unsuccessful for \"{}\": {}", docInfo.getReference(), importerResponse.getImporterStatus().getDescription());
        } else {
            getEventManager().fire(
                    ((CrawlerEvent.Builder) new CrawlerEvent.Builder(CrawlerEvent.DOCUMENT_IMPORTED, this)
                            .crawlDocInfo(docInfo)
                            .subject(importerResponse)
                            .message(statusMessage))
                            .m14build());
            executeCommitterPipeline(this, crawlDoc);
        }
        finalizeDocumentProcessing(crawlDoc);

        // The parent is finalized before any of its children are handled.
        for (ImporterResponse childResponse : importerResponse.getNestedResponses()) {
            CrawlDocInfo childInfo = createChildDocInfo(childResponse.getReference(), docInfo);
            CrawlDocInfo childCached = this.crawlDocInfoService.getCached(childResponse.getReference()).orElse(null);
            Doc childContent = childResponse.getDocument();
            // A child without a document gets an empty content stream.
            CrawlDoc childDoc = new CrawlDoc(childInfo, childCached,
                    childContent != null ? childContent.getInputStream() : CachedInputStream.cache(new NullInputStream(0L)));
            if (childContent != null) {
                childDoc.getMetadata().putAll(childContent.getMetadata());
            }
            processImportResponse(childResponse, childDoc);
        }
    }

    /**
     * Completes the processing of a document in three independent steps,
     * each logging (rather than propagating) any exception it encounters:
     * <ol>
     *   <li>applies the spoiled-reference strategy when the document is not
     *       in a good state;</li>
     *   <li>marks the reference (and its variations) as processed and logs
     *       progress;</li>
     *   <li>disposes of the document input stream.</li>
     * </ol>
     *
     * @param crawlDoc the document being finalized
     */
    private void finalizeDocumentProcessing(CrawlDoc crawlDoc) {
        CrawlDocInfo m18getDocInfo = crawlDoc.m18getDocInfo();
        CrawlDocInfo cachedDocInfo = crawlDoc.getCachedDocInfo();
        // A document should always have a state by now; fall back to BAD_STATUS.
        if (m18getDocInfo.getState() == null) {
            LOG.warn("Reference status is unknown for \"{}\". This should not happen. Assuming bad status.", m18getDocInfo.getReference());
            m18getDocInfo.setState(CrawlState.BAD_STATUS);
        }
        try {
            beforeFinalizeDocumentProcessing(crawlDoc);
            // Not new/modified: fill null properties from the cached copy.
            if (!m18getDocInfo.getState().isNewOrModified() && cachedDocInfo != null) {
                BeanUtil.copyPropertiesOverNulls(m18getDocInfo, cachedDocInfo);
            }
            // "Spoiled" here means: not a good state and not already deleted.
            if (!m18getDocInfo.getState().isGoodState() && !m18getDocInfo.getState().isOneOf(CrawlState.DELETED)) {
                SpoiledReferenceStrategy spoiledStateStrategy = getSpoiledStateStrategy(m18getDocInfo);
                if (spoiledStateStrategy == SpoiledReferenceStrategy.IGNORE) {
                    LOG.debug("Ignoring spoiled reference: {}", m18getDocInfo.getReference());
                } else if (spoiledStateStrategy == SpoiledReferenceStrategy.DELETE) {
                    // Delete only if previously cached and not already deleted.
                    if (cachedDocInfo != null && !cachedDocInfo.getState().isOneOf(CrawlState.DELETED)) {
                        deleteReference(crawlDoc);
                    }
                } else if (cachedDocInfo != null && !cachedDocInfo.getState().isOneOf(CrawlState.DELETED)) {
                    // Any other strategy: spare the reference once if its
                    // cached state was good, delete it otherwise.
                    if (cachedDocInfo.getState().isGoodState()) {
                        LOG.debug("This spoiled reference is being graced once (will be deleted next time if still spoiled): {}", m18getDocInfo.getReference());
                    } else {
                        deleteReference(crawlDoc);
                    }
                }
            }
        } catch (Exception e) {
            LOG.error("Could not finalize processing of: {} ({})", new Object[]{m18getDocInfo.getReference(), e.getMessage(), e});
        }
        try {
            this.crawlDocInfoService.processed(m18getDocInfo);
            markReferenceVariationsAsProcessed(m18getDocInfo);
            this.progressLogger.logProgress();
        } catch (Exception e2) {
            LOG.error("Could not mark reference as processed: {} ({})", new Object[]{m18getDocInfo.getReference(), e2.getMessage(), e2});
        }
        try {
            crawlDoc.getInputStream().dispose();
        } catch (Exception e3) {
            LOG.error("Could not dispose of resources.", e3);
        }
    }

    /**
     * Extension hook invoked at the start of document finalization, before
     * the spoiled-reference handling; does nothing by default. Exceptions
     * thrown from it are logged by the caller and not propagated.
     * @param crawlDoc the document being finalized
     */
    protected void beforeFinalizeDocumentProcessing(CrawlDoc crawlDoc) {
    }

    /**
     * Invoked during document finalization, right after the given document
     * info was reported as processed to the document info service.
     * @param crawlDocInfo the document info just marked as processed
     */
    protected abstract void markReferenceVariationsAsProcessed(CrawlDocInfo crawlDocInfo);

    /**
     * Creates the document info for a nested importer response.
     * @param str the reference of the nested response
     * @param crawlDocInfo the document info of the parent document
     * @return the document info to use for the child document
     */
    protected abstract CrawlDocInfo createChildDocInfo(String str, CrawlDocInfo crawlDocInfo);

    /**
     * Runs the importer pipeline for the given context. No call site is
     * visible in the decompiled code of this class (presumably invoked from
     * {@code processNextQueuedCrawlData}, whose body could not be
     * decompiled — TODO confirm).
     * @param importerPipelineContext the importer pipeline context
     * @return the importer response
     */
    protected abstract ImporterResponse executeImporterPipeline(ImporterPipelineContext importerPipelineContext);

    /**
     * Runs the committer pipeline for a document. In this class it is
     * invoked with {@code this} crawler when an importer response is
     * successful.
     * @param crawler the crawler the document belongs to
     * @param crawlDoc the successfully imported document
     */
    protected abstract void executeCommitterPipeline(Crawler crawler, CrawlDoc crawlDoc);

    /**
     * Resolves the spoiled-reference strategy for a document using the
     * configured strategizer, falling back to the generic strategizer's
     * default fallback strategy when none is resolved.
     *
     * @param crawlDocInfo the document info whose reference and state are
     *     submitted to the strategizer
     * @return the resolved strategy, or the default fallback strategy
     */
    private SpoiledReferenceStrategy getSpoiledStateStrategy(CrawlDocInfo crawlDocInfo) {
        SpoiledReferenceStrategy resolved = config.getSpoiledReferenceStrategizer()
                .resolveSpoiledReferenceStrategy(crawlDocInfo.getReference(), crawlDocInfo.getState());
        return resolved != null
                ? resolved
                : GenericSpoiledReferenceStrategizer.DEFAULT_FALLBACK_STRATEGY;
    }

    /**
     * Marks the document as deleted and sends a delete request for it to
     * the committers.
     * @param crawlDoc the document to delete
     */
    private void deleteReference(CrawlDoc crawlDoc) {
        LOG.debug("Deleting reference: {}", crawlDoc.getReference());
        CrawlDocInfo docInfo = crawlDoc.m18getDocInfo();
        docInfo.setState(CrawlState.DELETED);
        committers.delete(crawlDoc);
    }

    /**
     * Tells whether the reference queue is done being initially populated.
     * Worker threads finding an empty queue only end when this returns
     * {@code true}; otherwise they wait and poll again. This base
     * implementation always returns {@code true}.
     * @return {@code true} if no more initial references are expected
     */
    protected boolean isQueueInitialized() {
        return true;
    }

    /**
     * Returns the crawler id as this crawler's string representation.
     * @return the crawler id
     */
    @Override
    public String toString() {
        return getId();
    }
}
