/*
 * Decompiled with CFR 0.152.
 */
package org.apache.lucene.benchmark.utils;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.nio.file.attribute.FileAttribute;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.util.IOUtils;

public class ExtractReuters {
    private Path reutersDir;
    private Path outputDir;
    Pattern EXTRACTION_PATTERN = Pattern.compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");
    private static String[] META_CHARS = new String[]{"&", "<", ">", "\"", "'"};
    private static String[] META_CHARS_SERIALIZATIONS = new String[]{"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};

    public ExtractReuters(Path reutersDir, Path outputDir) throws IOException {
        this.reutersDir = reutersDir;
        this.outputDir = outputDir;
        System.out.println("Deleting all files in " + outputDir);
        IOUtils.rm((Path[])new Path[]{outputDir});
    }

    public void extract() throws IOException {
        long count = 0L;
        Files.createDirectories(this.outputDir, new FileAttribute[0]);
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(this.reutersDir, "*.sgm");){
            for (Path sgmFile : stream) {
                this.extractFile(sgmFile);
                ++count;
            }
        }
        if (count == 0L) {
            System.err.println("No .sgm files in " + this.reutersDir);
        }
    }

    protected void extractFile(Path sgmFile) {
        try (BufferedReader reader = Files.newBufferedReader(sgmFile, StandardCharsets.ISO_8859_1);){
            StringBuilder buffer = new StringBuilder(1024);
            StringBuilder outBuffer = new StringBuilder(1024);
            String line = null;
            int docNumber = 0;
            while ((line = reader.readLine()) != null) {
                if (line.indexOf("</REUTERS") == -1) {
                    buffer.append(line).append(' ');
                    continue;
                }
                Matcher matcher = this.EXTRACTION_PATTERN.matcher(buffer);
                while (matcher.find()) {
                    for (int i = 1; i <= matcher.groupCount(); ++i) {
                        if (matcher.group(i) == null) continue;
                        outBuffer.append(matcher.group(i));
                    }
                    outBuffer.append(System.lineSeparator()).append(System.lineSeparator());
                }
                String out = outBuffer.toString();
                for (int i = 0; i < META_CHARS_SERIALIZATIONS.length; ++i) {
                    out = out.replaceAll(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
                }
                Path outFile = this.outputDir.resolve(sgmFile.getFileName() + "-" + docNumber++ + ".txt");
                try (BufferedWriter writer = Files.newBufferedWriter(outFile, StandardCharsets.UTF_8, new OpenOption[0]);){
                    writer.write(out);
                }
                outBuffer.setLength(0);
                buffer.setLength(0);
            }
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            ExtractReuters.usage("Wrong number of arguments (" + args.length + ")");
            return;
        }
        Path reutersDir = Paths.get(args[0], new String[0]);
        if (!Files.exists(reutersDir, new LinkOption[0])) {
            ExtractReuters.usage("Cannot find Path to Reuters SGM files (" + reutersDir + ")");
            return;
        }
        Path outputDir = Paths.get(args[1] + "-tmp", new String[0]);
        Files.createDirectories(outputDir, new FileAttribute[0]);
        ExtractReuters extractor = new ExtractReuters(reutersDir, outputDir);
        extractor.extract();
        Files.move(outputDir, Paths.get(args[1], new String[0]), StandardCopyOption.ATOMIC_MOVE);
    }

    private static void usage(String msg) {
        System.err.println("Usage: " + msg + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
    }
}

