package com.ge.research.semtk.load;

import com.ge.research.semtk.load.dataset.Dataset;
import com.ge.research.semtk.utility.LocalLogger;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Pattern;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.apache.commons.lang.ArrayUtils;
import org.apache.jena.atlas.lib.Chars;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;

/* loaded from: input_file:BOOT-INF/lib/sparqlGraphLibrary-2.2.2.jar:com/ge/research/semtk/load/DataCleaner.class */
/**
 * Reads rows from a {@link Dataset}, applies the configured cleaning steps and writes the
 * resulting rows to a CSV file.
 *
 * <p>Per row, the steps run in this order: lower-casing of selected columns, blanking of
 * "null" values, blanking of "n/a" values, independent column splits, paired column splits.
 * A split turns one input row into several output rows.
 *
 * <p>Cleaning steps can be supplied as a JSON spec (see the {@code JSON_KEY_*} constants) or
 * registered through the {@code add*} methods before {@link #cleanData()} is called.
 */
public class DataCleaner {
    public static final String JSON_KEY_SPLIT = "SPLIT";
    public static final String JSON_KEY_PAIRED_SPLIT = "PAIRED_SPLIT";
    public static final String JSON_KEY_LOWERCASE = "LOWERCASE";
    public static final String JSON_KEY_REMOVE_NULLS = "REMOVE_NULLS";
    public static final String JSON_KEY_REMOVE_NA = "REMOVE_NA";

    /** Delimiters rejected by {@link #validateDelimiter(String)}. */
    private static final String[] UNSUPPORTED_SPLIT_DELIMITERS = {"\n", "^", " "};

    /** Number of records requested from the dataset per read. */
    private static final int BATCH_SIZE = 2;

    private final Dataset dataset;
    private final ArrayList<String> headers;
    private final BufferedWriter writer;
    private final CSVPrinter csvPrinter;

    /** Column name -> delimiter, for columns that are split independently. */
    private final HashMap<String, String> columnsToSplit = new HashMap<>();
    /** Each element maps column name -> delimiter for a group of columns split in lock-step. */
    private final HashSet<HashMap<String, String>> columnsToPairedSplit = new HashSet<>();
    private final HashSet<String> columnsToLowerCase = new HashSet<>();
    private boolean removeNulls = false;
    private boolean removeNA = false;
    private int numRowsProcessed;
    private int numRowsProduced;

    /**
     * Creates a cleaner with no cleaning steps configured.
     *
     * @param dataset the data to clean
     * @param str path of the CSV file to write
     * @throws Exception if the output file cannot be opened or the header cannot be written
     */
    public DataCleaner(Dataset dataset, String str) throws Exception {
        this(dataset, str, null);
    }

    /**
     * Creates a cleaner, opens the output file and writes the header record.
     *
     * @param dataset the data to clean
     * @param str path of the CSV file to write
     * @param jSONObject cleaning spec, or {@code null} for no cleaning steps
     * @throws Exception if the output file cannot be opened, the spec is invalid, or the
     *     header cannot be written
     */
    public DataCleaner(Dataset dataset, String str, JSONObject jSONObject) throws Exception {
        this.dataset = dataset;
        this.headers = dataset.getColumnNamesinOrder();
        this.writer = new BufferedWriter(new FileWriter(str));
        try {
            this.csvPrinter = new CSVPrinter(this.writer, CSVFormat.DEFAULT);
            parseCleanSpecJson(jSONObject);
            this.csvPrinter.printRecord(this.headers);
        } catch (Exception e) {
            // The caller never receives this instance, so nobody else can release the file.
            try {
                this.writer.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Registers the cleaning steps described by a JSON spec.
     *
     * <p>Recognised keys: {@code LOWERCASE} (array of column names), {@code SPLIT} (object of
     * column name to delimiter), {@code PAIRED_SPLIT} (array of such objects) and
     * {@code REMOVE_NULLS} / {@code REMOVE_NA} (enabled when the value reads "true",
     * ignoring case).
     *
     * @param jSONObject the spec; {@code null} registers nothing
     * @throws Exception if a referenced column does not exist, a delimiter is unsupported, or
     *     a column is split twice
     */
    private void parseCleanSpecJson(JSONObject jSONObject) throws Exception {
        if (jSONObject == null) {
            return;
        }
        JSONArray lowerCaseColumns = (JSONArray) jSONObject.get(JSON_KEY_LOWERCASE);
        if (lowerCaseColumns != null) {
            for (Object column : lowerCaseColumns) {
                addToLowerCase((String) column);
            }
        }
        JSONObject splitSpec = (JSONObject) jSONObject.get(JSON_KEY_SPLIT);
        if (splitSpec != null) {
            for (Object column : splitSpec.keySet()) {
                addSplit((String) column, (String) splitSpec.get(column));
            }
        }
        JSONArray pairedSplitSpecs = (JSONArray) jSONObject.get(JSON_KEY_PAIRED_SPLIT);
        if (pairedSplitSpecs != null) {
            for (Object pairedSplitSpec : pairedSplitSpecs) {
                @SuppressWarnings("unchecked")
                HashMap<String, String> pairing = (HashMap<String, String>) pairedSplitSpec;
                addPairedSplit(pairing);
            }
        }
        if (isTrue(jSONObject.get(JSON_KEY_REMOVE_NULLS))) {
            this.removeNulls = true;
        }
        if (isTrue(jSONObject.get(JSON_KEY_REMOVE_NA))) {
            this.removeNA = true;
        }
    }

    /** Returns whether a spec value reads "true" (any case); accepts strings and booleans. */
    private static boolean isTrue(Object value) {
        return value != null && "true".equalsIgnoreCase(value.toString());
    }

    /**
     * Registers an independent split of one column.
     *
     * @param str column name
     * @param str2 delimiter, matched literally
     * @throws Exception if the column does not exist, the delimiter is unsupported, or the
     *     column is already being split
     */
    public void addSplit(String str, String str2) throws Exception {
        validateColumnHeader(str);
        validateDelimiter(str2);
        validateNotSplitYet(str);
        this.columnsToSplit.put(str, str2);
    }

    /**
     * Registers a group of columns that are split together, item by item.
     *
     * @param hashMap column name to delimiter for every column in the group
     * @throws Exception if a column does not exist, a delimiter is unsupported, or a column
     *     is already being split
     */
    public void addPairedSplit(HashMap<String, String> hashMap) throws Exception {
        for (String column : hashMap.keySet()) {
            validateColumnHeader(column);
            validateDelimiter(hashMap.get(column));
            validateNotSplitYet(column);
        }
        this.columnsToPairedSplit.add(hashMap);
    }

    /**
     * Registers a column whose values are converted to lower case.
     *
     * @param str column name
     * @throws Exception if the column does not exist
     */
    public void addToLowerCase(String str) throws Exception {
        validateColumnHeader(str);
        this.columnsToLowerCase.add(str);
    }

    /** Enables or disables replacing "null" values (any case, trimmed) with an empty string. */
    public void addRemoveNulls(boolean z) throws Exception {
        this.removeNulls = z;
    }

    /** Enables or disables replacing "n/a" values (any case, trimmed) with an empty string. */
    public void addRemoveNA(boolean z) throws Exception {
        this.removeNA = z;
    }

    /**
     * Cleans every record of the dataset and writes the result to the output file.
     *
     * <p>The output writer and the dataset are closed when this method finishes, whether it
     * succeeds or fails.
     *
     * @return the number of cleaned rows written (the header record is not counted)
     * @throws Exception if reading, cleaning or writing fails; the original failure is
     *     attached as the cause
     */
    public int cleanData() throws Exception {
        try {
            this.numRowsProcessed = 0;
            this.numRowsProduced = 0;
            ArrayList<ArrayList<String>> batch = this.dataset.getNextRecords(BATCH_SIZE);
            while (batch.size() != 0) {
                for (ArrayList<String> row : batch) {
                    cleanRow(row, this.headers);
                    this.numRowsProcessed++;
                }
                batch = this.dataset.getNextRecords(BATCH_SIZE);
            }
            this.csvPrinter.flush();
            LocalLogger.logToStdOut("Processed " + this.numRowsProcessed + " records, produced " + this.numRowsProduced + " clean records. (DONE)");
            return this.numRowsProduced;
        } catch (Exception e) {
            throw new Exception("Exception cleaning data: " + e, e);
        } finally {
            closeResources();
        }
    }

    /** Closes the output writer and the dataset; the dataset is closed even if the writer fails. */
    private void closeResources() throws Exception {
        try {
            this.writer.close();
        } finally {
            this.dataset.close();
        }
    }

    /**
     * Applies all configured cleaning steps to one row and prints the resulting rows.
     *
     * @param row the input row; modified in place by the lower-case and blanking steps
     * @param headerNames the header list the row must match in length
     * @throws Exception if the row length differs from the header length, a paired split is
     *     inconsistent, or printing fails
     */
    private void cleanRow(ArrayList<String> row, ArrayList<String> headerNames) throws Exception {
        if (row.size() != headerNames.size()) {
            throw new Exception("Row does not have the same number of fields as the header list");
        }
        ArrayList<String> cleaned = performLowerCase(row);
        if (this.removeNulls) {
            cleaned = performRemoveNulls(cleaned);
        }
        if (this.removeNA) {
            cleaned = performRemoveNA(cleaned);
        }
        ArrayList<ArrayList<String>> outputRows = performSplits(cleaned);
        if (this.columnsToPairedSplit.size() > 0) {
            ArrayList<ArrayList<String>> pairedRows = new ArrayList<>();
            for (ArrayList<String> outputRow : outputRows) {
                pairedRows.addAll(performPairedSplits(outputRow));
            }
            outputRows = pairedRows;
        }
        for (ArrayList<String> outputRow : outputRows) {
            this.csvPrinter.printRecord(outputRow);
            this.numRowsProduced++;
        }
    }

    /** Lower-cases, in place, the values of the columns registered for lower-casing. */
    private ArrayList<String> performLowerCase(ArrayList<String> arrayList) {
        for (int i = 0; i < arrayList.size(); i++) {
            if (this.columnsToLowerCase.contains(this.headers.get(i))) {
                arrayList.set(i, arrayList.get(i).toLowerCase());
            }
        }
        return arrayList;
    }

    /** Replaces, in place, every value that trims to "null" (any case) with an empty string. */
    private ArrayList<String> performRemoveNulls(ArrayList<String> arrayList) {
        for (int i = 0; i < arrayList.size(); i++) {
            if (arrayList.get(i).trim().equalsIgnoreCase("null")) {
                arrayList.set(i, "");
            }
        }
        return arrayList;
    }

    /** Replaces, in place, every value that trims to "n/a" (any case) with an empty string. */
    private ArrayList<String> performRemoveNA(ArrayList<String> arrayList) {
        for (int i = 0; i < arrayList.size(); i++) {
            if (arrayList.get(i).trim().equalsIgnoreCase("n/a")) {
                arrayList.set(i, "");
            }
        }
        return arrayList;
    }

    /**
     * Expands a row into one row per combination of split items across all independently
     * split columns.
     *
     * <p>The delimiter is matched literally, consistent with the {@code contains} check that
     * decides whether a value needs splitting. Each item is trimmed.
     *
     * @param arrayList the row to expand
     * @return the expanded rows; contains just the input row when nothing needs splitting
     */
    private ArrayList<ArrayList<String>> performSplits(ArrayList<String> arrayList) {
        ArrayList<ArrayList<String>> rows = new ArrayList<>();
        rows.add(arrayList);
        for (int i = 0; i < arrayList.size(); i++) {
            String value = arrayList.get(i);
            String delimiter = this.columnsToSplit.get(this.headers.get(i));
            if (delimiter == null || !value.contains(delimiter)) {
                continue;
            }
            String[] items = value.split(Pattern.quote(delimiter));
            if (items.length == 0) {
                // String.split drops trailing empty items, so a value made only of delimiters
                // yields nothing; keep the row with an empty value rather than losing it.
                items = new String[] {""};
            }
            ArrayList<ArrayList<String>> expanded = new ArrayList<>(rows.size() * items.length);
            for (ArrayList<String> row : rows) {
                for (String item : items) {
                    ArrayList<String> copy = new ArrayList<>(row);
                    copy.set(i, item.trim());
                    expanded.add(copy);
                }
            }
            rows = expanded;
        }
        return rows;
    }

    /**
     * Applies every registered paired split to a row, feeding the output rows of one paired
     * split into the next.
     *
     * @param arrayList the row to expand
     * @return the expanded rows
     * @throws Exception if the columns of a paired split yield different numbers of items
     */
    private ArrayList<ArrayList<String>> performPairedSplits(ArrayList<String> arrayList) throws Exception {
        ArrayList<ArrayList<String>> rows = new ArrayList<>();
        rows.add(arrayList);
        for (HashMap<String, String> pairing : this.columnsToPairedSplit) {
            ArrayList<ArrayList<String>> expanded = new ArrayList<>();
            for (ArrayList<String> row : rows) {
                expanded.addAll(performPairedSplit(row, pairing));
            }
            rows = expanded;
        }
        return rows;
    }

    /**
     * Splits a group of columns in lock-step: output row {@code k} holds the {@code k}-th
     * item of every column in the group, trimmed.
     *
     * <p>NOTE(review): unlike {@link #performSplits}, the delimiter here is passed to
     * {@link String#split(String)} unquoted and is therefore interpreted as a regular
     * expression. This is kept as-is because existing specs may rely on it; confirm whether
     * literal matching is wanted here too.
     *
     * @param arrayList the row to expand
     * @param hashMap column name to delimiter for the group
     * @return one row per item; empty when the values split into zero items
     * @throws Exception if the columns yield different numbers of items
     */
    private ArrayList<ArrayList<String>> performPairedSplit(ArrayList<String> arrayList, HashMap<String, String> hashMap) throws Exception {
        ArrayList<ArrayList<String>> rows = new ArrayList<>();
        int itemCount = -1;
        for (String column : hashMap.keySet()) {
            String delimiter = hashMap.get(column);
            int columnIndex = this.headers.indexOf(column);
            String value = arrayList.get(columnIndex);
            String[] items = value.split(delimiter);
            if (itemCount == -1) {
                // The first column of the group fixes how many output rows are produced.
                itemCount = items.length;
                for (int k = 0; k < itemCount; k++) {
                    rows.add(new ArrayList<>(arrayList));
                }
            } else if (itemCount != items.length) {
                throw new Exception("Mismatched number of split items across columns: cannot split '" + value + "' into " + itemCount + " elements");
            }
            for (int k = 0; k < itemCount; k++) {
                rows.get(k).set(columnIndex, items[k].trim());
            }
        }
        return rows;
    }

    /** Rejects column names that are not present in the dataset header. */
    private void validateColumnHeader(String str) throws Exception {
        if (!this.headers.contains(str)) {
            throw new Exception("Cannot clean nonexistent column " + str);
        }
    }

    /** Rejects delimiters listed in {@link #UNSUPPORTED_SPLIT_DELIMITERS}. */
    private void validateDelimiter(String str) throws Exception {
        if (ArrayUtils.indexOf(UNSUPPORTED_SPLIT_DELIMITERS, str) > -1) {
            throw new Exception("Cannot yet support splitting on delimiter '" + str + Chars.S_QUOTE1);
        }
    }

    /** Rejects columns already registered for an independent or paired split. */
    private void validateNotSplitYet(String str) throws Exception {
        if (this.columnsToSplit.get(str) != null) {
            throw new Exception("Already splitting column " + str + ", cannot add another split");
        }
        for (HashMap<String, String> pairing : this.columnsToPairedSplit) {
            if (pairing.containsKey(str)) {
                throw new Exception("Already splitting column " + str + ", cannot add another split");
            }
        }
    }
}
