package io.spokestack.spokestack.asr;

import io.spokestack.spokestack.RingBuffer;
import io.spokestack.spokestack.SpeechConfig;
import io.spokestack.spokestack.SpeechContext;
import io.spokestack.spokestack.SpeechProcessor;
import io.spokestack.spokestack.tensorflow.TensorflowModel;
import io.spokestack.spokestack.wakeword.WakewordTrigger;
import java.nio.ByteBuffer;
import org.jtransforms.fft.FloatFFT_1D;

/* loaded from: input_file:io/spokestack/spokestack/asr/KeywordRecognizer.class */
/**
 * Speech processor that classifies an utterance into one of a configured
 * set of keyword classes.
 *
 * <p>While the speech context is active, incoming 16-bit samples are
 * pre-emphasized, windowed, transformed with an FFT and passed through the
 * filter and encode models; the encoder outputs are accumulated in a
 * sliding window. When the context transitions from active to inactive,
 * the accumulated encodings are run through the detect model and either a
 * {@code RECOGNIZE} or a {@code TIMEOUT} event is dispatched.
 */
public final class KeywordRecognizer implements SpeechProcessor {
    /** Name of the only FFT window type accepted by the constructor. */
    public static final String FFT_WINDOW_TYPE_HANN = "hann";
    /** Default for the "keyword-fft-window-type" configuration key. */
    public static final String DEFAULT_FFT_WINDOW_TYPE = "hann";
    /** Default for the "keyword-pre-emphasis" configuration key. */
    public static final float DEFAULT_PRE_EMPHASIS = 0.97f;
    /** Default for the "keyword-fft-window-size" configuration key (samples). */
    public static final int DEFAULT_FFT_WINDOW_SIZE = 512;
    /** Default for the "keyword-fft-hop-length" configuration key. */
    public static final int DEFAULT_FFT_HOP_LENGTH = 10;
    /** Default for the "keyword-mel-frame-length" configuration key. */
    public static final int DEFAULT_MEL_FRAME_LENGTH = 110;
    /** Default for the "keyword-mel-frame-width" configuration key. */
    public static final int DEFAULT_MEL_FRAME_WIDTH = 40;
    /** Default for the "keyword-encode-length" configuration key. */
    public static final int DEFAULT_ENCODE_LENGTH = 920;
    /** Default for the "keyword-encode-width" configuration key. */
    public static final int DEFAULT_ENCODE_WIDTH = 128;
    /** Default for the "keyword-threshold" configuration key. */
    public static final float DEFAULT_THRESHOLD = 0.5f;

    // class names, in the order of the detect model's outputs
    private final String[] classes;

    // signal pre-emphasis state
    private final float preEmphasis;
    private float prevSample;

    // FFT state: transform, window coefficients and scratch frame
    private final FloatFFT_1D fft;
    private final float[] fftWindow;
    private final float[] fftFrame;
    private final int hopLength;
    private final int melWidth;
    private final int encodeWidth;

    // sliding windows over raw samples, filter outputs and encoder outputs
    private final RingBuffer sampleWindow;
    private final RingBuffer frameWindow;
    private final RingBuffer encodeWindow;

    private final TensorflowModel filterModel;
    private final TensorflowModel encodeModel;
    private final TensorflowModel detectModel;

    // minimum detect-model score required to dispatch RECOGNIZE
    private final float threshold;

    // context activity observed at the end of the previous process() call
    private boolean isActive;

    /**
     * Creates a recognizer that loads its models with a default
     * {@link TensorflowModel.Loader}.
     *
     * @param speechConfig configuration to read the "keyword-*" and
     *                     "sample-rate" keys from
     */
    public KeywordRecognizer(SpeechConfig speechConfig) {
        this(speechConfig, new TensorflowModel.Loader());
    }

    /**
     * Creates a recognizer using the supplied model loader.
     *
     * @param speechConfig configuration to read the "keyword-*" and
     *                     "sample-rate" keys from
     * @param loader       loader used (and reset between uses) to load the
     *                     filter, encode and detect models
     * @throws IllegalArgumentException if the class list is empty, the FFT
     *         window size is not a positive even number, the derived hop
     *         length is not positive, or the window type is not "hann"
     */
    public KeywordRecognizer(SpeechConfig speechConfig, TensorflowModel.Loader loader) {
        this.classes = speechConfig.getString("keyword-classes").split(",");
        if (this.classes.length < 1) {
            throw new IllegalArgumentException("keyword-classes");
        }

        this.preEmphasis = (float) speechConfig.getDouble(
                "keyword-pre-emphasis", Double.valueOf((double) DEFAULT_PRE_EMPHASIS));

        final int sampleRate = speechConfig.getInteger("sample-rate");
        final int windowSize = speechConfig.getInteger(
                "keyword-fft-window-size", DEFAULT_FFT_WINDOW_SIZE);
        // NOTE(review): the divisor below is a wakeword constant reused here;
        // presumably it stands for milliseconds-per-second (decompiler
        // constant substitution) - confirm its value before relying on units.
        this.hopLength = (speechConfig.getInteger(
                "keyword-fft-hop-length", DEFAULT_FFT_HOP_LENGTH) * sampleRate)
                / WakewordTrigger.DEFAULT_WAKE_ENCODE_LENGTH;
        final String windowType = speechConfig.getString(
                "keyword-fft-window-type", DEFAULT_FFT_WINDOW_TYPE);

        // the FFT output decoding in filter() relies on an even size, and a
        // non-positive size cannot be used to allocate the window/frame
        if (windowSize <= 0 || windowSize % 2 != 0) {
            throw new IllegalArgumentException("keyword-fft-window-size");
        }
        // hopLength is used as a divisor below and as the per-frame advance
        // of the sample window, so it must be strictly positive
        if (this.hopLength <= 0) {
            throw new IllegalArgumentException("keyword-fft-hop-length");
        }

        final int melLength = ((speechConfig.getInteger(
                "keyword-mel-frame-length", Integer.valueOf(DEFAULT_MEL_FRAME_LENGTH)) * sampleRate)
                / WakewordTrigger.DEFAULT_WAKE_ENCODE_LENGTH) / this.hopLength;
        this.melWidth = speechConfig.getInteger(
                "keyword-mel-frame-width", DEFAULT_MEL_FRAME_WIDTH);

        // constant-first comparison so a null window type is rejected with
        // the same exception as an unsupported one
        if (!FFT_WINDOW_TYPE_HANN.equals(windowType)) {
            throw new IllegalArgumentException("keyword-fft-window-type");
        }
        this.fftWindow = hannWindow(windowSize);
        this.fft = new FloatFFT_1D(windowSize);
        this.fftFrame = new float[windowSize];

        final int encodeLength = ((speechConfig.getInteger(
                "keyword-encode-length", Integer.valueOf(DEFAULT_ENCODE_LENGTH)) * sampleRate)
                / WakewordTrigger.DEFAULT_WAKE_ENCODE_LENGTH) / this.hopLength;
        this.encodeWidth = speechConfig.getInteger(
                "keyword-encode-width", DEFAULT_ENCODE_WIDTH);
        // the state width is read but its value is not used by this class;
        // the call is kept so any lookup/validation done by the config
        // still happens
        speechConfig.getInteger("keyword-state-width", Integer.valueOf(this.encodeWidth));

        this.sampleWindow = new RingBuffer(windowSize);
        this.frameWindow = new RingBuffer(melLength * this.melWidth);
        this.encodeWindow = new RingBuffer(encodeLength * this.encodeWidth);
        this.frameWindow.fill(0.0f);
        this.encodeWindow.fill(-1.0f);

        this.filterModel = loader.setPath(speechConfig.getString("keyword-filter-path")).load();
        loader.reset();
        this.encodeModel = loader
                .setPath(speechConfig.getString("keyword-encode-path"))
                .setStatePosition(1)
                .load();
        loader.reset();
        this.detectModel = loader.setPath(speechConfig.getString("keyword-detect-path")).load();

        this.threshold = (float) speechConfig.getDouble(
                "keyword-threshold", Double.valueOf((double) DEFAULT_THRESHOLD));
    }

    /**
     * Closes the filter, encode and detect models, in that order.
     *
     * <p>Every model is closed even if closing an earlier one throws; in
     * that case the exception from the last failing close propagates.
     *
     * @throws Exception if closing any of the models fails
     */
    @Override
    public void close() throws Exception {
        try {
            this.filterModel.close();
        } finally {
            try {
                this.encodeModel.close();
            } finally {
                this.detectModel.close();
            }
        }
    }

    /**
     * Clears the sample window, refills the frame window with 0 and the
     * encode window with -1, and writes 0 into the remaining portion of the
     * encoder's state buffer.
     */
    @Override
    public void reset() {
        this.sampleWindow.reset();
        this.frameWindow.reset().fill(0.0f);
        this.encodeWindow.reset().fill(-1.0f);
        while (this.encodeModel.states().hasRemaining()) {
            this.encodeModel.states().putFloat(0.0f);
        }
    }

    /**
     * Consumes one buffer of audio and runs detection when the context has
     * just become inactive.
     *
     * @param speechContext pipeline context; its active flag gates analysis
     *                      and its transition to inactive triggers detection
     * @param byteBuffer    audio to process, read as consecutive 16-bit
     *                      samples from the start of the buffer
     * @throws Exception declared by the {@link SpeechProcessor} contract
     */
    @Override
    public void process(SpeechContext speechContext, ByteBuffer byteBuffer) throws Exception {
        sample(speechContext, byteBuffer);
        // active -> inactive edge: classify what was accumulated
        if (!speechContext.isActive() && this.isActive) {
            detect(speechContext);
        }
        this.isActive = speechContext.isActive();
    }

    /**
     * Normalizes and pre-emphasizes each sample, appends it to the sample
     * window, and analyzes the window each time it fills while the context
     * is active.
     */
    private void sample(SpeechContext speechContext, ByteBuffer byteBuffer) {
        byteBuffer.rewind();
        while (byteBuffer.hasRemaining()) {
            // scale to [-1, 1]; the clamp covers Short.MIN_VALUE / 32767
            float normalized = Math.max(-1.0f, Math.min(byteBuffer.getShort() / 32767.0f, 1.0f));
            float emphasized = normalized - (this.preEmphasis * this.prevSample);
            this.prevSample = normalized;
            this.sampleWindow.write(emphasized);
            if (this.sampleWindow.isFull()) {
                if (speechContext.isActive()) {
                    analyze(speechContext);
                }
                // presumably advances the read position by one hop so the
                // next window overlaps this one - RingBuffer is not visible
                // here; verify against its implementation
                this.sampleWindow.rewind().seek(this.hopLength);
            }
        }
    }

    /**
     * Applies the FFT window to the contents of the sample window, runs the
     * in-place real forward FFT, and hands the result to {@link #filter}.
     */
    private void analyze(SpeechContext speechContext) {
        for (int i = 0; i < this.fftFrame.length; i++) {
            this.fftFrame[i] = this.sampleWindow.read() * this.fftWindow[i];
        }
        this.fft.realForward(this.fftFrame);
        filter(speechContext);
    }

    /**
     * Feeds the FFT output to the filter model and appends the model's
     * outputs to the frame window, then runs the encoder.
     */
    private void filter(SpeechContext speechContext) {
        // JTransforms packs an even-length realForward result as
        // [Re0, Re(n/2), Re1, Im1, Re2, Im2, ...]; the two purely real bins
        // therefore sit at indices 0 and 1.
        // NOTE(review): those two bins are written as-is (signed), while all
        // other bins are written as magnitudes - confirm this matches what
        // the filter model expects.
        this.filterModel.inputs(0).rewind();
        this.filterModel.inputs(0).putFloat(this.fftFrame[0]);
        for (int i = 1; i < this.fftFrame.length / 2; i++) {
            float re = this.fftFrame[(i * 2) + 0];
            float im = this.fftFrame[(i * 2) + 1];
            this.filterModel.inputs(0).putFloat((float) Math.sqrt((re * re) + (im * im)));
        }
        this.filterModel.inputs(0).putFloat(this.fftFrame[1]);
        this.filterModel.run();

        // presumably drops the oldest frame (melWidth values) to make room
        // for the new one - verify against RingBuffer
        this.frameWindow.rewind().seek(this.melWidth);
        while (this.filterModel.outputs(0).hasRemaining()) {
            this.frameWindow.write(this.filterModel.outputs(0).getFloat());
        }
        encode(speechContext);
    }

    /**
     * Copies the whole frame window into the encode model, runs it, and
     * appends the model's outputs to the encode window.
     */
    private void encode(SpeechContext speechContext) {
        this.frameWindow.rewind();
        this.encodeModel.inputs(0).rewind();
        while (!this.frameWindow.isEmpty()) {
            this.encodeModel.inputs(0).putFloat(this.frameWindow.read());
        }
        this.encodeModel.run();

        this.encodeWindow.rewind().seek(this.encodeWidth);
        while (this.encodeModel.outputs(0).hasRemaining()) {
            this.encodeWindow.write(this.encodeModel.outputs(0).getFloat());
        }
    }

    /**
     * Runs the detect model over the encode window, picks the class with
     * the highest positive score, and dispatches {@code RECOGNIZE} (with
     * transcript and confidence set) if that score reaches the threshold,
     * or {@code TIMEOUT} otherwise. The recognizer is reset afterwards.
     */
    private void detect(SpeechContext speechContext) {
        // bestClass stays null when no score is greater than 0
        String bestClass = null;
        float bestScore = 0.0f;

        this.encodeWindow.rewind();
        this.detectModel.inputs(0).rewind();
        while (!this.encodeWindow.isEmpty()) {
            this.detectModel.inputs(0).putFloat(this.encodeWindow.read());
        }
        this.detectModel.run();

        // one score is read per configured class, in class order
        for (int i = 0; i < this.classes.length; i++) {
            float score = this.detectModel.outputs(0).getFloat();
            if (score > bestScore) {
                bestClass = this.classes[i];
                bestScore = score;
            }
        }

        speechContext.traceInfo("keyword: %.3f %s", Float.valueOf(bestScore), bestClass);
        if (bestScore >= this.threshold) {
            speechContext.setTranscript(bestClass);
            speechContext.setConfidence(bestScore);
            speechContext.dispatch(SpeechContext.Event.RECOGNIZE);
        } else {
            speechContext.dispatch(SpeechContext.Event.TIMEOUT);
        }
        reset();
    }

    /**
     * Builds a Hann window: {@code w[k] = sin^2(pi * k / (i - 1))}.
     *
     * @param i number of coefficients; the constructor only passes positive
     *          even values, so the {@code i - 1} divisor is never zero
     * @return a new array of {@code i} window coefficients
     */
    private static float[] hannWindow(int i) {
        float[] window = new float[i];
        for (int k = 0; k < i; k++) {
            window[k] = (float) Math.pow(Math.sin((3.141592653589793d * k) / (i - 1)), 2.0d);
        }
        return window;
    }
}
