package io.spokestack.spokestack.wakeword;

import io.spokestack.spokestack.RingBuffer;
import io.spokestack.spokestack.SpeechConfig;
import io.spokestack.spokestack.SpeechContext;
import io.spokestack.spokestack.SpeechProcessor;
import io.spokestack.spokestack.tensorflow.TensorflowModel;
import java.nio.ByteBuffer;
import org.jtransforms.fft.FloatFFT_1D;

/* loaded from: input_file:io/spokestack/spokestack/wakeword/WakewordTrigger.class */
/**
 * Wake word detection stage of the speech pipeline.
 *
 * <p>Incoming 16-bit audio frames are normalized, optionally pre-emphasized,
 * and accumulated in a sliding sample window. Each time the window fills
 * while the context reports speech, the window is multiplied by a Hann
 * window, transformed with a real FFT, and passed through three models in
 * sequence: {@code filter} -> {@code encode} -> {@code detect}. When the
 * detect model's output exceeds the configured threshold, the speech context
 * is activated.
 *
 * <p>Configuration keys read by the constructor (defaults in parentheses):
 * <ul>
 *   <li>{@code sample-rate} (required)</li>
 *   <li>{@code wake-filter-path}, {@code wake-encode-path},
 *       {@code wake-detect-path} (required)</li>
 *   <li>{@code rms-target} ({@link #DEFAULT_RMS_TARGET})</li>
 *   <li>{@code rms-alpha} ({@link #DEFAULT_RMS_ALPHA})</li>
 *   <li>{@code pre-emphasis} ({@link #DEFAULT_PRE_EMPHASIS})</li>
 *   <li>{@code fft-window-size} ({@link #DEFAULT_FFT_WINDOW_SIZE}),
 *       must be a positive even number</li>
 *   <li>{@code fft-window-type} ({@link #DEFAULT_FFT_WINDOW_TYPE}),
 *       only {@code "hann"} is accepted</li>
 *   <li>{@code fft-hop-length} ({@link #DEFAULT_FFT_HOP_LENGTH})</li>
 *   <li>{@code mel-frame-length} ({@link #DEFAULT_MEL_FRAME_LENGTH})</li>
 *   <li>{@code mel-frame-width} ({@link #DEFAULT_MEL_FRAME_WIDTH})</li>
 *   <li>{@code wake-encode-length} ({@link #DEFAULT_WAKE_ENCODE_LENGTH})</li>
 *   <li>{@code wake-encode-width} ({@link #DEFAULT_WAKE_ENCODE_WIDTH})</li>
 *   <li>{@code wake-threshold} ({@link #DEFAULT_WAKE_THRESHOLD})</li>
 * </ul>
 *
 * <p>The {@code *-length} values are multiplied by {@code sample-rate / 1000}
 * to obtain sample counts, so they are presumably expressed in milliseconds
 * with the sample rate in Hz (confirm against the configuration docs).
 */
public final class WakewordTrigger implements SpeechProcessor {
    public static final String FFT_WINDOW_TYPE_HANN = "hann";
    public static final String DEFAULT_FFT_WINDOW_TYPE = "hann";
    public static final float DEFAULT_RMS_TARGET = 0.08f;
    public static final float DEFAULT_RMS_ALPHA = 0.0f;
    public static final float DEFAULT_PRE_EMPHASIS = 0.0f;
    public static final int DEFAULT_FFT_WINDOW_SIZE = 512;
    public static final int DEFAULT_FFT_HOP_LENGTH = 10;
    public static final int DEFAULT_MEL_FRAME_LENGTH = 10;
    public static final int DEFAULT_MEL_FRAME_WIDTH = 40;
    public static final int DEFAULT_WAKE_ENCODE_LENGTH = 1000;
    public static final int DEFAULT_WAKE_ENCODE_WIDTH = 128;
    public static final float DEFAULT_WAKE_THRESHOLD = 0.5f;

    /** Divisor that maps a signed 16-bit sample onto roughly [-1, 1]. */
    private static final float SAMPLE_SCALE = Short.MAX_VALUE;

    // speech flag observed on the previous call to process()
    private boolean isSpeech;

    // sample normalization / pre-emphasis state
    private final float rmsTarget;
    private final float rmsAlpha;
    private final float preEmphasis;
    private float rmsValue;
    private float prevSample;

    // FFT configuration and scratch storage
    private final FloatFFT_1D fft;
    private final float[] fftWindow;
    private final float[] fftFrame;
    private final int hopLength;

    // widths (in floats) of one filter-model and one encode-model output frame
    private final int melWidth;
    private final int encodeWidth;

    // sliding windows feeding the FFT, the encode model and the detect model
    private final RingBuffer sampleWindow;
    private final RingBuffer frameWindow;
    private final RingBuffer encodeWindow;

    private final TensorflowModel filterModel;
    private final TensorflowModel encodeModel;
    private final TensorflowModel detectModel;

    // detection threshold and the largest posterior seen since the last reset
    private final float posteriorThreshold;
    private float posteriorMax;

    /**
     * Creates a trigger that loads its models with a default
     * {@link TensorflowModel.Loader}.
     *
     * @param speechConfig configuration holding the keys listed on the class
     * @throws IllegalArgumentException if a validated setting is unsupported
     */
    public WakewordTrigger(SpeechConfig speechConfig) {
        this(speechConfig, new TensorflowModel.Loader());
    }

    /**
     * Creates a trigger using the supplied model loader.
     *
     * @param speechConfig configuration holding the keys listed on the class
     * @param loader       loader used (and reset between uses) to load the
     *                     filter, encode and detect models
     * @throws IllegalArgumentException if {@code fft-window-size} is not a
     *         positive even number, if the hop length resolves to less than
     *         one sample, or if {@code fft-window-type} is not {@code "hann"}
     */
    public WakewordTrigger(SpeechConfig speechConfig, TensorflowModel.Loader loader) {
        // signal normalization settings
        this.rmsTarget = (float) speechConfig.getDouble("rms-target", Double.valueOf(DEFAULT_RMS_TARGET));
        this.rmsAlpha = (float) speechConfig.getDouble("rms-alpha", Double.valueOf(DEFAULT_RMS_ALPHA));
        this.preEmphasis = (float) speechConfig.getDouble("pre-emphasis", Double.valueOf(DEFAULT_PRE_EMPHASIS));
        this.rmsValue = this.rmsTarget;

        // FFT settings
        int sampleRate = speechConfig.getInteger("sample-rate");
        int windowSize = speechConfig.getInteger("fft-window-size", DEFAULT_FFT_WINDOW_SIZE);
        this.hopLength = (speechConfig.getInteger("fft-hop-length", DEFAULT_FFT_HOP_LENGTH) * sampleRate) / 1000;
        String windowType = speechConfig.getString("fft-window-type", DEFAULT_FFT_WINDOW_TYPE);
        if (windowSize <= 0 || windowSize % 2 != 0) {
            throw new IllegalArgumentException("fft-window-size");
        }
        // hopLength is used as a divisor below and as a seek distance later;
        // fail with a meaningful error instead of an ArithmeticException.
        if (this.hopLength <= 0) {
            throw new IllegalArgumentException("fft-hop-length");
        }

        // number of filter-model frames kept for the encoder, and frame width
        int melLength = ((speechConfig.getInteger("mel-frame-length", DEFAULT_MEL_FRAME_LENGTH) * sampleRate) / 1000)
                / this.hopLength;
        this.melWidth = speechConfig.getInteger("mel-frame-width", DEFAULT_MEL_FRAME_WIDTH);

        if (!FFT_WINDOW_TYPE_HANN.equals(windowType)) {
            throw new IllegalArgumentException("fft-window-type");
        }
        this.fftWindow = hannWindow(windowSize);
        this.fft = new FloatFFT_1D(windowSize);
        this.fftFrame = new float[windowSize];

        // number of encoder frames kept for the detector, and frame width
        int encodeLength =
                ((speechConfig.getInteger("wake-encode-length", DEFAULT_WAKE_ENCODE_LENGTH) * sampleRate) / 1000)
                        / this.hopLength;
        this.encodeWidth = speechConfig.getInteger("wake-encode-width", DEFAULT_WAKE_ENCODE_WIDTH);
        // NOTE(review): the value of wake-state-width is read but never used
        // in this class; the call is kept so any validation done by the
        // config lookup still happens.
        speechConfig.getInteger("wake-state-width", Integer.valueOf(this.encodeWidth));

        // sliding windows; frame/encode windows start out pre-filled
        this.sampleWindow = new RingBuffer(windowSize);
        this.frameWindow = new RingBuffer(melLength * this.melWidth);
        this.encodeWindow = new RingBuffer(encodeLength * this.encodeWidth);
        this.frameWindow.fill(0.0f);
        this.encodeWindow.fill(-1.0f);

        // models; the loader is reset between uses so settings don't carry over
        this.filterModel = loader.setPath(speechConfig.getString("wake-filter-path")).load();
        loader.reset();
        this.encodeModel = loader.setPath(speechConfig.getString("wake-encode-path")).setStatePosition(1).load();
        loader.reset();
        this.detectModel = loader.setPath(speechConfig.getString("wake-detect-path")).load();

        this.posteriorThreshold =
                (float) speechConfig.getDouble("wake-threshold", Double.valueOf(DEFAULT_WAKE_THRESHOLD));
    }

    /**
     * Closes the filter, encode and detect models.
     *
     * <p>All three models are closed even if one of them fails. The first
     * failure is rethrown after the remaining models have been closed, with
     * any later failures attached as suppressed exceptions.
     *
     * @throws Exception the first exception raised while closing a model
     */
    @Override // java.lang.AutoCloseable
    public void close() throws Exception {
        Exception failure = null;
        TensorflowModel[] models = {this.filterModel, this.encodeModel, this.detectModel};
        for (TensorflowModel model : models) {
            try {
                model.close();
            } catch (Exception e) {
                if (failure == null) {
                    failure = e;
                } else {
                    failure.addSuppressed(e);
                }
            }
        }
        if (failure != null) {
            throw failure;
        }
    }

    /**
     * Clears the detection state: empties the sample window, refills the
     * frame and encode windows with their initial values, zeroes the encode
     * model's state buffer, and clears the tracked maximum posterior.
     */
    @Override // io.spokestack.spokestack.SpeechProcessor
    public void reset() {
        this.sampleWindow.reset();
        this.frameWindow.reset().fill(0.0f);
        this.encodeWindow.reset().fill(-1.0f);

        // rewind first so the whole state buffer is zeroed regardless of
        // where its position was left
        ByteBuffer states = this.encodeModel.states();
        states.rewind();
        while (states.hasRemaining()) {
            states.putFloat(0.0f);
        }

        this.posteriorMax = 0.0f;
    }

    /**
     * Processes one audio frame.
     *
     * <p>While the context is not active the frame is fed to the detector.
     * When the context's speech flag drops from true to false, the maximum
     * posterior is traced (unless the context is active) and the detector is
     * reset.
     *
     * @param speechContext shared pipeline state
     * @param byteBuffer    frame of 16-bit samples, read via {@code getShort()}
     * @throws Exception declared by the {@link SpeechProcessor} contract
     */
    @Override // io.spokestack.spokestack.SpeechProcessor
    public void process(SpeechContext speechContext, ByteBuffer byteBuffer) throws Exception {
        boolean speechEnded = this.isSpeech && !speechContext.isSpeech();
        this.isSpeech = speechContext.isSpeech();

        if (!speechContext.isActive()) {
            sample(speechContext, byteBuffer);
        }

        if (speechEnded) {
            // isActive() is re-checked because sample() may have activated
            // the context
            if (!speechContext.isActive()) {
                trace(speechContext);
            }
            reset();
        }
    }

    /**
     * Normalizes each sample of the frame, applies pre-emphasis, and pushes
     * it into the sample window, running the detection pipeline each time the
     * window fills during speech.
     */
    private void sample(SpeechContext speechContext, ByteBuffer byteBuffer) {
        // Track signal energy as an exponentially weighted moving average,
        // only during speech and only when enabled. A frame with no samples
        // is skipped: its RMS is undefined (0/0) and would otherwise turn
        // rmsValue, and every sample normalized with it, into NaN.
        if (speechContext.isSpeech() && this.rmsAlpha > 0.0f && byteBuffer.limit() > 0) {
            this.rmsValue = (this.rmsAlpha * rms(byteBuffer)) + ((1.0f - this.rmsAlpha) * this.rmsValue);
        }

        byteBuffer.rewind();
        while (byteBuffer.hasRemaining()) {
            // scale toward the target RMS and clip to [-1, 1]
            float scaled = ((byteBuffer.getShort() / SAMPLE_SCALE) * this.rmsTarget) / this.rmsValue;
            float clipped = Math.max(-1.0f, Math.min(scaled, 1.0f));

            // first-order pre-emphasis using the previous clipped sample
            float emphasized = clipped - (this.preEmphasis * this.prevSample);
            this.prevSample = clipped;

            this.sampleWindow.write(emphasized);
            if (this.sampleWindow.isFull()) {
                if (speechContext.isSpeech()) {
                    analyze(speechContext);
                }
                // slide the window forward by one hop
                this.sampleWindow.rewind().seek(this.hopLength);
            }
        }
    }

    /**
     * Applies the FFT window to the contents of the sample window, runs the
     * forward real FFT in place, and hands the result to {@link #filter}.
     */
    private void analyze(SpeechContext speechContext) {
        for (int i = 0; i < this.fftFrame.length; i++) {
            this.fftFrame[i] = this.sampleWindow.read() * this.fftWindow[i];
        }
        this.fft.realForward(this.fftFrame);
        filter(speechContext);
    }

    /**
     * Converts the FFT output into the filter model's input, runs the model,
     * and appends its output to the frame window before encoding.
     */
    private void filter(SpeechContext speechContext) {
        // For an even-sized transform, JTransforms' realForward stores the
        // two purely real components (index 0 and index n/2) in slots 0 and 1
        // and the remaining components as interleaved (re, im) pairs. The
        // real-only components are passed through as-is; the complex ones
        // are reduced to their magnitude.
        ByteBuffer input = this.filterModel.inputs(0);
        input.rewind();
        input.putFloat(this.fftFrame[0]);
        int pairs = this.fftFrame.length / 2;
        for (int k = 1; k < pairs; k++) {
            float re = this.fftFrame[k * 2];
            float im = this.fftFrame[(k * 2) + 1];
            input.putFloat((float) Math.sqrt((re * re) + (im * im)));
        }
        input.putFloat(this.fftFrame[1]);

        this.filterModel.run();

        // advance the frame window by one frame width, then append the new
        // frame (presumably dropping the oldest frame - RingBuffer is
        // defined elsewhere)
        this.frameWindow.rewind().seek(this.melWidth);
        ByteBuffer output = this.filterModel.outputs(0);
        while (output.hasRemaining()) {
            this.frameWindow.write(output.getFloat());
        }

        encode(speechContext);
    }

    /**
     * Feeds the whole frame window to the encode model and appends the
     * model's output to the encode window before detection.
     */
    private void encode(SpeechContext speechContext) {
        this.frameWindow.rewind();
        ByteBuffer input = this.encodeModel.inputs(0);
        input.rewind();
        while (!this.frameWindow.isEmpty()) {
            input.putFloat(this.frameWindow.read());
        }

        this.encodeModel.run();

        this.encodeWindow.rewind().seek(this.encodeWidth);
        ByteBuffer output = this.encodeModel.outputs(0);
        while (output.hasRemaining()) {
            this.encodeWindow.write(output.getFloat());
        }

        detect(speechContext);
    }

    /**
     * Feeds the whole encode window to the detect model, records the largest
     * posterior seen so far, and activates the context when the posterior
     * exceeds the threshold.
     */
    private void detect(SpeechContext speechContext) {
        this.encodeWindow.rewind();
        ByteBuffer input = this.detectModel.inputs(0);
        input.rewind();
        while (!this.encodeWindow.isEmpty()) {
            input.putFloat(this.encodeWindow.read());
        }

        this.detectModel.run();

        float posterior = this.detectModel.outputs(0).getFloat();
        if (posterior > this.posteriorMax) {
            this.posteriorMax = posterior;
        }
        if (posterior > this.posteriorThreshold) {
            activate(speechContext);
        }
    }

    /** Traces the current maximum posterior and marks the context active. */
    private void activate(SpeechContext speechContext) {
        trace(speechContext);
        speechContext.setActive(true);
    }

    /** Emits the maximum posterior observed since the last reset. */
    private void trace(SpeechContext speechContext) {
        String message = String.format("wake: %f", this.posteriorMax);
        speechContext.traceInfo(message);
    }

    /**
     * Builds a Hann window: {@code w[i] = sin^2(pi * i / (length - 1))}.
     *
     * @param length number of window coefficients; callers pass a value of
     *               at least 2
     * @return the window coefficients
     */
    private static float[] hannWindow(int length) {
        float[] window = new float[length];
        for (int i = 0; i < length; i++) {
            window[i] = (float) Math.pow(Math.sin((Math.PI * i) / (length - 1)), 2.0d);
        }
        return window;
    }

    /**
     * Computes the root-mean-square of the frame's 16-bit samples after
     * scaling each by {@link #SAMPLE_SCALE}. The buffer is rewound first and
     * is left fully consumed.
     *
     * @param byteBuffer frame of 16-bit samples
     * @return the RMS energy, or 0 if the frame contains no samples
     */
    private static float rms(ByteBuffer byteBuffer) {
        float sumOfSquares = 0.0f;
        int count = 0;
        byteBuffer.rewind();
        while (byteBuffer.hasRemaining()) {
            float value = byteBuffer.getShort() / SAMPLE_SCALE;
            sumOfSquares += value * value;
            count++;
        }
        // avoid 0/0 = NaN on an empty frame
        if (count == 0) {
            return 0.0f;
        }
        return (float) Math.sqrt(sumOfSquares / count);
    }
}
