/*
 * Decompiled with CFR 0.152.
 */
package io.github.jbellis.jvector.vector;

import io.github.jbellis.jvector.pq.LocallyAdaptiveVectorQuantization;
import io.github.jbellis.jvector.vector.MemorySegmentByteSequence;
import io.github.jbellis.jvector.vector.MemorySegmentVectorFloat;
import io.github.jbellis.jvector.vector.types.VectorFloat;
import java.lang.foreign.MemorySegment;
import java.nio.ByteOrder;
import java.util.List;
import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.FloatVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.ShortVector;
import jdk.incubator.vector.Vector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

final class VectorSimdOps {
    /**
     * {@code true} when the preferred {@code int} species is the 512-bit species.
     * {@code lvqDotProduct} reads this flag to pick between its 512-bit and 256-bit kernels.
     * NOTE(review): the name suggests AVX-512, but the check only compares species widths.
     */
    static final boolean HAS_AVX512 = IntVector.SPECIES_PREFERRED == IntVector.SPECIES_512;

    // Package-private no-arg constructor with an empty body (emitted explicitly by the decompiler).
    VectorSimdOps() {
    }

    /**
     * Adds up every element of {@code vector}.
     *
     * <p>Full blocks of the preferred species width are accumulated lane-wise and reduced once;
     * the elements that do not fill a whole block are then added one at a time.
     *
     * @param vector the vector whose elements are summed
     * @return the sum of all elements
     */
    static float sum(MemorySegmentVectorFloat vector) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int length = vector.length();
        final int simdEnd = species.loopBound(length);

        FloatVector lanes = FloatVector.zero(species);
        int index = 0;
        while (index < simdEnd) {
            lanes = lanes.add(FloatVector.fromMemorySegment(species, vector.get(), vector.offset(index), ByteOrder.LITTLE_ENDIAN));
            index += species.length();
        }

        // Scalar tail: index now sits at the first element not covered by a full block.
        float total = lanes.reduceLanes(VectorOperators.ADD);
        for (; index < length; index++) {
            total += vector.get(index);
        }
        return total;
    }

    /**
     * Computes the element-wise sum of a list of vectors.
     *
     * <p>The accumulator is a new {@link MemorySegmentVectorFloat} sized from the first vector;
     * each input is cast to {@link MemorySegmentVectorFloat} and folded in with
     * {@code addInPlace}, which rejects vectors whose length differs from the accumulator's.
     * NOTE(review): presumably a freshly constructed vector starts at zero; confirm.
     *
     * @param vectors the vectors to add; must be non-null and non-empty
     * @return a new vector holding the element-wise sum
     * @throws IllegalArgumentException if {@code vectors} is null or empty
     */
    static VectorFloat<?> sum(List<VectorFloat<?>> vectors) {
        final boolean nothingToAdd = vectors == null || vectors.isEmpty();
        if (nothingToAdd) {
            throw new IllegalArgumentException("Input list cannot be null or empty");
        }

        final MemorySegmentVectorFloat accumulator = new MemorySegmentVectorFloat(vectors.get(0).length());
        vectors.forEach(term -> VectorSimdOps.addInPlace(accumulator, (MemorySegmentVectorFloat) term));
        return accumulator;
    }

    /**
     * Multiplies every element of {@code vector} by {@code multiplier}, in place.
     *
     * @param vector     the vector to modify
     * @param multiplier the factor applied to each element
     */
    static void scale(MemorySegmentVectorFloat vector, float multiplier) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int length = vector.length();
        final int simdEnd = species.loopBound(length);

        int index = 0;
        while (index < simdEnd) {
            // Load a block, scale it, and write it back to the same position.
            final FloatVector block = FloatVector.fromMemorySegment(species, vector.get(), vector.offset(index), ByteOrder.LITTLE_ENDIAN);
            block.mul(multiplier).intoMemorySegment(vector.get(), vector.offset(index), ByteOrder.LITTLE_ENDIAN);
            index += species.length();
        }

        // Scalar tail for the elements that do not fill a whole block.
        for (; index < length; index++) {
            vector.set(index, vector.get(index) * multiplier);
        }
    }

    /**
     * Dot product of one 64-bit block (two floats) taken from each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the lane-wise products
     */
    static float dot64(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        FloatVector a = FloatVector.fromMemorySegment(FloatVector.SPECIES_64, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        // The byte offset into v2's segment must be resolved by v2 itself (this previously used v1.offset).
        FloatVector b = FloatVector.fromMemorySegment(FloatVector.SPECIES_64, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        return a.mul(b).reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of one 128-bit block (four floats) taken from each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the lane-wise products
     */
    static float dot128(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        final FloatVector products = left.mul(right);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of one 256-bit block (eight floats) taken from each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the lane-wise products
     */
    static float dot256(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        final FloatVector products = left.mul(right);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of one block of the preferred species width taken from each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the lane-wise products
     */
    static float dotPreferred(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        final FloatVector products = left.mul(right);
        return products.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Dot product of two whole vectors, starting at element 0 of each.
     * The number of elements processed is {@code v1.length()}; {@code v2}'s length is not checked here.
     *
     * @param v1 first operand
     * @param v2 second operand
     * @return the dot product over {@code v1.length()} elements
     */
    static float dotProduct(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        final int length = v1.length();
        return VectorSimdOps.dotProduct(v1, 0, v2, 0, length);
    }

    /**
     * Dot product over {@code length} elements, dispatching to a kernel sized for the input.
     *
     * <p>Inputs at least as long as the preferred species use the preferred-width kernel.
     * Shorter inputs use the 64-bit kernel below four elements, the 128-bit kernel below eight,
     * and the 256-bit kernel otherwise.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final float result;
        if (length >= FloatVector.SPECIES_PREFERRED.length()) {
            result = VectorSimdOps.dotProductPreferred(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_128.length()) {
            result = VectorSimdOps.dotProduct64(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_256.length()) {
            result = VectorSimdOps.dotProduct128(v1, v1offset, v2, v2offset, length);
        } else {
            result = VectorSimdOps.dotProduct256(v1, v1offset, v2, v2offset, length);
        }
        return result;
    }

    /**
     * Dot product over {@code length} elements using 64-bit (two-float) blocks.
     *
     * <p>A range of exactly one block is handed to {@code dot64}. Otherwise full blocks are
     * accumulated with fused multiply-add and the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct64(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        if (length == species.length()) {
            return VectorSimdOps.dot64(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        for (; pos < simdEnd; pos += species.length()) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            acc = left.fma(right, acc);
        }

        // Scalar tail continues from where the block loop stopped.
        float total = acc.reduceLanes(VectorOperators.ADD);
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product over {@code length} elements using 128-bit (four-float) blocks.
     *
     * <p>A range of exactly one block is handed to {@code dot128}. Otherwise full blocks are
     * accumulated with fused multiply-add and the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct128(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        if (length == species.length()) {
            return VectorSimdOps.dot128(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            acc = left.fma(right, acc);
            pos += species.length();
        }

        // Scalar tail continues from where the block loop stopped.
        float total = acc.reduceLanes(VectorOperators.ADD);
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Dot product over {@code length} elements using 256-bit (eight-float) blocks.
     *
     * <p>A range of exactly one block is handed to {@code dot256}. Otherwise full blocks are
     * accumulated with fused multiply-add and the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProduct256(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        int i;
        if (length == FloatVector.SPECIES_256.length()) {
            return VectorSimdOps.dot256(v1, v1offset, v2, v2offset);
        }
        int vectorizedLength = FloatVector.SPECIES_256.loopBound(length);
        FloatVector sum = FloatVector.zero(FloatVector.SPECIES_256);
        for (i = 0; i < vectorizedLength; i += FloatVector.SPECIES_256.length()) {
            FloatVector a = FloatVector.fromMemorySegment(FloatVector.SPECIES_256, v1.get(), v1.offset(v1offset + i), ByteOrder.LITTLE_ENDIAN);
            // The byte offset into v2's segment must be resolved by v2 itself (this previously used v1.offset).
            FloatVector b = FloatVector.fromMemorySegment(FloatVector.SPECIES_256, v2.get(), v2.offset(v2offset + i), ByteOrder.LITTLE_ENDIAN);
            sum = a.fma(b, sum);
        }
        // Scalar tail continues from where the block loop stopped.
        float res = sum.reduceLanes(VectorOperators.ADD);
        while (i < length) {
            res += v1.get(v1offset + i) * v2.get(v2offset + i);
            ++i;
        }
        return res;
    }

    /**
     * Dot product over {@code length} elements using blocks of the preferred species width.
     *
     * <p>A range of exactly one block is handed to {@code dotPreferred}. Otherwise full blocks are
     * accumulated with fused multiply-add and the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the dot product of the two ranges
     */
    static float dotProductPreferred(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        if (length == species.length()) {
            return VectorSimdOps.dotPreferred(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        for (; pos < simdEnd; pos += species.length()) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            acc = left.fma(right, acc);
        }

        // Scalar tail continues from where the block loop stopped.
        float total = acc.reduceLanes(VectorOperators.ADD);
        for (; pos < length; pos++) {
            total += v1.get(v1offset + pos) * v2.get(v2offset + pos);
        }
        return total;
    }

    /**
     * Cosine similarity of two equal-length vectors.
     *
     * <p>The dot product and both squared magnitudes are accumulated in one pass, and the
     * result is {@code dot / sqrt(|v1|^2 * |v2|^2)}. A zero magnitude is not special-cased.
     *
     * @param v1 first operand
     * @param v2 second operand
     * @return the cosine similarity
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static float cosineSimilarity(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        if (v1.length() != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }

        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int length = v1.length();
        final int simdEnd = species.loopBound(length);

        FloatVector dotLanes = FloatVector.zero(species);
        FloatVector normLanes1 = FloatVector.zero(species);
        FloatVector normLanes2 = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(pos), ByteOrder.LITTLE_ENDIAN);
            dotLanes = dotLanes.add(left.mul(right));
            normLanes1 = left.fma(left, normLanes1);
            normLanes2 = right.fma(right, normLanes2);
            pos += species.length();
        }

        float dot = dotLanes.reduceLanes(VectorOperators.ADD);
        float norm1 = normLanes1.reduceLanes(VectorOperators.ADD);
        float norm2 = normLanes2.reduceLanes(VectorOperators.ADD);

        // Scalar tail for the elements that do not fill a whole block.
        for (; pos < length; pos++) {
            final float x = v1.get(pos);
            final float y = v2.get(pos);
            dot += x * y;
            norm1 += x * x;
            norm2 += y * y;
        }

        final double denominator = Math.sqrt(norm1 * norm2);
        return (float) ((double) dot / denominator);
    }

    /**
     * Cosine similarity of two ranges of {@code length} elements.
     *
     * <p>The dot product and both squared magnitudes are accumulated in one pass, and the
     * result is {@code dot / sqrt(|a|^2 * |b|^2)}. A zero magnitude is not special-cased and
     * no bounds or length validation is done here.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the cosine similarity of the two ranges
     */
    static float cosineSimilarity(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(length);

        FloatVector dotLanes = FloatVector.zero(species);
        FloatVector normLanes1 = FloatVector.zero(species);
        FloatVector normLanes2 = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            dotLanes = dotLanes.add(left.mul(right));
            normLanes1 = left.fma(left, normLanes1);
            normLanes2 = right.fma(right, normLanes2);
            pos += species.length();
        }

        float dot = dotLanes.reduceLanes(VectorOperators.ADD);
        float norm1 = normLanes1.reduceLanes(VectorOperators.ADD);
        float norm2 = normLanes2.reduceLanes(VectorOperators.ADD);

        // Scalar tail for the elements that do not fill a whole block.
        for (; pos < length; pos++) {
            final float x = v1.get(v1offset + pos);
            final float y = v2.get(v2offset + pos);
            dot += x * y;
            norm1 += x * x;
            norm2 += y * y;
        }

        final double denominator = Math.sqrt(norm1 * norm2);
        return (float) ((double) dot / denominator);
    }

    /**
     * Squared Euclidean distance between one 64-bit block (two floats) of each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance64(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        final FloatVector delta = left.sub(right);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between one 128-bit block (four floats) of each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance128(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        final FloatVector delta = left.sub(right);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between one 256-bit block (eight floats) of each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistance256(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        final FloatVector delta = left.sub(right);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between one block of the preferred species width of each vector.
     *
     * @param v1      first operand
     * @param offset1 element index in {@code v1} where the block starts
     * @param v2      second operand
     * @param offset2 element index in {@code v2} where the block starts
     * @return the sum of the squared lane-wise differences
     */
    static float squareDistancePreferred(MemorySegmentVectorFloat v1, int offset1, MemorySegmentVectorFloat v2, int offset2) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(offset1), ByteOrder.LITTLE_ENDIAN);
        final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(offset2), ByteOrder.LITTLE_ENDIAN);
        final FloatVector delta = left.sub(right);
        final FloatVector squared = delta.mul(delta);
        return squared.reduceLanes(VectorOperators.ADD);
    }

    /**
     * Squared Euclidean distance between two whole vectors, starting at element 0 of each.
     * The number of elements processed is {@code v1.length()}; {@code v2}'s length is not checked here.
     *
     * @param v1 first operand
     * @param v2 second operand
     * @return the squared distance over {@code v1.length()} elements
     */
    static float squareDistance(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        final int length = v1.length();
        return VectorSimdOps.squareDistance(v1, 0, v2, 0, length);
    }

    /**
     * Squared Euclidean distance over {@code length} elements, dispatching to a kernel sized
     * for the input.
     *
     * <p>Inputs at least as long as the preferred species use the preferred-width kernel.
     * Shorter inputs use the 64-bit kernel below four elements, the 128-bit kernel below eight,
     * and the 256-bit kernel otherwise.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final float result;
        if (length >= FloatVector.SPECIES_PREFERRED.length()) {
            result = VectorSimdOps.squareDistancePreferred(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_128.length()) {
            result = VectorSimdOps.squareDistance64(v1, v1offset, v2, v2offset, length);
        } else if (length < FloatVector.SPECIES_256.length()) {
            result = VectorSimdOps.squareDistance128(v1, v1offset, v2, v2offset, length);
        } else {
            result = VectorSimdOps.squareDistance256(v1, v1offset, v2, v2offset, length);
        }
        return result;
    }

    /**
     * Squared Euclidean distance over {@code length} elements using 64-bit (two-float) blocks.
     *
     * <p>A range of exactly one block is handed to the four-argument {@code squareDistance64}.
     * Otherwise squared differences of full blocks are accumulated with fused multiply-add and
     * the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance64(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_64;
        if (length == species.length()) {
            return VectorSimdOps.squareDistance64(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        for (; pos < simdEnd; pos += species.length()) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector delta = left.sub(right);
            acc = delta.fma(delta, acc);
        }

        // Scalar tail continues from where the block loop stopped.
        float total = acc.reduceLanes(VectorOperators.ADD);
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Squared Euclidean distance over {@code length} elements using 128-bit (four-float) blocks.
     *
     * <p>A range of exactly one block is handed to the four-argument {@code squareDistance128}.
     * Otherwise squared differences of full blocks are accumulated with fused multiply-add and
     * the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance128(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_128;
        if (length == species.length()) {
            return VectorSimdOps.squareDistance128(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector delta = left.sub(right);
            acc = delta.fma(delta, acc);
            pos += species.length();
        }

        // Scalar tail continues from where the block loop stopped.
        float total = acc.reduceLanes(VectorOperators.ADD);
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Squared Euclidean distance over {@code length} elements using 256-bit (eight-float) blocks.
     *
     * <p>A range of exactly one block is handed to the four-argument {@code squareDistance256}.
     * Otherwise squared differences of full blocks are accumulated with fused multiply-add and
     * the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistance256(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_256;
        if (length == species.length()) {
            return VectorSimdOps.squareDistance256(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        for (; pos < simdEnd; pos += species.length()) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector delta = left.sub(right);
            acc = delta.fma(delta, acc);
        }

        // Scalar tail continues from where the block loop stopped.
        float total = acc.reduceLanes(VectorOperators.ADD);
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Squared Euclidean distance over {@code length} elements using blocks of the preferred
     * species width.
     *
     * <p>A range of exactly one block is handed to the four-argument
     * {@code squareDistancePreferred}. Otherwise squared differences of full blocks are
     * accumulated with fused multiply-add and the leftover elements are handled in scalar code.
     *
     * @param v1       first operand
     * @param v1offset element index in {@code v1} where the range starts
     * @param v2       second operand
     * @param v2offset element index in {@code v2} where the range starts
     * @param length   number of elements to process
     * @return the squared distance between the two ranges
     */
    static float squareDistancePreferred(MemorySegmentVectorFloat v1, int v1offset, MemorySegmentVectorFloat v2, int v2offset, int length) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        if (length == species.length()) {
            return VectorSimdOps.squareDistancePreferred(v1, v1offset, v2, v2offset);
        }

        final int simdEnd = species.loopBound(length);
        FloatVector acc = FloatVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector left = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(v1offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector right = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(v2offset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector delta = left.sub(right);
            acc = delta.fma(delta, acc);
            pos += species.length();
        }

        // Scalar tail continues from where the block loop stopped.
        float total = acc.reduceLanes(VectorOperators.ADD);
        for (; pos < length; pos++) {
            final float gap = v1.get(v1offset + pos) - v2.get(v2offset + pos);
            total += gap * gap;
        }
        return total;
    }

    /**
     * Adds the first 64-bit block (two floats) of {@code v2} into the first block of
     * {@code v1}, in place.
     *
     * <p>All three byte offsets are resolved through {@code offset(0)}, so the reads and the
     * write agree with each other and with the other kernels in this class. The reads
     * previously used a literal byte offset of 0 while the write used {@code v1.offset(0)}.
     *
     * @param v1 the vector that is updated
     * @param v2 the vector that is added
     */
    static void addInPlace64(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        FloatVector a = FloatVector.fromMemorySegment(FloatVector.SPECIES_64, v1.get(), v1.offset(0), ByteOrder.LITTLE_ENDIAN);
        FloatVector b = FloatVector.fromMemorySegment(FloatVector.SPECIES_64, v2.get(), v2.offset(0), ByteOrder.LITTLE_ENDIAN);
        a.add(b).intoMemorySegment(v1.get(), v1.offset(0), ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Adds {@code v2} to {@code v1} element-wise, storing the result in {@code v1}.
     *
     * <p>Two-element vectors are delegated to {@code addInPlace64}; everything else is processed
     * in blocks of the preferred species width followed by a scalar tail.
     *
     * @param v1 the vector that is updated
     * @param v2 the vector that is added
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void addInPlace(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        final int length = v1.length();
        if (length != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }
        if (length == 2) {
            VectorSimdOps.addInPlace64(v1, v2);
            return;
        }

        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(length);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector target = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector addend = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(pos), ByteOrder.LITTLE_ENDIAN);
            target.add(addend).intoMemorySegment(v1.get(), v1.offset(pos), ByteOrder.LITTLE_ENDIAN);
            pos += species.length();
        }

        // Scalar tail for the elements that do not fill a whole block.
        for (; pos < length; pos++) {
            v1.set(pos, v1.get(pos) + v2.get(pos));
        }
    }

    /**
     * Subtracts a range of {@code b} from a range of {@code a} and returns the differences in
     * a newly allocated vector of {@code length} elements. Neither input is modified.
     *
     * @param a       the vector subtracted from
     * @param aOffset element index in {@code a} where the range starts
     * @param b       the vector that is subtracted
     * @param bOffset element index in {@code b} where the range starts
     * @param length  number of elements to process
     * @return a new vector whose element {@code k} is {@code a[aOffset + k] - b[bOffset + k]}
     */
    static VectorFloat<?> sub(MemorySegmentVectorFloat a, int aOffset, MemorySegmentVectorFloat b, int bOffset, int length) {
        final MemorySegmentVectorFloat difference = new MemorySegmentVectorFloat(length);
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(length);

        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector minuend = FloatVector.fromMemorySegment(species, a.get(), a.offset(aOffset + pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector subtrahend = FloatVector.fromMemorySegment(species, b.get(), b.offset(bOffset + pos), ByteOrder.LITTLE_ENDIAN);
            minuend.sub(subtrahend).intoMemorySegment(difference.get(), difference.offset(pos), ByteOrder.LITTLE_ENDIAN);
            pos += species.length();
        }

        // Scalar tail for the elements that do not fill a whole block.
        for (; pos < length; pos++) {
            difference.set(pos, a.get(aOffset + pos) - b.get(bOffset + pos));
        }
        return difference;
    }

    /**
     * Subtracts {@code v2} from {@code v1} element-wise, storing the result in {@code v1}.
     *
     * @param v1 the vector that is updated
     * @param v2 the vector that is subtracted
     * @throws IllegalArgumentException if the vectors differ in length
     */
    static void subInPlace(MemorySegmentVectorFloat v1, MemorySegmentVectorFloat v2) {
        final int length = v1.length();
        if (length != v2.length()) {
            throw new IllegalArgumentException("Vectors must have the same length");
        }

        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(length);
        int pos = 0;
        while (pos < simdEnd) {
            final FloatVector target = FloatVector.fromMemorySegment(species, v1.get(), v1.offset(pos), ByteOrder.LITTLE_ENDIAN);
            final FloatVector subtrahend = FloatVector.fromMemorySegment(species, v2.get(), v2.offset(pos), ByteOrder.LITTLE_ENDIAN);
            target.sub(subtrahend).intoMemorySegment(v1.get(), v1.offset(pos), ByteOrder.LITTLE_ENDIAN);
            pos += species.length();
        }

        // Scalar tail for the elements that do not fill a whole block.
        for (; pos < length; pos++) {
            v1.set(pos, v1.get(pos) - v2.get(pos));
        }
    }

    /**
     * Counts the bit positions in which two {@code long} arrays differ.
     *
     * <p>The iteration range comes from {@code a.length}; {@code b} is indexed over that same
     * range and its length is not validated here. Full blocks are XOR-ed and bit-counted
     * lane-wise, and the remaining words use {@link Long#bitCount(long)}.
     *
     * @param a first bit set, one 64-bit word per element
     * @param b second bit set, one 64-bit word per element
     * @return the number of differing bits over the first {@code a.length} words
     */
    public static int hammingDistance(long[] a, long[] b) {
        final VectorSpecies<Long> species = LongVector.SPECIES_PREFERRED;
        final int simdEnd = species.loopBound(a.length);

        LongVector bitTotals = LongVector.zero(species);
        int pos = 0;
        while (pos < simdEnd) {
            final LongVector wordsA = LongVector.fromArray(species, a, pos);
            final LongVector wordsB = LongVector.fromArray(species, b, pos);
            final LongVector differing = wordsA.lanewise(VectorOperators.XOR, wordsB);
            bitTotals = bitTotals.add(differing.lanewise(VectorOperators.BIT_COUNT));
            pos += species.length();
        }

        // Scalar tail for the words that do not fill a whole block.
        int distance = (int) bitTotals.reduceLanes(VectorOperators.ADD);
        for (; pos < a.length; pos++) {
            distance += Long.bitCount(a[pos] ^ b[pos]);
        }
        return distance;
    }

    /**
     * Returns the largest element of {@code vector}.
     *
     * <p>The running maximum starts at {@code -Float.MAX_VALUE}, so that is the result for an
     * empty vector and the lower bound of any result.
     *
     * @param vector the vector to scan
     * @return the maximum element, never less than {@code -Float.MAX_VALUE}
     */
    public static float max(MemorySegmentVectorFloat vector) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int length = vector.length();
        final int simdEnd = species.loopBound(length);

        FloatVector best = FloatVector.broadcast(species, -Float.MAX_VALUE);
        int pos = 0;
        while (pos < simdEnd) {
            best = best.max(FloatVector.fromMemorySegment(species, vector.get(), vector.offset(pos), ByteOrder.LITTLE_ENDIAN));
            pos += species.length();
        }

        // Scalar tail for the elements that do not fill a whole block.
        float largest = best.reduceLanes(VectorOperators.MAX);
        for (; pos < length; pos++) {
            largest = Math.max(largest, vector.get(pos));
        }
        return largest;
    }

    /**
     * Returns the smallest element of {@code vector}.
     *
     * <p>The running minimum starts at {@code Float.MAX_VALUE}, so that is the result for an
     * empty vector and the upper bound of any result.
     *
     * @param vector the vector to scan
     * @return the minimum element, never greater than {@code Float.MAX_VALUE}
     */
    public static float min(MemorySegmentVectorFloat vector) {
        final VectorSpecies<Float> species = FloatVector.SPECIES_PREFERRED;
        final int length = vector.length();
        final int simdEnd = species.loopBound(length);

        FloatVector best = FloatVector.broadcast(species, Float.MAX_VALUE);
        int pos = 0;
        while (pos < simdEnd) {
            best = best.min(FloatVector.fromMemorySegment(species, vector.get(), vector.offset(pos), ByteOrder.LITTLE_ENDIAN));
            pos += species.length();
        }

        // Scalar tail for the elements that do not fill a whole block.
        float smallest = best.reduceLanes(VectorOperators.MIN);
        for (; pos < length; pos++) {
            smallest = Math.min(smallest, vector.get(pos));
        }
        return smallest;
    }

    /**
     * 256-bit kernel for the dot product between a float vector and a packed (quantized) vector.
     *
     * <p>Accumulates {@code vector[i] * quantized[i]} over all elements, then returns
     * {@code acc * packedVector.scale + vectorSum * packedVector.bias}.
     *
     * <p>Within every group of 64 elements the quantized bytes are consumed in this order:
     * low byte of each int lane of fragment A, low byte of fragment B, then the next byte of A,
     * the next byte of B, and so on.
     * NOTE(review): this presumably mirrors the layout written by the LVQ encoder; verify there.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} must be a {@link MemorySegmentByteSequence}
     * @param vectorSum    value multiplied by {@code packedVector.bias}; presumably the sum of
     *                     {@code vector}'s elements — confirm against the caller
     * @return the scaled and biased dot product
     */
    private static float lvqDotProduct256(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, float vectorSum) {
        int i;
        int length = vector.length();
        int vectorizedLength = FloatVector.SPECIES_256.loopBound(length);
        FloatVector sum = FloatVector.zero((VectorSpecies)FloatVector.SPECIES_256);
        // Throws ClassCastException if the packed bytes are not segment-backed.
        MemorySegmentByteSequence sequenceBacking = (MemorySegmentByteSequence)packedVector.bytes;
        // Both fragments are assigned on the first iteration (i == 0 takes the i % 64 == 0 branch).
        IntVector packedFragmentA = null;
        IntVector packedFragmentB = null;
        // Each iteration consumes 8 floats (one 256-bit float vector).
        for (i = 0; i < vectorizedLength; i += FloatVector.SPECIES_256.length()) {
            FloatVector lvqFloats;
            FloatVector fullFloats = FloatVector.fromMemorySegment((VectorSpecies)FloatVector.SPECIES_256, (MemorySegment)vector.get(), (long)vector.offset(i), (ByteOrder)ByteOrder.LITTLE_ENDIAN);
            if (i % 64 == 0) {
                // Start of a 64-element group: load 32 bytes at byte offset i and 32 more at i + 32,
                // each viewed as 8 ints.
                // NOTE(review): this reads 64 bytes even when fewer than 64 elements remain; it assumes
                // the backing segment is large enough — confirm the packed bytes are padded.
                ByteVector tempBytes = ByteVector.fromMemorySegment((VectorSpecies)ByteVector.SPECIES_256, (MemorySegment)sequenceBacking.get(), (long)i, (ByteOrder)ByteOrder.LITTLE_ENDIAN);
                packedFragmentA = tempBytes.reinterpretAsInts();
                tempBytes = ByteVector.fromMemorySegment((VectorSpecies)ByteVector.SPECIES_256, (MemorySegment)sequenceBacking.get(), (long)(i + 32), (ByteOrder)ByteOrder.LITTLE_ENDIAN);
                packedFragmentB = tempBytes.reinterpretAsInts();
                // Low byte of each int lane of A, converted to float.
                lvqFloats = (FloatVector)packedFragmentA.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            } else if (i % 16 == 0) {
                // i is 16, 32 or 48 within the group: shift both fragments down one byte, then use A.
                packedFragmentA = packedFragmentA.lanewise(VectorOperators.LSHR, 8);
                packedFragmentB = packedFragmentB.lanewise(VectorOperators.LSHR, 8);
                lvqFloats = (FloatVector)packedFragmentA.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            } else {
                // i is 8, 24, 40 or 56 within the group: use the current low byte of B.
                lvqFloats = (FloatVector)packedFragmentB.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            }
            sum = fullFloats.fma((Vector)lvqFloats, (Vector)sum);
        }
        float res = sum.reduceLanes(VectorOperators.ADD);
        // Scalar tail: i continues from the end of the block loop.
        while (i < length) {
            res += vector.get(i) * (float)packedVector.getQuantized(i);
            ++i;
        }
        // Apply the quantization scale and the bias term.
        res = res * packedVector.scale + vectorSum * packedVector.bias;
        return res;
    }

    /**
     * 512-bit kernel for the dot product between a float vector and a packed (quantized) vector.
     *
     * <p>Accumulates {@code vector[i] * quantized[i]} over all elements, then returns
     * {@code acc * packedVector.scale + vectorSum * packedVector.bias}.
     *
     * <p>Within every group of 64 elements, 64 packed bytes are loaded once as 16 ints. The low
     * byte of each int lane feeds the first 16 elements, and each following block of 16 elements
     * uses the next byte up.
     * NOTE(review): this presumably mirrors the layout written by the LVQ encoder; verify there.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} must be a {@link MemorySegmentByteSequence}
     * @param vectorSum    value multiplied by {@code packedVector.bias}; presumably the sum of
     *                     {@code vector}'s elements — confirm against the caller
     * @return the scaled and biased dot product
     */
    private static float lvqDotProduct512(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, float vectorSum) {
        int i;
        int length = vector.length();
        int vectorizedLength = FloatVector.SPECIES_512.loopBound(length);
        FloatVector sum = FloatVector.zero((VectorSpecies)FloatVector.SPECIES_512);
        // Throws ClassCastException if the packed bytes are not segment-backed.
        MemorySegmentByteSequence sequenceBacking = (MemorySegmentByteSequence)packedVector.bytes;
        // Assigned on the first iteration (i == 0 takes the i % 64 == 0 branch).
        IntVector packedFragment = null;
        // Each iteration consumes 16 floats (one 512-bit float vector).
        for (i = 0; i < vectorizedLength; i += FloatVector.SPECIES_512.length()) {
            FloatVector fullFloats = FloatVector.fromMemorySegment((VectorSpecies)FloatVector.SPECIES_512, (MemorySegment)vector.get(), (long)vector.offset(i), (ByteOrder)ByteOrder.LITTLE_ENDIAN);
            if (i % 64 == 0) {
                // Start of a 64-element group: load 64 bytes at byte offset i, viewed as 16 ints.
                // NOTE(review): this reads 64 bytes even when fewer than 64 elements remain; it assumes
                // the backing segment is large enough — confirm the packed bytes are padded.
                ByteVector byteVector = ByteVector.fromMemorySegment((VectorSpecies)ByteVector.SPECIES_512, (MemorySegment)sequenceBacking.get(), (long)i, (ByteOrder)ByteOrder.LITTLE_ENDIAN);
                packedFragment = byteVector.reinterpretAsInts();
            } else {
                // Move the next byte of every int lane into the low position.
                packedFragment = packedFragment.lanewise(VectorOperators.LSHR, 8);
            }
            // Low byte of each int lane, converted to float.
            FloatVector lvqFloats = (FloatVector)packedFragment.lanewise((VectorOperators.Binary)VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            sum = fullFloats.fma((Vector)lvqFloats, (Vector)sum);
        }
        float res = sum.reduceLanes(VectorOperators.ADD);
        // Scalar tail: i continues from the end of the block loop.
        while (i < length) {
            res += vector.get(i) * (float)packedVector.getQuantized(i);
            ++i;
        }
        // Apply the quantization scale and the bias term.
        res = res * packedVector.scale + vectorSum * packedVector.bias;
        return res;
    }

    /**
     * Dot product of a full-precision vector with an LVQ-packed vector.
     *
     * <p>Dispatches to the 512-bit kernel when {@code HAS_AVX512} is set and to the 256-bit
     * kernel otherwise.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand
     * @param vectorSum    passed through unchanged to the selected kernel
     * @return the value computed by the selected kernel
     */
    public static float lvqDotProduct(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, float vectorSum) {
        return HAS_AVX512
                ? lvqDotProduct512(vector, packedVector, vectorSum)
                : lvqDotProduct256(vector, packedVector, vectorSum);
    }

    /**
     * Squared Euclidean distance between a full-precision vector and an LVQ-packed vector,
     * using 256-bit SIMD.
     *
     * <p>Packed bytes are consumed in groups of 64, held as two 32-byte fragments reinterpreted
     * as ints. The fragments are reloaded at every index that is a multiple of 64 and shifted
     * right by 8 bits at every other multiple of 16. Steps that start on a multiple of 16 read
     * the low byte of the first fragment; the remaining steps read the low byte of the second.
     * Each quantized value is dequantized as {@code q * scale + bias} before the subtraction.
     * Elements beyond the SIMD loop bound go through {@code packedVector.getDequantized(int)}.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} field is cast to
     *                     {@link MemorySegmentByteSequence}
     * @return the sum of squared element-wise differences
     */
    private static float lvqSquareL2Distance256(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector) {
        final int dimension = vector.length();
        final int simdBound = FloatVector.SPECIES_256.loopBound(dimension);
        final int step = FloatVector.SPECIES_256.length();
        final MemorySegmentByteSequence packedBytes = (MemorySegmentByteSequence) packedVector.bytes;

        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_256);
        IntVector firstHalf = null;
        IntVector secondHalf = null;
        int idx = 0;
        while (idx < simdBound) {
            FloatVector query = FloatVector.fromMemorySegment(FloatVector.SPECIES_256, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            final boolean readsFirstHalf = idx % 16 == 0;
            if (idx % 64 == 0) {
                // New 64-byte group: load it as two 32-byte fragments.
                firstHalf = ByteVector.fromMemorySegment(ByteVector.SPECIES_256, packedBytes.get(), idx, ByteOrder.LITTLE_ENDIAN).reinterpretAsInts();
                secondHalf = ByteVector.fromMemorySegment(ByteVector.SPECIES_256, packedBytes.get(), idx + 32, ByteOrder.LITTLE_ENDIAN).reinterpretAsInts();
            } else if (readsFirstHalf) {
                // Both fragments were consumed at the current byte position: move to the next byte.
                firstHalf = firstHalf.lanewise(VectorOperators.LSHR, 8);
                secondHalf = secondHalf.lanewise(VectorOperators.LSHR, 8);
            }
            IntVector fragment = readsFirstHalf ? firstHalf : secondHalf;
            FloatVector quantized = (FloatVector) fragment.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector dequantized = quantized.fma(packedVector.scale, packedVector.bias);
            FloatVector delta = query.sub(dequantized);
            acc = delta.fma(delta, acc);
            idx += step;
        }

        float distance = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < dimension; idx++) {
            float delta = vector.get(idx) - packedVector.getDequantized(idx);
            distance += delta * delta;
        }
        return distance;
    }

    /**
     * Squared Euclidean distance between a full-precision vector and an LVQ-packed vector,
     * using 512-bit SIMD.
     *
     * <p>A 64-byte window of packed values is loaded at every index that is a multiple of 64 and
     * reinterpreted as ints; later steps shift the ints right by 8 bits so the low byte always
     * holds the quantized values for the current step. Each quantized value is dequantized as
     * {@code q * scale + bias} before the subtraction. Elements beyond the SIMD loop bound go
     * through {@code packedVector.getDequantized(int)}.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} field is cast to
     *                     {@link MemorySegmentByteSequence}
     * @return the sum of squared element-wise differences
     */
    private static float lvqSquareL2Distance512(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector) {
        final int dimension = vector.length();
        final int simdBound = FloatVector.SPECIES_512.loopBound(dimension);
        final int step = FloatVector.SPECIES_512.length();
        final MemorySegmentByteSequence packedBytes = (MemorySegmentByteSequence) packedVector.bytes;

        FloatVector acc = FloatVector.zero(FloatVector.SPECIES_512);
        IntVector packedInts = null;
        int idx = 0;
        while (idx < simdBound) {
            FloatVector query = FloatVector.fromMemorySegment(FloatVector.SPECIES_512, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            if (idx % 64 != 0) {
                // Same 64-byte window: expose the next byte of every int lane.
                packedInts = packedInts.lanewise(VectorOperators.LSHR, 8);
            } else {
                packedInts = ByteVector.fromMemorySegment(ByteVector.SPECIES_512, packedBytes.get(), idx, ByteOrder.LITTLE_ENDIAN).reinterpretAsInts();
            }
            FloatVector quantized = (FloatVector) packedInts.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector dequantized = quantized.fma(packedVector.scale, packedVector.bias);
            FloatVector delta = query.sub(dequantized);
            acc = delta.fma(delta, acc);
            idx += step;
        }

        float distance = acc.reduceLanes(VectorOperators.ADD);
        for (; idx < dimension; idx++) {
            float delta = vector.get(idx) - packedVector.getDequantized(idx);
            distance += delta * delta;
        }
        return distance;
    }

    /**
     * Squared Euclidean distance between a full-precision vector and an LVQ-packed vector.
     *
     * <p>Dispatches to the 512-bit kernel when {@code HAS_AVX512} is set and to the 256-bit
     * kernel otherwise.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand
     * @return the value computed by the selected kernel
     */
    public static float lvqSquareL2Distance(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector) {
        return HAS_AVX512
                ? lvqSquareL2Distance512(vector, packedVector)
                : lvqSquareL2Distance256(vector, packedVector);
    }

    /**
     * Cosine similarity between a full-precision vector and an LVQ-packed vector, using
     * 256-bit SIMD.
     *
     * <p>Every packed element is reconstructed as {@code q * scale + bias + centroid[i]} before
     * it contributes to the dot product and to its own squared magnitude. Packed bytes are
     * consumed in groups of 64, held as two 32-byte fragments reinterpreted as ints: both are
     * reloaded at multiples of 64 and shifted right by 8 bits at the other multiples of 16.
     * Steps starting on a multiple of 16 read the first fragment's low byte, the remaining
     * steps read the second's. Elements beyond the SIMD loop bound go through
     * {@code packedVector.getDequantized(int)}.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} field is cast to
     *                     {@link MemorySegmentByteSequence}
     * @param centroid     vector added element-wise to the dequantized values
     * @return {@code dot / sqrt(|vector|^2 * |reconstructed|^2)}; no guard is applied for a
     *         zero magnitude
     */
    private static float lvqCosine256(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, MemorySegmentVectorFloat centroid) {
        final int dimension = vector.length();
        final int simdBound = FloatVector.SPECIES_256.loopBound(dimension);
        final int step = FloatVector.SPECIES_256.length();
        final MemorySegmentByteSequence packedBytes = (MemorySegmentByteSequence) packedVector.bytes;

        IntVector firstHalf = null;
        IntVector secondHalf = null;
        FloatVector dotAcc = FloatVector.zero(FloatVector.SPECIES_256);
        FloatVector queryNormAcc = FloatVector.zero(FloatVector.SPECIES_256);
        FloatVector lvqNormAcc = FloatVector.zero(FloatVector.SPECIES_256);
        int idx = 0;
        while (idx < simdBound) {
            FloatVector query = FloatVector.fromMemorySegment(FloatVector.SPECIES_256, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector center = FloatVector.fromMemorySegment(FloatVector.SPECIES_256, centroid.get(), centroid.offset(idx), ByteOrder.LITTLE_ENDIAN);
            final boolean readsFirstHalf = idx % 16 == 0;
            if (idx % 64 == 0) {
                // New 64-byte group: load it as two 32-byte fragments.
                firstHalf = ByteVector.fromMemorySegment(ByteVector.SPECIES_256, packedBytes.get(), idx, ByteOrder.LITTLE_ENDIAN).reinterpretAsInts();
                secondHalf = ByteVector.fromMemorySegment(ByteVector.SPECIES_256, packedBytes.get(), idx + 32, ByteOrder.LITTLE_ENDIAN).reinterpretAsInts();
            } else if (readsFirstHalf) {
                // Both fragments were consumed at the current byte position: move to the next byte.
                firstHalf = firstHalf.lanewise(VectorOperators.LSHR, 8);
                secondHalf = secondHalf.lanewise(VectorOperators.LSHR, 8);
            }
            IntVector fragment = readsFirstHalf ? firstHalf : secondHalf;
            FloatVector quantized = (FloatVector) fragment.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector reconstructed = quantized.fma(packedVector.scale, packedVector.bias).add(center);
            dotAcc = query.fma(reconstructed, dotAcc);
            queryNormAcc = query.fma(query, queryNormAcc);
            lvqNormAcc = reconstructed.fma(reconstructed, lvqNormAcc);
            idx += step;
        }

        float dot = dotAcc.reduceLanes(VectorOperators.ADD);
        float queryNorm = queryNormAcc.reduceLanes(VectorOperators.ADD);
        float lvqNorm = lvqNormAcc.reduceLanes(VectorOperators.ADD);
        for (; idx < dimension; idx++) {
            float reconstructed = packedVector.getDequantized(idx) + centroid.get(idx);
            float original = vector.get(idx);
            dot += original * reconstructed;
            queryNorm += original * original;
            lvqNorm += reconstructed * reconstructed;
        }
        // The product of the magnitudes is formed in float and only then widened for the sqrt.
        double denominator = Math.sqrt(queryNorm * lvqNorm);
        return (float) (dot / denominator);
    }

    /**
     * Cosine similarity between a full-precision vector and an LVQ-packed vector, using
     * 512-bit SIMD.
     *
     * <p>Every packed element is reconstructed as {@code q * scale + bias + centroid[i]} before
     * it contributes to the dot product and to its own squared magnitude. A 64-byte window of
     * packed values is loaded at every index that is a multiple of 64 and reinterpreted as ints;
     * later steps shift the ints right by 8 bits so the low byte holds the current quantized
     * values. Elements beyond the SIMD loop bound go through
     * {@code packedVector.getDequantized(int)}.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand; its {@code bytes} field is cast to
     *                     {@link MemorySegmentByteSequence}
     * @param centroid     vector added element-wise to the dequantized values
     * @return {@code dot / sqrt(|vector|^2 * |reconstructed|^2)}; no guard is applied for a
     *         zero magnitude
     */
    private static float lvqCosine512(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, MemorySegmentVectorFloat centroid) {
        final int dimension = vector.length();
        final int simdBound = FloatVector.SPECIES_512.loopBound(dimension);
        final int step = FloatVector.SPECIES_512.length();
        final MemorySegmentByteSequence packedBytes = (MemorySegmentByteSequence) packedVector.bytes;

        IntVector packedInts = null;
        FloatVector dotAcc = FloatVector.zero(FloatVector.SPECIES_512);
        FloatVector queryNormAcc = FloatVector.zero(FloatVector.SPECIES_512);
        FloatVector lvqNormAcc = FloatVector.zero(FloatVector.SPECIES_512);
        int idx = 0;
        while (idx < simdBound) {
            FloatVector query = FloatVector.fromMemorySegment(FloatVector.SPECIES_512, vector.get(), vector.offset(idx), ByteOrder.LITTLE_ENDIAN);
            FloatVector center = FloatVector.fromMemorySegment(FloatVector.SPECIES_512, centroid.get(), centroid.offset(idx), ByteOrder.LITTLE_ENDIAN);
            if (idx % 64 != 0) {
                // Same 64-byte window: expose the next byte of every int lane.
                packedInts = packedInts.lanewise(VectorOperators.LSHR, 8);
            } else {
                packedInts = ByteVector.fromMemorySegment(ByteVector.SPECIES_512, packedBytes.get(), idx, ByteOrder.LITTLE_ENDIAN).reinterpretAsInts();
            }
            FloatVector quantized = (FloatVector) packedInts.lanewise(VectorOperators.AND, 255).convert(VectorOperators.I2F, 0);
            FloatVector reconstructed = quantized.fma(packedVector.scale, packedVector.bias).add(center);
            dotAcc = query.fma(reconstructed, dotAcc);
            queryNormAcc = query.fma(query, queryNormAcc);
            lvqNormAcc = reconstructed.fma(reconstructed, lvqNormAcc);
            idx += step;
        }

        float dot = dotAcc.reduceLanes(VectorOperators.ADD);
        float queryNorm = queryNormAcc.reduceLanes(VectorOperators.ADD);
        float lvqNorm = lvqNormAcc.reduceLanes(VectorOperators.ADD);
        for (; idx < dimension; idx++) {
            float reconstructed = packedVector.getDequantized(idx) + centroid.get(idx);
            float original = vector.get(idx);
            dot += original * reconstructed;
            queryNorm += original * original;
            lvqNorm += reconstructed * reconstructed;
        }
        // The product of the magnitudes is formed in float and only then widened for the sqrt.
        double denominator = Math.sqrt(queryNorm * lvqNorm);
        return (float) (dot / denominator);
    }

    /**
     * Cosine similarity between a full-precision vector and an LVQ-packed vector offset by a
     * centroid.
     *
     * <p>Dispatches to the 512-bit kernel when {@code HAS_AVX512} is set and to the 256-bit
     * kernel otherwise.
     *
     * @param vector       the full-precision operand
     * @param packedVector the quantized operand
     * @param centroid     passed through unchanged to the selected kernel
     * @return the value computed by the selected kernel
     */
    public static float lvqCosine(MemorySegmentVectorFloat vector, LocallyAdaptiveVectorQuantization.PackedVector packedVector, MemorySegmentVectorFloat centroid) {
        return HAS_AVX512
                ? lvqCosine512(vector, packedVector, centroid)
                : lvqCosine256(vector, packedVector, centroid);
    }

    /**
     * Quantizes per-codebook partial sums to 16-bit values.
     *
     * <p>{@code partialSums} is treated as {@code partialBestDistances.length()} consecutive
     * codebooks of equal size. For codebook {@code i}, every entry is rebased by
     * {@code partialBestDistances.get(i)}, divided by {@code delta}, clamped to
     * {@code [0, 65535]} and narrowed to a {@code short}, which is written little-endian to
     * {@code partialQuantizedSums} at the same element index. Because the narrowing keeps the
     * low 16 bits, results above {@code Short.MAX_VALUE} are negative when read as a signed
     * {@code short} (presumably consumers read them as unsigned; confirm with callers).
     *
     * <p>The scalar tail applies the same lower clamp at zero as the SIMD path, so the result
     * for an element does not depend on whether it falls in the vectorized part of a codebook
     * or in the remainder.
     *
     * @param delta                quantization step the rebased sums are divided by
     * @param partialSums          the values to quantize, codebook after codebook
     * @param partialBestDistances one base value per codebook, subtracted before scaling
     * @param partialQuantizedSums destination for the 16-bit results
     * @throws ArithmeticException if {@code partialBestDistances} is empty
     */
    public static void quantizePartialSums(float delta, MemorySegmentVectorFloat partialSums, MemorySegmentVectorFloat partialBestDistances, MemorySegmentByteSequence partialQuantizedSums) {
        int codebookSize = partialSums.length() / partialBestDistances.length();
        int codebookCount = partialBestDistances.length();

        // Loop-invariant values, computed once instead of per codebook / per SIMD step.
        int vectorizedLength = FloatVector.SPECIES_512.loopBound(codebookSize);
        FloatVector lowerBound = FloatVector.zero(FloatVector.SPECIES_512);
        FloatVector upperBound = FloatVector.broadcast(FloatVector.SPECIES_512, 65535.0f);

        for (int i = 0; i < codebookCount; ++i) {
            int base = i * codebookSize;
            float codebookBest = partialBestDistances.get(i);
            FloatVector codebookBestVector = FloatVector.broadcast(FloatVector.SPECIES_512, codebookBest);

            int j = 0;
            for (; j < vectorizedLength; j += FloatVector.SPECIES_512.length()) {
                FloatVector partialSumVector = FloatVector.fromMemorySegment(FloatVector.SPECIES_512, partialSums.get(), partialSums.offset(base + j), ByteOrder.LITTLE_ENDIAN);
                FloatVector quantized = partialSumVector.sub(codebookBestVector).div(delta);
                quantized = quantized.max(lowerBound).min(upperBound);
                // 16 floats narrow to 16 shorts, which fit a 256-bit short vector.
                ShortVector quantizedShorts = (ShortVector) quantized.convertShape(VectorOperators.F2S, ShortVector.SPECIES_256, 0);
                // The destination offset is in bytes: two bytes per quantized element.
                quantizedShorts.intoMemorySegment(partialQuantizedSums.get(), (long) (2 * (base + j)), ByteOrder.LITTLE_ENDIAN);
            }
            for (; j < codebookSize; ++j) {
                float scaled = (partialSums.get(base + j) - codebookBest) / delta;
                // Clamp on both sides, mirroring the SIMD path; a negative value would otherwise
                // narrow to a negative short, i.e. a very large value when read as unsigned.
                short quantized = (short) Math.min(Math.max(scaled, 0.0f), 65535.0f);
                partialQuantizedSums.setLittleEndianShort(base + j, quantized);
            }
        }
    }
}

