/*
 * Decompiled with CFR 0.152.
 */
package dev.ludovic.netlib.blas;

import dev.ludovic.netlib.JavaBLAS;
import dev.ludovic.netlib.blas.Java8BLAS;

public class Java11BLAS
extends Java8BLAS {
    // Single shared instance, created during class initialization and handed out by getInstance().
    private static final Java11BLAS instance = new Java11BLAS();

    /**
     * Creates a new instance. The constructor is protected, so only subclasses and code in the
     * same package can call it directly; other callers obtain the shared instance through
     * {@link #getInstance()}.
     */
    protected Java11BLAS() {
    }

    /**
     * Returns the shared {@code Java11BLAS} instance, typed as {@link JavaBLAS}.
     *
     * @return the same instance on every call
     */
    public static JavaBLAS getInstance() {
        return instance;
    }

    /**
     * Adds a scaled vector to another in place: for each of {@code n} element pairs,
     * {@code y = alpha * x + y}, evaluated with one {@link Math#fma} call per element.
     *
     * <p>With a non-negative stride the walk starts at the offset and advances by the stride.
     * With a negative stride it starts at {@code offset + (n - 1) * -inc} and steps back toward
     * the offset. A stride of zero keeps re-using the element at the offset, so {@code n}
     * updates are still performed.
     *
     * @param n       number of element pairs to process; nothing happens when {@code n <= 0}
     * @param alpha   scalar multiplied with each element of {@code x}
     * @param x       source array; only read
     * @param offsetx base index of the vector inside {@code x}
     * @param incx    stride between consecutive elements of {@code x}
     * @param y       array updated in place
     * @param offsety base index of the vector inside {@code y}
     * @param incy    stride between consecutive elements of {@code y}
     */
    @Override
    protected void daxpyK(int n, double alpha, double[] x, int offsetx, int incx, double[] y, int offsety, int incy) {
        if (incx == 1 && incy == 1) {
            // Contiguous fast path: both vectors advance in lock-step, so one index suffices.
            for (int i = 0; i < n; ++i) {
                y[offsety + i] = Math.fma(alpha, x[offsetx + i], y[offsety + i]);
            }
            return;
        }
        int ix = incx < 0 ? (n - 1) * -incx : 0;
        int iy = incy < 0 ? (n - 1) * -incy : 0;
        // Count iterations instead of comparing against n * inc: a zero stride then still
        // performs n updates, and the loop bound cannot overflow for large n * inc.
        for (int done = 0; done < n; ++done) {
            y[offsety + iy] = Math.fma(alpha, x[offsetx + ix], y[offsety + iy]);
            ix += incx;
            iy += incy;
        }
    }

    /**
     * Single-precision scaled vector addition in place: for each of {@code n} element pairs,
     * {@code y = alpha * x + y}, evaluated with one {@link Math#fma} call per element.
     *
     * <p>With a non-negative stride the walk starts at the offset and advances by the stride.
     * With a negative stride it starts at {@code offset + (n - 1) * -inc} and steps back toward
     * the offset. A stride of zero keeps re-using the element at the offset, so {@code n}
     * updates are still performed.
     *
     * @param n       number of element pairs to process; nothing happens when {@code n <= 0}
     * @param alpha   scalar multiplied with each element of {@code x}
     * @param x       source array; only read
     * @param offsetx base index of the vector inside {@code x}
     * @param incx    stride between consecutive elements of {@code x}
     * @param y       array updated in place
     * @param offsety base index of the vector inside {@code y}
     * @param incy    stride between consecutive elements of {@code y}
     */
    @Override
    protected void saxpyK(int n, float alpha, float[] x, int offsetx, int incx, float[] y, int offsety, int incy) {
        if (incx == 1 && incy == 1) {
            // Contiguous fast path: both vectors advance in lock-step, so one index suffices.
            for (int i = 0; i < n; ++i) {
                y[offsety + i] = Math.fma(alpha, x[offsetx + i], y[offsety + i]);
            }
            return;
        }
        int ix = incx < 0 ? (n - 1) * -incx : 0;
        int iy = incy < 0 ? (n - 1) * -incy : 0;
        // Count iterations instead of comparing against n * inc: a zero stride then still
        // performs n updates, and the loop bound cannot overflow for large n * inc.
        for (int done = 0; done < n; ++done) {
            y[offsety + iy] = Math.fma(alpha, x[offsetx + ix], y[offsety + iy]);
            ix += incx;
            iy += incy;
        }
    }

    /**
     * Returns the dot product of {@code n} element pairs taken from {@code x} and {@code y},
     * accumulating every product with {@link Math#fma}.
     *
     * <p>When both strides are 1 the bulk of the vector is consumed four elements at a time into
     * four independent partial sums, which are added together before the remaining elements are
     * folded in one by one. Otherwise the elements are visited with the given strides; a negative
     * stride starts at {@code offset + (n - 1) * -inc} and steps back toward the offset, and a
     * zero stride keeps re-using the element at the offset.
     *
     * @param n       number of element pairs; the result is {@code 0.0} when {@code n <= 0}
     * @param x       first vector's backing array
     * @param offsetx base index of the vector inside {@code x}
     * @param incx    stride between consecutive elements of {@code x}
     * @param y       second vector's backing array
     * @param offsety base index of the vector inside {@code y}
     * @param incy    stride between consecutive elements of {@code y}
     * @return the accumulated sum of products
     */
    @Override
    protected double ddotK(int n, double[] x, int offsetx, int incx, double[] y, int offsety, int incy) {
        double sum = 0.0;
        if (incx == 1 && incy == 1) {
            // loopBound is inherited and not visible here; presumably it rounds n down to a
            // multiple of 4. It is evaluated once, assuming it is a pure function of its arguments.
            final int blockEnd = this.loopBound(n, 4);
            double acc0 = 0.0;
            double acc1 = 0.0;
            double acc2 = 0.0;
            double acc3 = 0.0;
            int i = 0;
            for (; i < blockEnd; i += 4) {
                acc0 = Math.fma(x[offsetx + i], y[offsety + i], acc0);
                acc1 = Math.fma(x[offsetx + i + 1], y[offsety + i + 1], acc1);
                acc2 = Math.fma(x[offsetx + i + 2], y[offsety + i + 2], acc2);
                acc3 = Math.fma(x[offsetx + i + 3], y[offsety + i + 3], acc3);
            }
            sum += acc0 + acc1 + acc2 + acc3;
            // Elements left over after the blocked loop.
            for (; i < n; ++i) {
                sum = Math.fma(x[offsetx + i], y[offsety + i], sum);
            }
            return sum;
        }
        int ix = incx < 0 ? (n - 1) * -incx : 0;
        int iy = incy < 0 ? (n - 1) * -incy : 0;
        // Count iterations instead of comparing against n * inc: a zero stride then still
        // contributes n products, and the loop bound cannot overflow for large n * inc.
        for (int done = 0; done < n; ++done) {
            sum = Math.fma(x[offsetx + ix], y[offsety + iy], sum);
            ix += incx;
            iy += incy;
        }
        return sum;
    }

    /**
     * Returns the single-precision dot product of {@code n} element pairs taken from {@code x}
     * and {@code y}, accumulating every product with {@link Math#fma}.
     *
     * <p>When both strides are 1 the bulk of the vector is consumed four elements at a time into
     * four independent partial sums, which are added together before the remaining elements are
     * folded in one by one. Otherwise the elements are visited with the given strides; a negative
     * stride starts at {@code offset + (n - 1) * -inc} and steps back toward the offset, and a
     * zero stride keeps re-using the element at the offset.
     *
     * @param n       number of element pairs; the result is {@code 0.0f} when {@code n <= 0}
     * @param x       first vector's backing array
     * @param offsetx base index of the vector inside {@code x}
     * @param incx    stride between consecutive elements of {@code x}
     * @param y       second vector's backing array
     * @param offsety base index of the vector inside {@code y}
     * @param incy    stride between consecutive elements of {@code y}
     * @return the accumulated sum of products
     */
    @Override
    protected float sdotK(int n, float[] x, int offsetx, int incx, float[] y, int offsety, int incy) {
        float sum = 0.0f;
        if (incx == 1 && incy == 1) {
            // loopBound is inherited and not visible here; presumably it rounds n down to a
            // multiple of 4. It is evaluated once, assuming it is a pure function of its arguments.
            final int blockEnd = this.loopBound(n, 4);
            float acc0 = 0.0f;
            float acc1 = 0.0f;
            float acc2 = 0.0f;
            float acc3 = 0.0f;
            int i = 0;
            for (; i < blockEnd; i += 4) {
                acc0 = Math.fma(x[offsetx + i], y[offsety + i], acc0);
                acc1 = Math.fma(x[offsetx + i + 1], y[offsety + i + 1], acc1);
                acc2 = Math.fma(x[offsetx + i + 2], y[offsety + i + 2], acc2);
                acc3 = Math.fma(x[offsetx + i + 3], y[offsety + i + 3], acc3);
            }
            sum += acc0 + acc1 + acc2 + acc3;
            // Elements left over after the blocked loop.
            for (; i < n; ++i) {
                sum = Math.fma(x[offsetx + i], y[offsety + i], sum);
            }
            return sum;
        }
        int ix = incx < 0 ? (n - 1) * -incx : 0;
        int iy = incy < 0 ? (n - 1) * -incy : 0;
        // Count iterations instead of comparing against n * inc: a zero stride then still
        // contributes n products, and the loop bound cannot overflow for large n * inc.
        for (int done = 0; done < n; ++done) {
            sum = Math.fma(x[offsetx + ix], y[offsety + iy], sum);
            ix += incx;
            iy += incy;
        }
        return sum;
    }

    /**
     * Returns {@code sb} plus the dot product of {@code n} element pairs taken from the float
     * arrays {@code x} and {@code y}. Every element is widened to {@code double}, the products
     * are accumulated in {@code double} with {@link Math#fma}, and the total is narrowed to
     * {@code float} only on return.
     *
     * <p>When both strides are 1 the bulk of the vector is consumed four elements at a time into
     * four independent partial sums, which are added to the running total before the remaining
     * elements are folded in one by one. Otherwise the elements are visited with the given
     * strides; a negative stride starts at {@code offset + (n - 1) * -inc} and steps back toward
     * the offset, and a zero stride keeps re-using the element at the offset.
     *
     * @param n       number of element pairs; the result is {@code sb} when {@code n <= 0}
     * @param sb      value the accumulation starts from
     * @param x       first vector's backing array
     * @param offsetx base index of the vector inside {@code x}
     * @param incx    stride between consecutive elements of {@code x}
     * @param y       second vector's backing array
     * @param offsety base index of the vector inside {@code y}
     * @param incy    stride between consecutive elements of {@code y}
     * @return the double-precision total, narrowed to {@code float}
     */
    @Override
    protected float sdsdotK(int n, float sb, float[] x, int offsetx, int incx, float[] y, int offsety, int incy) {
        double sum = sb;
        if (incx == 1 && incy == 1) {
            // loopBound is inherited and not visible here; presumably it rounds n down to a
            // multiple of 4. It is evaluated once, assuming it is a pure function of its arguments.
            final int blockEnd = this.loopBound(n, 4);
            double acc0 = 0.0;
            double acc1 = 0.0;
            double acc2 = 0.0;
            double acc3 = 0.0;
            int i = 0;
            for (; i < blockEnd; i += 4) {
                acc0 = Math.fma((double) x[offsetx + i], (double) y[offsety + i], acc0);
                acc1 = Math.fma((double) x[offsetx + i + 1], (double) y[offsety + i + 1], acc1);
                acc2 = Math.fma((double) x[offsetx + i + 2], (double) y[offsety + i + 2], acc2);
                acc3 = Math.fma((double) x[offsetx + i + 3], (double) y[offsety + i + 3], acc3);
            }
            sum += acc0 + acc1 + acc2 + acc3;
            // Elements left over after the blocked loop.
            for (; i < n; ++i) {
                sum = Math.fma((double) x[offsetx + i], (double) y[offsety + i], sum);
            }
            return (float) sum;
        }
        int ix = incx < 0 ? (n - 1) * -incx : 0;
        int iy = incy < 0 ? (n - 1) * -incy : 0;
        // Count iterations instead of comparing against n * inc: a zero stride then still
        // contributes n products, and the loop bound cannot overflow for large n * inc.
        for (int done = 0; done < n; ++done) {
            sum = Math.fma((double) x[offsetx + ix], (double) y[offsety + iy], sum);
            ix += incx;
            iy += incy;
        }
        return (float) sum;
    }

    /**
     * Accumulates scaled dot products into a rectangular region of {@code c}. For every
     * {@code row} in {@code [rows, rowe)} and {@code col} in {@code [cols, cole)} handled here:
     *
     * <pre>
     *   c[offsetc + row + col * ldc] += alpha * sum over i in [is, ie) of
     *       a[offseta + i + row * lda] * b[offsetb + i + col * ldb]
     * </pre>
     *
     * <p>Each sum is built with sequential {@link Math#fma} calls in increasing {@code i}, and the
     * final update of {@code c} is itself one {@code fma}. The region is walked in column strips
     * (single leading columns, then blocks of three, then single trailing columns) and, inside a
     * strip, in row groups of the same shape. Full 3x3 tiles are delegated to
     * {@code dgepdotTN}; every other tile shape is computed by the private helpers below.
     *
     * <p>The strip boundaries come from the inherited {@code loopAlign}/{@code loopBound}
     * helpers, which are not visible here. They are evaluated once up front, on the assumption
     * that they are pure functions of their arguments.
     *
     * @param m       not read here; forwarded unchanged to {@code dgepdotTN}
     * @param rows    first row (inclusive)
     * @param rowe    last row (exclusive)
     * @param n       not read here; forwarded unchanged to {@code dgepdotTN}
     * @param cols    first column (inclusive)
     * @param cole    last column (exclusive)
     * @param k       not read here; forwarded unchanged to {@code dgepdotTN}
     * @param is      first summation index (inclusive)
     * @param ie      last summation index (exclusive)
     * @param alpha   scale applied to each dot product
     * @param a       first operand, read at {@code offseta + i + row * lda}
     * @param offseta base index into {@code a}
     * @param lda     multiplier applied to the row index when addressing {@code a}
     * @param b       second operand, read at {@code offsetb + i + col * ldb}
     * @param offsetb base index into {@code b}
     * @param ldb     multiplier applied to the column index when addressing {@code b}
     * @param beta    not read here; forwarded unchanged to {@code dgepdotTN}
     * @param c       output, updated in place at {@code offsetc + row + col * ldc}
     * @param offsetc base index into {@code c}
     * @param ldc     multiplier applied to the column index when addressing {@code c}
     */
    @Override
    protected void dgebpTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        final int colAlign = this.loopAlign(cols, cole, 3);
        final int colBound = this.loopBound(cole, 3);
        final int rowAlign = this.loopAlign(rows, rowe, 3);
        final int rowBound = this.loopBound(rowe, 3);

        int col = cols;
        // Leading columns, one at a time.
        for (; col < colAlign; ++col) {
            dgebpTNColumn(rows, rowe, rowAlign, rowBound, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
        }
        // Blocks of three columns.
        for (; col < colBound; col += 3) {
            int row = rows;
            for (; row < rowAlign; ++row) {
                dgebpTNTile1x3(row, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
            }
            for (; row < rowBound; row += 3) {
                // Full 3x3 tile: goes through the overridable kernel.
                this.dgepdotTN(m, row, row + 3, n, col, col + 3, k, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
            }
            for (; row < rowe; ++row) {
                dgebpTNTile1x3(row, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
            }
        }
        // Trailing columns, one at a time.
        for (; col < cole; ++col) {
            dgebpTNColumn(rows, rowe, rowAlign, rowBound, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
        }
    }

    /** Updates every row in {@code [rows, rowe)} of the single column {@code col}: single rows up to {@code rowAlign}, groups of three up to {@code rowBound}, then single rows. */
    private static void dgebpTNColumn(int rows, int rowe, int rowAlign, int rowBound, int col, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double[] c, int offsetc, int ldc) {
        int row = rows;
        for (; row < rowAlign; ++row) {
            dgebpTNTile1x1(row, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
        }
        for (; row < rowBound; row += 3) {
            dgebpTNTile3x1(row, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
        }
        for (; row < rowe; ++row) {
            dgebpTNTile1x1(row, col, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, c, offsetc, ldc);
        }
    }

    /** Updates the single element of {@code c} at ({@code row}, {@code col}). */
    private static void dgebpTNTile1x1(int row, int col, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double[] c, int offsetc, int ldc) {
        final int aBase = offseta + row * lda;
        final int bBase = offsetb + col * ldb;
        double sum = 0.0;
        for (int i = is; i < ie; ++i) {
            sum = Math.fma(a[aBase + i], b[bBase + i], sum);
        }
        final int cIndex = offsetc + row + col * ldc;
        c[cIndex] = Math.fma(alpha, sum, c[cIndex]);
    }

    /** Updates the three elements of {@code c} at rows {@code row..row+2} of column {@code col}, reading each value of {@code b} once. */
    private static void dgebpTNTile3x1(int row, int col, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double[] c, int offsetc, int ldc) {
        final int aBase0 = offseta + row * lda;
        final int aBase1 = aBase0 + lda;
        final int aBase2 = aBase1 + lda;
        final int bBase = offsetb + col * ldb;
        double sum0 = 0.0;
        double sum1 = 0.0;
        double sum2 = 0.0;
        for (int i = is; i < ie; ++i) {
            final double a0 = a[aBase0 + i];
            final double a1 = a[aBase1 + i];
            final double a2 = a[aBase2 + i];
            final double bv = b[bBase + i];
            sum0 = Math.fma(a0, bv, sum0);
            sum1 = Math.fma(a1, bv, sum1);
            sum2 = Math.fma(a2, bv, sum2);
        }
        final int cIndex = offsetc + row + col * ldc;
        c[cIndex] = Math.fma(alpha, sum0, c[cIndex]);
        c[cIndex + 1] = Math.fma(alpha, sum1, c[cIndex + 1]);
        c[cIndex + 2] = Math.fma(alpha, sum2, c[cIndex + 2]);
    }

    /** Updates the three elements of {@code c} at row {@code row} of columns {@code col..col+2}, reading each value of {@code a} once. */
    private static void dgebpTNTile1x3(int row, int col, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double[] c, int offsetc, int ldc) {
        final int aBase = offseta + row * lda;
        final int bBase0 = offsetb + col * ldb;
        final int bBase1 = bBase0 + ldb;
        final int bBase2 = bBase1 + ldb;
        double sum0 = 0.0;
        double sum1 = 0.0;
        double sum2 = 0.0;
        for (int i = is; i < ie; ++i) {
            final double av = a[aBase + i];
            final double b0 = b[bBase0 + i];
            final double b1 = b[bBase1 + i];
            final double b2 = b[bBase2 + i];
            sum0 = Math.fma(av, b0, sum0);
            sum1 = Math.fma(av, b1, sum1);
            sum2 = Math.fma(av, b2, sum2);
        }
        final int cIndex0 = offsetc + row + col * ldc;
        final int cIndex1 = cIndex0 + ldc;
        final int cIndex2 = cIndex1 + ldc;
        c[cIndex0] = Math.fma(alpha, sum0, c[cIndex0]);
        c[cIndex1] = Math.fma(alpha, sum1, c[cIndex1]);
        c[cIndex2] = Math.fma(alpha, sum2, c[cIndex2]);
    }

    /**
     * Updates one 3x3 tile of {@code c}. For {@code r} and {@code j} in {@code 0..2}:
     *
     * <pre>
     *   c[offsetc + (rows + r) + (cols + j) * ldc] += alpha * sum over i in [is, ie) of
     *       a[offseta + i + (rows + r) * lda] * b[offsetb + i + (cols + j) * ldb]
     * </pre>
     *
     * <p>The nine sums are kept in local accumulators and built with sequential
     * {@link Math#fma} calls in increasing {@code i}; the final update of each element of
     * {@code c} is one more {@code fma}. The summation range is walked in three phases: single
     * steps up to {@code loopAlign(is, ie, 2)}, pairs of steps up to {@code loopBound(ie, 2)},
     * then single steps to {@code ie}. Both helpers are inherited and not visible here; they are
     * evaluated once, on the assumption that they are pure functions of their arguments.
     *
     * <p>With assertions enabled, the tile is checked to be exactly three rows by three columns.
     *
     * @param m       not read by this implementation
     * @param rows    first row of the tile
     * @param rowe    one past the last row; asserted to equal {@code rows + 3}
     * @param n       not read by this implementation
     * @param cols    first column of the tile
     * @param cole    one past the last column; asserted to equal {@code cols + 3}
     * @param k       not read by this implementation
     * @param is      first summation index (inclusive)
     * @param ie      last summation index (exclusive)
     * @param alpha   scale applied to each dot product
     * @param a       first operand, read at {@code offseta + i + row * lda}
     * @param offseta base index into {@code a}
     * @param lda     multiplier applied to the row index when addressing {@code a}
     * @param b       second operand, read at {@code offsetb + i + col * ldb}
     * @param offsetb base index into {@code b}
     * @param ldb     multiplier applied to the column index when addressing {@code b}
     * @param beta    not read by this implementation
     * @param c       output, updated in place at {@code offsetc + row + col * ldc}
     * @param offsetc base index into {@code c}
     * @param ldc     multiplier applied to the column index when addressing {@code c}
     */
    @Override
    protected void dgepdotTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        assert (rowe - rows == 3);
        assert (cole - cols == 3);

        final int headEnd = this.loopAlign(is, ie, 2);
        final int pairEnd = this.loopBound(ie, 2);

        // Start of each of the three rows of a and columns of b; the summation index is added on top.
        final int aRow0 = offseta + rows * lda;
        final int aRow1 = aRow0 + lda;
        final int aRow2 = aRow1 + lda;
        final int bCol0 = offsetb + cols * ldb;
        final int bCol1 = bCol0 + ldb;
        final int bCol2 = bCol1 + ldb;

        // sumRJ accumulates the tile element at row offset R, column offset J.
        double sum00 = 0.0;
        double sum01 = 0.0;
        double sum02 = 0.0;
        double sum10 = 0.0;
        double sum11 = 0.0;
        double sum12 = 0.0;
        double sum20 = 0.0;
        double sum21 = 0.0;
        double sum22 = 0.0;

        double a0;
        double a1;
        double a2;
        double bv;
        int i = is;

        // Phase 1: single steps up to the aligned start.
        for (; i < headEnd; ++i) {
            a0 = a[aRow0 + i];
            a1 = a[aRow1 + i];
            a2 = a[aRow2 + i];
            bv = b[bCol0 + i];
            sum00 = Math.fma(a0, bv, sum00);
            sum10 = Math.fma(a1, bv, sum10);
            sum20 = Math.fma(a2, bv, sum20);
            bv = b[bCol1 + i];
            sum01 = Math.fma(a0, bv, sum01);
            sum11 = Math.fma(a1, bv, sum11);
            sum21 = Math.fma(a2, bv, sum21);
            bv = b[bCol2 + i];
            sum02 = Math.fma(a0, bv, sum02);
            sum12 = Math.fma(a1, bv, sum12);
            sum22 = Math.fma(a2, bv, sum22);
        }

        // Phase 2: two steps per iteration, index i first and then i + 1.
        for (; i < pairEnd; i += 2) {
            a0 = a[aRow0 + i];
            a1 = a[aRow1 + i];
            a2 = a[aRow2 + i];
            bv = b[bCol0 + i];
            sum00 = Math.fma(a0, bv, sum00);
            sum10 = Math.fma(a1, bv, sum10);
            sum20 = Math.fma(a2, bv, sum20);
            bv = b[bCol1 + i];
            sum01 = Math.fma(a0, bv, sum01);
            sum11 = Math.fma(a1, bv, sum11);
            sum21 = Math.fma(a2, bv, sum21);
            bv = b[bCol2 + i];
            sum02 = Math.fma(a0, bv, sum02);
            sum12 = Math.fma(a1, bv, sum12);
            sum22 = Math.fma(a2, bv, sum22);

            a0 = a[aRow0 + i + 1];
            a1 = a[aRow1 + i + 1];
            a2 = a[aRow2 + i + 1];
            bv = b[bCol0 + i + 1];
            sum00 = Math.fma(a0, bv, sum00);
            sum10 = Math.fma(a1, bv, sum10);
            sum20 = Math.fma(a2, bv, sum20);
            bv = b[bCol1 + i + 1];
            sum01 = Math.fma(a0, bv, sum01);
            sum11 = Math.fma(a1, bv, sum11);
            sum21 = Math.fma(a2, bv, sum21);
            bv = b[bCol2 + i + 1];
            sum02 = Math.fma(a0, bv, sum02);
            sum12 = Math.fma(a1, bv, sum12);
            sum22 = Math.fma(a2, bv, sum22);
        }

        // Phase 3: whatever is left, one step at a time.
        for (; i < ie; ++i) {
            a0 = a[aRow0 + i];
            a1 = a[aRow1 + i];
            a2 = a[aRow2 + i];
            bv = b[bCol0 + i];
            sum00 = Math.fma(a0, bv, sum00);
            sum10 = Math.fma(a1, bv, sum10);
            sum20 = Math.fma(a2, bv, sum20);
            bv = b[bCol1 + i];
            sum01 = Math.fma(a0, bv, sum01);
            sum11 = Math.fma(a1, bv, sum11);
            sum21 = Math.fma(a2, bv, sum21);
            bv = b[bCol2 + i];
            sum02 = Math.fma(a0, bv, sum02);
            sum12 = Math.fma(a1, bv, sum12);
            sum22 = Math.fma(a2, bv, sum22);
        }

        // Fold the nine sums into c; cColJ is the index of the tile's first row in column J.
        final int cCol0 = offsetc + rows + cols * ldc;
        final int cCol1 = cCol0 + ldc;
        final int cCol2 = cCol1 + ldc;
        c[cCol0] = Math.fma(alpha, sum00, c[cCol0]);
        c[cCol1] = Math.fma(alpha, sum01, c[cCol1]);
        c[cCol2] = Math.fma(alpha, sum02, c[cCol2]);
        c[cCol0 + 1] = Math.fma(alpha, sum10, c[cCol0 + 1]);
        c[cCol1 + 1] = Math.fma(alpha, sum11, c[cCol1 + 1]);
        c[cCol2 + 1] = Math.fma(alpha, sum12, c[cCol2 + 1]);
        c[cCol0 + 2] = Math.fma(alpha, sum20, c[cCol0 + 2]);
        c[cCol1 + 2] = Math.fma(alpha, sum21, c[cCol1 + 2]);
        c[cCol2 + 2] = Math.fma(alpha, sum22, c[cCol2 + 2]);
    }

    /**
     * Computes, for every element visited by the tiling below,
     *
     * <pre>
     *   dot = sum over i in [0, k) of a[offseta + row + i * lda] * b[offsetb + i + col * ldb]
     *   c[offsetc + row + col * ldc] = alpha * dot + beta * c[offsetc + row + col * ldc]
     * </pre>
     *
     * <p>When {@code beta} is exactly zero the old content of {@code c} is not read and the
     * element is overwritten with {@code alpha * dot}. Each {@code dot} is built with sequential
     * {@link Math#fma} calls in increasing {@code i}.
     *
     * <p>Columns are walked in blocks of three up to {@code loopBound(n, 3)} and then singly up
     * to {@code n}; rows likewise with {@code loopBound(m, 3)} and {@code m}; the summation index
     * is walked in pairs up to {@code loopBound(k, 2)} and then singly up to {@code k}.
     * {@code loopBound} is inherited and not visible here; presumably it rounds its first
     * argument down to a multiple of the second. It is evaluated once per dimension, on the
     * assumption that it is a pure function of its arguments.
     *
     * @param m       number of rows of the result
     * @param n       number of columns of the result
     * @param k       length of each dot product
     * @param alpha   scale applied to each dot product
     * @param a       first operand, read at {@code offseta + row + i * lda}
     * @param offseta base index into {@code a}
     * @param lda     multiplier applied to the summation index when addressing {@code a}
     * @param b       second operand, read at {@code offsetb + i + col * ldb}
     * @param offsetb base index into {@code b}
     * @param ldb     multiplier applied to the column index when addressing {@code b}
     * @param beta    scale applied to the existing content of {@code c}; zero means overwrite
     * @param c       output, written at {@code offsetc + row + col * ldc}
     * @param offsetc base index into {@code c}
     * @param ldc     multiplier applied to the column index when addressing {@code c}
     */
    @Override
    protected void dgemmNN(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        final int colBlockEnd = this.loopBound(n, 3);
        final int rowBlockEnd = this.loopBound(m, 3);
        final int pairEnd = this.loopBound(k, 2);

        int col = 0;
        // Blocks of three columns.
        for (; col < colBlockEnd; col += 3) {
            final int bCol0 = offsetb + col * ldb;
            final int bCol1 = bCol0 + ldb;
            final int bCol2 = bCol1 + ldb;

            int row = 0;
            // 3x3 tiles; sumRJ belongs to row offset R, column offset J.
            for (; row < rowBlockEnd; row += 3) {
                final int aRow = offseta + row;
                double sum00 = 0.0;
                double sum01 = 0.0;
                double sum02 = 0.0;
                double sum10 = 0.0;
                double sum11 = 0.0;
                double sum12 = 0.0;
                double sum20 = 0.0;
                double sum21 = 0.0;
                double sum22 = 0.0;
                double a0;
                double a1;
                double a2;
                double b0;
                double b1;
                double b2;
                int i = 0;
                for (; i < pairEnd; i += 2) {
                    final int aFirst = aRow + i * lda;
                    a0 = a[aFirst];
                    a1 = a[aFirst + 1];
                    a2 = a[aFirst + 2];
                    b0 = b[bCol0 + i];
                    b1 = b[bCol1 + i];
                    b2 = b[bCol2 + i];
                    sum00 = Math.fma(a0, b0, sum00);
                    sum01 = Math.fma(a0, b1, sum01);
                    sum02 = Math.fma(a0, b2, sum02);
                    sum10 = Math.fma(a1, b0, sum10);
                    sum11 = Math.fma(a1, b1, sum11);
                    sum12 = Math.fma(a1, b2, sum12);
                    sum20 = Math.fma(a2, b0, sum20);
                    sum21 = Math.fma(a2, b1, sum21);
                    sum22 = Math.fma(a2, b2, sum22);

                    final int aSecond = aFirst + lda;
                    a0 = a[aSecond];
                    a1 = a[aSecond + 1];
                    a2 = a[aSecond + 2];
                    b0 = b[bCol0 + i + 1];
                    b1 = b[bCol1 + i + 1];
                    b2 = b[bCol2 + i + 1];
                    sum00 = Math.fma(a0, b0, sum00);
                    sum01 = Math.fma(a0, b1, sum01);
                    sum02 = Math.fma(a0, b2, sum02);
                    sum10 = Math.fma(a1, b0, sum10);
                    sum11 = Math.fma(a1, b1, sum11);
                    sum12 = Math.fma(a1, b2, sum12);
                    sum20 = Math.fma(a2, b0, sum20);
                    sum21 = Math.fma(a2, b1, sum21);
                    sum22 = Math.fma(a2, b2, sum22);
                }
                for (; i < k; ++i) {
                    final int aFirst = aRow + i * lda;
                    a0 = a[aFirst];
                    a1 = a[aFirst + 1];
                    a2 = a[aFirst + 2];
                    b0 = b[bCol0 + i];
                    b1 = b[bCol1 + i];
                    b2 = b[bCol2 + i];
                    sum00 = Math.fma(a0, b0, sum00);
                    sum01 = Math.fma(a0, b1, sum01);
                    sum02 = Math.fma(a0, b2, sum02);
                    sum10 = Math.fma(a1, b0, sum10);
                    sum11 = Math.fma(a1, b1, sum11);
                    sum12 = Math.fma(a1, b2, sum12);
                    sum20 = Math.fma(a2, b0, sum20);
                    sum21 = Math.fma(a2, b1, sum21);
                    sum22 = Math.fma(a2, b2, sum22);
                }
                final int cCol0 = offsetc + row + col * ldc;
                final int cCol1 = cCol0 + ldc;
                final int cCol2 = cCol1 + ldc;
                dgemmNNStore(c, cCol0, alpha, sum00, beta);
                dgemmNNStore(c, cCol1, alpha, sum01, beta);
                dgemmNNStore(c, cCol2, alpha, sum02, beta);
                dgemmNNStore(c, cCol0 + 1, alpha, sum10, beta);
                dgemmNNStore(c, cCol1 + 1, alpha, sum11, beta);
                dgemmNNStore(c, cCol2 + 1, alpha, sum12, beta);
                dgemmNNStore(c, cCol0 + 2, alpha, sum20, beta);
                dgemmNNStore(c, cCol1 + 2, alpha, sum21, beta);
                dgemmNNStore(c, cCol2 + 2, alpha, sum22, beta);
            }
            // Remaining rows against the same three columns (1x3 tiles).
            for (; row < m; ++row) {
                final int aRow = offseta + row;
                double sum0 = 0.0;
                double sum1 = 0.0;
                double sum2 = 0.0;
                double av;
                double b0;
                double b1;
                double b2;
                int i = 0;
                for (; i < pairEnd; i += 2) {
                    final int aFirst = aRow + i * lda;
                    av = a[aFirst];
                    b0 = b[bCol0 + i];
                    b1 = b[bCol1 + i];
                    b2 = b[bCol2 + i];
                    sum0 = Math.fma(av, b0, sum0);
                    sum1 = Math.fma(av, b1, sum1);
                    sum2 = Math.fma(av, b2, sum2);

                    av = a[aFirst + lda];
                    b0 = b[bCol0 + i + 1];
                    b1 = b[bCol1 + i + 1];
                    b2 = b[bCol2 + i + 1];
                    sum0 = Math.fma(av, b0, sum0);
                    sum1 = Math.fma(av, b1, sum1);
                    sum2 = Math.fma(av, b2, sum2);
                }
                for (; i < k; ++i) {
                    av = a[aRow + i * lda];
                    b0 = b[bCol0 + i];
                    b1 = b[bCol1 + i];
                    b2 = b[bCol2 + i];
                    sum0 = Math.fma(av, b0, sum0);
                    sum1 = Math.fma(av, b1, sum1);
                    sum2 = Math.fma(av, b2, sum2);
                }
                final int cCol0 = offsetc + row + col * ldc;
                final int cCol1 = cCol0 + ldc;
                final int cCol2 = cCol1 + ldc;
                dgemmNNStore(c, cCol0, alpha, sum0, beta);
                dgemmNNStore(c, cCol1, alpha, sum1, beta);
                dgemmNNStore(c, cCol2, alpha, sum2, beta);
            }
        }
        // Remaining columns, one at a time.
        for (; col < n; ++col) {
            final int bCol = offsetb + col * ldb;

            int row = 0;
            // 3x1 tiles.
            for (; row < rowBlockEnd; row += 3) {
                final int aRow = offseta + row;
                double sum0 = 0.0;
                double sum1 = 0.0;
                double sum2 = 0.0;
                double a0;
                double a1;
                double a2;
                double bv;
                int i = 0;
                for (; i < pairEnd; i += 2) {
                    final int aFirst = aRow + i * lda;
                    a0 = a[aFirst];
                    a1 = a[aFirst + 1];
                    a2 = a[aFirst + 2];
                    bv = b[bCol + i];
                    sum0 = Math.fma(a0, bv, sum0);
                    sum1 = Math.fma(a1, bv, sum1);
                    sum2 = Math.fma(a2, bv, sum2);

                    final int aSecond = aFirst + lda;
                    a0 = a[aSecond];
                    a1 = a[aSecond + 1];
                    a2 = a[aSecond + 2];
                    bv = b[bCol + i + 1];
                    sum0 = Math.fma(a0, bv, sum0);
                    sum1 = Math.fma(a1, bv, sum1);
                    sum2 = Math.fma(a2, bv, sum2);
                }
                for (; i < k; ++i) {
                    final int aFirst = aRow + i * lda;
                    a0 = a[aFirst];
                    a1 = a[aFirst + 1];
                    a2 = a[aFirst + 2];
                    bv = b[bCol + i];
                    sum0 = Math.fma(a0, bv, sum0);
                    sum1 = Math.fma(a1, bv, sum1);
                    sum2 = Math.fma(a2, bv, sum2);
                }
                final int cIndex = offsetc + row + col * ldc;
                dgemmNNStore(c, cIndex, alpha, sum0, beta);
                dgemmNNStore(c, cIndex + 1, alpha, sum1, beta);
                dgemmNNStore(c, cIndex + 2, alpha, sum2, beta);
            }
            // Single elements.
            for (; row < m; ++row) {
                final int aRow = offseta + row;
                double sum = 0.0;
                int i = 0;
                for (; i < pairEnd; i += 2) {
                    final int aFirst = aRow + i * lda;
                    sum = Math.fma(a[aFirst], b[bCol + i], sum);
                    sum = Math.fma(a[aFirst + lda], b[bCol + i + 1], sum);
                }
                for (; i < k; ++i) {
                    sum = Math.fma(a[aRow + i * lda], b[bCol + i], sum);
                }
                dgemmNNStore(c, offsetc + row + col * ldc, alpha, sum, beta);
            }
        }
    }

    /** Stores {@code alpha * sum + beta * c[index]} into {@code c[index]}; when {@code beta} is zero the old value is not read and {@code alpha * sum} is stored. */
    private static void dgemmNNStore(double[] c, int index, double alpha, double sum, double beta) {
        c[index] = beta != 0.0 ? Math.fma(alpha, sum, beta * c[index]) : alpha * sum;
    }

    /**
     * Updates {@code C} with {@code alpha * A * B^T + beta * C}.
     *
     * <p>For every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)}:
     * <pre>
     *   sum = SUM over i in [0, k) of a[offseta + row + i * lda] * b[offsetb + col + i * ldb]
     *   c[offsetc + row + col * ldc] = fma(alpha, sum, beta * c[offsetc + row + col * ldc])
     * </pre>
     * Read as column-major matrices, {@code A} is used as stored and {@code B} is used transposed.
     * Each {@code sum} is accumulated with {@link Math#fma} in increasing {@code i} order. When
     * {@code beta == 0.0} the destination is overwritten with {@code alpha * sum} and its previous
     * value is never read.
     *
     * <p>The output is computed in 3x3 tiles with the {@code i} loop unrolled by 2; leftover rows,
     * leftover columns and a trailing odd {@code i} are handled by remainder loops.
     *
     * @param m number of output rows
     * @param n number of output columns
     * @param k number of terms in each inner product
     * @param alpha factor applied to each inner product
     * @param a storage of A; element (row, i) is {@code a[offseta + row + i * lda]}
     * @param offseta index of the first element of A in {@code a}
     * @param lda distance in {@code a} between elements with consecutive {@code i}
     * @param b storage of B; element (col, i) is {@code b[offsetb + col + i * ldb]}
     * @param offsetb index of the first element of B in {@code b}
     * @param ldb distance in {@code b} between elements with consecutive {@code i}
     * @param beta factor applied to the previous contents of C; {@code 0.0} means "overwrite"
     * @param c storage of C; element (row, col) is {@code c[offsetc + row + col * ldc]}
     * @param offsetc index of the first element of C in {@code c}
     * @param ldc distance in {@code c} between elements with consecutive {@code col}
     */
    @Override
    protected void dgemmNT(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        // Tile shape. The loop bodies below are hand-unrolled for exactly these values.
        final int tileRows = 3;
        final int tileCols = 3;
        final int unroll = 2;
        // NOTE(review): loopBound is inherited and not visible in this chunk. It is assumed to be a
        // side-effect-free function of its arguments, so it is evaluated once here instead of on
        // every loop-condition check -- confirm against the superclass.
        final int tiledRowEnd = this.loopBound(m, tileRows);
        final int tiledColEnd = this.loopBound(n, tileCols);
        final int unrolledEnd = this.loopBound(k, unroll);
        final boolean scaleC = beta != 0.0;

        int col = 0;
        // Groups of three output columns.
        for (; col < tiledColEnd; col += tileCols) {
            int row = 0;
            // Full 3x3 tiles: sXY accumulates output (row + X, col + Y).
            for (; row < tiledRowEnd; row += tileRows) {
                double s00 = 0.0;
                double s01 = 0.0;
                double s02 = 0.0;
                double s10 = 0.0;
                double s11 = 0.0;
                double s12 = 0.0;
                double s20 = 0.0;
                double s21 = 0.0;
                double s22 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    final int ai = offseta + row + i * lda;
                    final int bi = offsetb + col + i * ldb;
                    // Term i.
                    double x0 = a[ai];
                    double x1 = a[ai + 1];
                    double x2 = a[ai + 2];
                    double y0 = b[bi];
                    double y1 = b[bi + 1];
                    double y2 = b[bi + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                    // Term i + 1.
                    x0 = a[ai + lda];
                    x1 = a[ai + lda + 1];
                    x2 = a[ai + lda + 2];
                    y0 = b[bi + ldb];
                    y1 = b[bi + ldb + 1];
                    y2 = b[bi + ldb + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // Trailing term when k is not a multiple of the unroll factor.
                for (; i < k; ++i) {
                    final int ai = offseta + row + i * lda;
                    final int bi = offsetb + col + i * ldb;
                    final double x0 = a[ai];
                    final double x1 = a[ai + 1];
                    final double x2 = a[ai + 2];
                    final double y0 = b[bi];
                    final double y1 = b[bi + 1];
                    final double y2 = b[bi + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // cY is the index of output (row, col + Y); add X to reach row + X.
                final int c0 = offsetc + row + col * ldc;
                final int c1 = c0 + ldc;
                final int c2 = c1 + ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s00, beta * c[c0]);
                    c[c1] = Math.fma(alpha, s01, beta * c[c1]);
                    c[c2] = Math.fma(alpha, s02, beta * c[c2]);
                    c[c0 + 1] = Math.fma(alpha, s10, beta * c[c0 + 1]);
                    c[c1 + 1] = Math.fma(alpha, s11, beta * c[c1 + 1]);
                    c[c2 + 1] = Math.fma(alpha, s12, beta * c[c2 + 1]);
                    c[c0 + 2] = Math.fma(alpha, s20, beta * c[c0 + 2]);
                    c[c1 + 2] = Math.fma(alpha, s21, beta * c[c1 + 2]);
                    c[c2 + 2] = Math.fma(alpha, s22, beta * c[c2 + 2]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[c0] = alpha * s00;
                    c[c1] = alpha * s01;
                    c[c2] = alpha * s02;
                    c[c0 + 1] = alpha * s10;
                    c[c1 + 1] = alpha * s11;
                    c[c2 + 1] = alpha * s12;
                    c[c0 + 2] = alpha * s20;
                    c[c1 + 2] = alpha * s21;
                    c[c2 + 2] = alpha * s22;
                }
            }
            // Leftover rows of this column group: one row by three columns.
            for (; row < m; ++row) {
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    final int ai = offseta + row + i * lda;
                    final int bi = offsetb + col + i * ldb;
                    double x = a[ai];
                    double y0 = b[bi];
                    double y1 = b[bi + 1];
                    double y2 = b[bi + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                    x = a[ai + lda];
                    y0 = b[bi + ldb];
                    y1 = b[bi + ldb + 1];
                    y2 = b[bi + ldb + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                for (; i < k; ++i) {
                    final int ai = offseta + row + i * lda;
                    final int bi = offsetb + col + i * ldb;
                    final double x = a[ai];
                    final double y0 = b[bi];
                    final double y1 = b[bi + 1];
                    final double y2 = b[bi + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                final int c0 = offsetc + row + col * ldc;
                final int c1 = c0 + ldc;
                final int c2 = c1 + ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s0, beta * c[c0]);
                    c[c1] = Math.fma(alpha, s1, beta * c[c1]);
                    c[c2] = Math.fma(alpha, s2, beta * c[c2]);
                } else {
                    c[c0] = alpha * s0;
                    c[c1] = alpha * s1;
                    c[c2] = alpha * s2;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            int row = 0;
            // Three rows by one column.
            for (; row < tiledRowEnd; row += tileRows) {
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    final int ai = offseta + row + i * lda;
                    final int bi = offsetb + col + i * ldb;
                    double x0 = a[ai];
                    double x1 = a[ai + 1];
                    double x2 = a[ai + 2];
                    double y = b[bi];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                    x0 = a[ai + lda];
                    x1 = a[ai + lda + 1];
                    x2 = a[ai + lda + 2];
                    y = b[bi + ldb];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                for (; i < k; ++i) {
                    final int ai = offseta + row + i * lda;
                    final int bi = offsetb + col + i * ldb;
                    final double x0 = a[ai];
                    final double x1 = a[ai + 1];
                    final double x2 = a[ai + 2];
                    final double y = b[bi];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                final int c0 = offsetc + row + col * ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s0, beta * c[c0]);
                    c[c0 + 1] = Math.fma(alpha, s1, beta * c[c0 + 1]);
                    c[c0 + 2] = Math.fma(alpha, s2, beta * c[c0 + 2]);
                } else {
                    c[c0] = alpha * s0;
                    c[c0 + 1] = alpha * s1;
                    c[c0 + 2] = alpha * s2;
                }
            }
            // Single remaining elements.
            for (; row < m; ++row) {
                double s = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    final int ai = offseta + row + i * lda;
                    final int bi = offsetb + col + i * ldb;
                    double x = a[ai];
                    double y = b[bi];
                    s = Math.fma(x, y, s);
                    x = a[ai + lda];
                    y = b[bi + ldb];
                    s = Math.fma(x, y, s);
                }
                for (; i < k; ++i) {
                    final double x = a[offseta + row + i * lda];
                    final double y = b[offsetb + col + i * ldb];
                    s = Math.fma(x, y, s);
                }
                final int c0 = offsetc + row + col * ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s, beta * c[c0]);
                } else {
                    c[c0] = alpha * s;
                }
            }
        }
    }

    /**
     * Updates {@code C} with {@code alpha * A^T * B + beta * C}.
     *
     * <p>For every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)}:
     * <pre>
     *   sum = SUM over i in [0, k) of a[offseta + i + row * lda] * b[offsetb + i + col * ldb]
     *   c[offsetc + row + col * ldc] = fma(alpha, sum, beta * c[offsetc + row + col * ldc])
     * </pre>
     * Read as column-major matrices, {@code A} is used transposed and {@code B} is used as stored.
     * Each {@code sum} is accumulated with {@link Math#fma} in increasing {@code i} order. When
     * {@code beta == 0.0} the destination is overwritten with {@code alpha * sum} and its previous
     * value is never read.
     *
     * <p>The output is computed in 3x3 tiles with the {@code i} loop unrolled by 2; leftover rows,
     * leftover columns and a trailing odd {@code i} are handled by remainder loops.
     *
     * @param m number of output rows
     * @param n number of output columns
     * @param k number of terms in each inner product
     * @param alpha factor applied to each inner product
     * @param a storage of A; the element paired with output row {@code row} at term {@code i} is
     *     {@code a[offseta + i + row * lda]}
     * @param offseta index of the first element of A in {@code a}
     * @param lda distance in {@code a} between elements with consecutive {@code row}
     * @param b storage of B; the element paired with output column {@code col} at term {@code i}
     *     is {@code b[offsetb + i + col * ldb]}
     * @param offsetb index of the first element of B in {@code b}
     * @param ldb distance in {@code b} between elements with consecutive {@code col}
     * @param beta factor applied to the previous contents of C; {@code 0.0} means "overwrite"
     * @param c storage of C; element (row, col) is {@code c[offsetc + row + col * ldc]}
     * @param offsetc index of the first element of C in {@code c}
     * @param ldc distance in {@code c} between elements with consecutive {@code col}
     */
    @Override
    protected void dgemmTN(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        // Tile shape. The loop bodies below are hand-unrolled for exactly these values.
        final int tileRows = 3;
        final int tileCols = 3;
        final int unroll = 2;
        // NOTE(review): loopBound is inherited and not visible in this chunk. It is assumed to be a
        // side-effect-free function of its arguments, so it is evaluated once here instead of on
        // every loop-condition check -- confirm against the superclass.
        final int tiledRowEnd = this.loopBound(m, tileRows);
        final int tiledColEnd = this.loopBound(n, tileCols);
        final int unrolledEnd = this.loopBound(k, unroll);
        final boolean scaleC = beta != 0.0;

        int col = 0;
        // Groups of three output columns.
        for (; col < tiledColEnd; col += tileCols) {
            // bY + i is the index of the B element for output column col + Y at term i.
            final int b0 = offsetb + col * ldb;
            final int b1 = b0 + ldb;
            final int b2 = b1 + ldb;
            int row = 0;
            // Full 3x3 tiles: sXY accumulates output (row + X, col + Y).
            for (; row < tiledRowEnd; row += tileRows) {
                // aX + i is the index of the A element for output row row + X at term i.
                final int a0 = offseta + row * lda;
                final int a1 = a0 + lda;
                final int a2 = a1 + lda;
                double s00 = 0.0;
                double s01 = 0.0;
                double s02 = 0.0;
                double s10 = 0.0;
                double s11 = 0.0;
                double s12 = 0.0;
                double s20 = 0.0;
                double s21 = 0.0;
                double s22 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    // Term i.
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y0 = b[b0 + i];
                    double y1 = b[b1 + i];
                    double y2 = b[b2 + i];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                    // Term i + 1.
                    final int i1 = i + 1;
                    x0 = a[a0 + i1];
                    x1 = a[a1 + i1];
                    x2 = a[a2 + i1];
                    y0 = b[b0 + i1];
                    y1 = b[b1 + i1];
                    y2 = b[b2 + i1];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // Trailing term when k is not a multiple of the unroll factor.
                for (; i < k; ++i) {
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y0 = b[b0 + i];
                    final double y1 = b[b1 + i];
                    final double y2 = b[b2 + i];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // cY is the index of output (row, col + Y); add X to reach row + X.
                final int c0 = offsetc + row + col * ldc;
                final int c1 = c0 + ldc;
                final int c2 = c1 + ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s00, beta * c[c0]);
                    c[c1] = Math.fma(alpha, s01, beta * c[c1]);
                    c[c2] = Math.fma(alpha, s02, beta * c[c2]);
                    c[c0 + 1] = Math.fma(alpha, s10, beta * c[c0 + 1]);
                    c[c1 + 1] = Math.fma(alpha, s11, beta * c[c1 + 1]);
                    c[c2 + 1] = Math.fma(alpha, s12, beta * c[c2 + 1]);
                    c[c0 + 2] = Math.fma(alpha, s20, beta * c[c0 + 2]);
                    c[c1 + 2] = Math.fma(alpha, s21, beta * c[c1 + 2]);
                    c[c2 + 2] = Math.fma(alpha, s22, beta * c[c2 + 2]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[c0] = alpha * s00;
                    c[c1] = alpha * s01;
                    c[c2] = alpha * s02;
                    c[c0 + 1] = alpha * s10;
                    c[c1 + 1] = alpha * s11;
                    c[c2 + 1] = alpha * s12;
                    c[c0 + 2] = alpha * s20;
                    c[c1 + 2] = alpha * s21;
                    c[c2 + 2] = alpha * s22;
                }
            }
            // Leftover rows of this column group: one row by three columns.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    double x = a[a0 + i];
                    double y0 = b[b0 + i];
                    double y1 = b[b1 + i];
                    double y2 = b[b2 + i];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                    final int i1 = i + 1;
                    x = a[a0 + i1];
                    y0 = b[b0 + i1];
                    y1 = b[b1 + i1];
                    y2 = b[b2 + i1];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                for (; i < k; ++i) {
                    final double x = a[a0 + i];
                    final double y0 = b[b0 + i];
                    final double y1 = b[b1 + i];
                    final double y2 = b[b2 + i];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                final int c0 = offsetc + row + col * ldc;
                final int c1 = c0 + ldc;
                final int c2 = c1 + ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s0, beta * c[c0]);
                    c[c1] = Math.fma(alpha, s1, beta * c[c1]);
                    c[c2] = Math.fma(alpha, s2, beta * c[c2]);
                } else {
                    c[c0] = alpha * s0;
                    c[c1] = alpha * s1;
                    c[c2] = alpha * s2;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            final int b0 = offsetb + col * ldb;
            int row = 0;
            // Three rows by one column.
            for (; row < tiledRowEnd; row += tileRows) {
                final int a0 = offseta + row * lda;
                final int a1 = a0 + lda;
                final int a2 = a1 + lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y = b[b0 + i];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                    final int i1 = i + 1;
                    x0 = a[a0 + i1];
                    x1 = a[a1 + i1];
                    x2 = a[a2 + i1];
                    y = b[b0 + i1];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                for (; i < k; ++i) {
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y = b[b0 + i];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                final int c0 = offsetc + row + col * ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s0, beta * c[c0]);
                    c[c0 + 1] = Math.fma(alpha, s1, beta * c[c0 + 1]);
                    c[c0 + 2] = Math.fma(alpha, s2, beta * c[c0 + 2]);
                } else {
                    c[c0] = alpha * s0;
                    c[c0 + 1] = alpha * s1;
                    c[c0 + 2] = alpha * s2;
                }
            }
            // Single remaining elements.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    double x = a[a0 + i];
                    double y = b[b0 + i];
                    s = Math.fma(x, y, s);
                    x = a[a0 + i + 1];
                    y = b[b0 + i + 1];
                    s = Math.fma(x, y, s);
                }
                for (; i < k; ++i) {
                    final double x = a[a0 + i];
                    final double y = b[b0 + i];
                    s = Math.fma(x, y, s);
                }
                final int c0 = offsetc + row + col * ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s, beta * c[c0]);
                } else {
                    c[c0] = alpha * s;
                }
            }
        }
    }

    /**
     * Updates {@code C} with {@code alpha * A^T * B^T + beta * C}.
     *
     * <p>For every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)}:
     * <pre>
     *   sum = SUM over i in [0, k) of a[offseta + i + row * lda] * b[offsetb + col + i * ldb]
     *   c[offsetc + row + col * ldc] = fma(alpha, sum, beta * c[offsetc + row + col * ldc])
     * </pre>
     * Read as column-major matrices, both {@code A} and {@code B} are used transposed. Each
     * {@code sum} is accumulated with {@link Math#fma} in increasing {@code i} order. When
     * {@code beta == 0.0} the destination is overwritten with {@code alpha * sum} and its previous
     * value is never read.
     *
     * <p>The output is computed in 3x3 tiles with the {@code i} loop unrolled by 2; leftover rows,
     * leftover columns and a trailing odd {@code i} are handled by remainder loops.
     *
     * @param m number of output rows
     * @param n number of output columns
     * @param k number of terms in each inner product
     * @param alpha factor applied to each inner product
     * @param a storage of A; the element paired with output row {@code row} at term {@code i} is
     *     {@code a[offseta + i + row * lda]}
     * @param offseta index of the first element of A in {@code a}
     * @param lda distance in {@code a} between elements with consecutive {@code row}
     * @param b storage of B; the element paired with output column {@code col} at term {@code i}
     *     is {@code b[offsetb + col + i * ldb]}
     * @param offsetb index of the first element of B in {@code b}
     * @param ldb distance in {@code b} between elements with consecutive {@code i}
     * @param beta factor applied to the previous contents of C; {@code 0.0} means "overwrite"
     * @param c storage of C; element (row, col) is {@code c[offsetc + row + col * ldc]}
     * @param offsetc index of the first element of C in {@code c}
     * @param ldc distance in {@code c} between elements with consecutive {@code col}
     */
    @Override
    protected void dgemmTT(int m, int n, int k, double alpha, double[] a, int offseta, int lda, double[] b, int offsetb, int ldb, double beta, double[] c, int offsetc, int ldc) {
        // Tile shape. The loop bodies below are hand-unrolled for exactly these values.
        final int tileRows = 3;
        final int tileCols = 3;
        final int unroll = 2;
        // NOTE(review): loopBound is inherited and not visible in this chunk. It is assumed to be a
        // side-effect-free function of its arguments, so it is evaluated once here instead of on
        // every loop-condition check -- confirm against the superclass.
        final int tiledRowEnd = this.loopBound(m, tileRows);
        final int tiledColEnd = this.loopBound(n, tileCols);
        final int unrolledEnd = this.loopBound(k, unroll);
        final boolean scaleC = beta != 0.0;

        int col = 0;
        // Groups of three output columns.
        for (; col < tiledColEnd; col += tileCols) {
            int row = 0;
            // Full 3x3 tiles: sXY accumulates output (row + X, col + Y).
            for (; row < tiledRowEnd; row += tileRows) {
                // aX + i is the index of the A element for output row row + X at term i.
                final int a0 = offseta + row * lda;
                final int a1 = a0 + lda;
                final int a2 = a1 + lda;
                double s00 = 0.0;
                double s01 = 0.0;
                double s02 = 0.0;
                double s10 = 0.0;
                double s11 = 0.0;
                double s12 = 0.0;
                double s20 = 0.0;
                double s21 = 0.0;
                double s22 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    // bi + Y is the index of the B element for output column col + Y at term i.
                    final int bi = offsetb + col + i * ldb;
                    // Term i.
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y0 = b[bi];
                    double y1 = b[bi + 1];
                    double y2 = b[bi + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                    // Term i + 1.
                    final int i1 = i + 1;
                    x0 = a[a0 + i1];
                    x1 = a[a1 + i1];
                    x2 = a[a2 + i1];
                    y0 = b[bi + ldb];
                    y1 = b[bi + ldb + 1];
                    y2 = b[bi + ldb + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // Trailing term when k is not a multiple of the unroll factor.
                for (; i < k; ++i) {
                    final int bi = offsetb + col + i * ldb;
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y0 = b[bi];
                    final double y1 = b[bi + 1];
                    final double y2 = b[bi + 2];
                    s00 = Math.fma(x0, y0, s00);
                    s01 = Math.fma(x0, y1, s01);
                    s02 = Math.fma(x0, y2, s02);
                    s10 = Math.fma(x1, y0, s10);
                    s11 = Math.fma(x1, y1, s11);
                    s12 = Math.fma(x1, y2, s12);
                    s20 = Math.fma(x2, y0, s20);
                    s21 = Math.fma(x2, y1, s21);
                    s22 = Math.fma(x2, y2, s22);
                }
                // cY is the index of output (row, col + Y); add X to reach row + X.
                final int c0 = offsetc + row + col * ldc;
                final int c1 = c0 + ldc;
                final int c2 = c1 + ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s00, beta * c[c0]);
                    c[c1] = Math.fma(alpha, s01, beta * c[c1]);
                    c[c2] = Math.fma(alpha, s02, beta * c[c2]);
                    c[c0 + 1] = Math.fma(alpha, s10, beta * c[c0 + 1]);
                    c[c1 + 1] = Math.fma(alpha, s11, beta * c[c1 + 1]);
                    c[c2 + 1] = Math.fma(alpha, s12, beta * c[c2 + 1]);
                    c[c0 + 2] = Math.fma(alpha, s20, beta * c[c0 + 2]);
                    c[c1 + 2] = Math.fma(alpha, s21, beta * c[c1 + 2]);
                    c[c2 + 2] = Math.fma(alpha, s22, beta * c[c2 + 2]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[c0] = alpha * s00;
                    c[c1] = alpha * s01;
                    c[c2] = alpha * s02;
                    c[c0 + 1] = alpha * s10;
                    c[c1 + 1] = alpha * s11;
                    c[c2 + 1] = alpha * s12;
                    c[c0 + 2] = alpha * s20;
                    c[c1 + 2] = alpha * s21;
                    c[c2 + 2] = alpha * s22;
                }
            }
            // Leftover rows of this column group: one row by three columns.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    final int bi = offsetb + col + i * ldb;
                    double x = a[a0 + i];
                    double y0 = b[bi];
                    double y1 = b[bi + 1];
                    double y2 = b[bi + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                    x = a[a0 + i + 1];
                    y0 = b[bi + ldb];
                    y1 = b[bi + ldb + 1];
                    y2 = b[bi + ldb + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                for (; i < k; ++i) {
                    final int bi = offsetb + col + i * ldb;
                    final double x = a[a0 + i];
                    final double y0 = b[bi];
                    final double y1 = b[bi + 1];
                    final double y2 = b[bi + 2];
                    s0 = Math.fma(x, y0, s0);
                    s1 = Math.fma(x, y1, s1);
                    s2 = Math.fma(x, y2, s2);
                }
                final int c0 = offsetc + row + col * ldc;
                final int c1 = c0 + ldc;
                final int c2 = c1 + ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s0, beta * c[c0]);
                    c[c1] = Math.fma(alpha, s1, beta * c[c1]);
                    c[c2] = Math.fma(alpha, s2, beta * c[c2]);
                } else {
                    c[c0] = alpha * s0;
                    c[c1] = alpha * s1;
                    c[c2] = alpha * s2;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            int row = 0;
            // Three rows by one column.
            for (; row < tiledRowEnd; row += tileRows) {
                final int a0 = offseta + row * lda;
                final int a1 = a0 + lda;
                final int a2 = a1 + lda;
                double s0 = 0.0;
                double s1 = 0.0;
                double s2 = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    final int bi = offsetb + col + i * ldb;
                    double x0 = a[a0 + i];
                    double x1 = a[a1 + i];
                    double x2 = a[a2 + i];
                    double y = b[bi];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                    final int i1 = i + 1;
                    x0 = a[a0 + i1];
                    x1 = a[a1 + i1];
                    x2 = a[a2 + i1];
                    y = b[bi + ldb];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                for (; i < k; ++i) {
                    final double x0 = a[a0 + i];
                    final double x1 = a[a1 + i];
                    final double x2 = a[a2 + i];
                    final double y = b[offsetb + col + i * ldb];
                    s0 = Math.fma(x0, y, s0);
                    s1 = Math.fma(x1, y, s1);
                    s2 = Math.fma(x2, y, s2);
                }
                final int c0 = offsetc + row + col * ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s0, beta * c[c0]);
                    c[c0 + 1] = Math.fma(alpha, s1, beta * c[c0 + 1]);
                    c[c0 + 2] = Math.fma(alpha, s2, beta * c[c0 + 2]);
                } else {
                    c[c0] = alpha * s0;
                    c[c0 + 1] = alpha * s1;
                    c[c0 + 2] = alpha * s2;
                }
            }
            // Single remaining elements.
            for (; row < m; ++row) {
                final int a0 = offseta + row * lda;
                double s = 0.0;
                int i = 0;
                for (; i < unrolledEnd; i += unroll) {
                    final int bi = offsetb + col + i * ldb;
                    double x = a[a0 + i];
                    double y = b[bi];
                    s = Math.fma(x, y, s);
                    x = a[a0 + i + 1];
                    y = b[bi + ldb];
                    s = Math.fma(x, y, s);
                }
                for (; i < k; ++i) {
                    final double x = a[a0 + i];
                    final double y = b[offsetb + col + i * ldb];
                    s = Math.fma(x, y, s);
                }
                final int c0 = offsetc + row + col * ldc;
                if (scaleC) {
                    c[c0] = Math.fma(alpha, s, beta * c[c0]);
                } else {
                    c[c0] = alpha * s;
                }
            }
        }
    }

    /**
     * Accumulates {@code alpha * dot(a-row, b-col)} into every element of the
     * {@code [rows, rowe) x [cols, cole)} sub-block of {@code c}.
     *
     * <p>For each {@code row} and {@code col} in that range the update is
     * {@code c[offsetc + row + col * ldc] += alpha * sum}, where {@code sum} is the sum over
     * {@code i} in {@code [is, ie)} of
     * {@code a[offseta + i + row * lda] * b[offsetb + i + col * ldb]}.
     *
     * <p>The block is walked in up to three column phases (leading single columns, groups of
     * three columns, trailing single columns) and, inside each, up to three row phases
     * (leading single rows, groups of three rows, trailing single rows). Full 3x3 tiles are
     * delegated to {@link #sgepdotTN}.
     *
     * <p>{@code m}, {@code n}, {@code k} and {@code beta} are not used here other than being
     * forwarded to {@code sgepdotTN}; in particular {@code c} is never scaled by {@code beta}
     * in this method.
     *
     * <p>NOTE(review): the phase boundaries come from {@code loopAlign}/{@code loopBound}, which
     * are defined outside this block; presumably they round to multiples of the tile size.
     */
    @Override
    protected void sgebpTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        final int colPeelEnd = this.loopAlign(cols, cole, 3);
        final int colBlockEnd = this.loopBound(cole, 3);
        final int rowPeelEnd = this.loopAlign(rows, rowe, 3);
        final int rowBlockEnd = this.loopBound(rowe, 3);

        int col = cols;

        // Phase 1: leading columns, handled one column at a time.
        for (; col < colPeelEnd; ++col) {
            final int bCol = offsetb + col * ldb;
            final int cCol = offsetc + col * ldc;
            int row = rows;

            // Leading rows: 1x1 dot products.
            for (; row < rowPeelEnd; ++row) {
                final int aRow = offseta + row * lda;
                float dot = 0.0f;
                for (int p = is; p < ie; ++p) {
                    dot = Math.fma(a[aRow + p], b[bCol + p], dot);
                }
                c[cCol + row] = Math.fma(alpha, dot, c[cCol + row]);
            }

            // Groups of three rows sharing the same b element.
            for (; row < rowBlockEnd; row += 3) {
                final int aRow0 = offseta + row * lda;
                final int aRow1 = aRow0 + lda;
                final int aRow2 = aRow1 + lda;
                float dot0 = 0.0f;
                float dot1 = 0.0f;
                float dot2 = 0.0f;
                for (int p = is; p < ie; ++p) {
                    final float av0 = a[aRow0 + p];
                    final float av1 = a[aRow1 + p];
                    final float av2 = a[aRow2 + p];
                    final float bv = b[bCol + p];
                    dot0 = Math.fma(av0, bv, dot0);
                    dot1 = Math.fma(av1, bv, dot1);
                    dot2 = Math.fma(av2, bv, dot2);
                }
                c[cCol + row] = Math.fma(alpha, dot0, c[cCol + row]);
                c[cCol + row + 1] = Math.fma(alpha, dot1, c[cCol + row + 1]);
                c[cCol + row + 2] = Math.fma(alpha, dot2, c[cCol + row + 2]);
            }

            // Trailing rows: 1x1 dot products.
            for (; row < rowe; ++row) {
                final int aRow = offseta + row * lda;
                float dot = 0.0f;
                for (int p = is; p < ie; ++p) {
                    dot = Math.fma(a[aRow + p], b[bCol + p], dot);
                }
                c[cCol + row] = Math.fma(alpha, dot, c[cCol + row]);
            }
        }

        // Phase 2: groups of three columns.
        for (; col < colBlockEnd; col += 3) {
            final int bCol0 = offsetb + col * ldb;
            final int bCol1 = bCol0 + ldb;
            final int bCol2 = bCol1 + ldb;
            final int cCol0 = offsetc + col * ldc;
            final int cCol1 = cCol0 + ldc;
            final int cCol2 = cCol1 + ldc;
            int row = rows;

            // Leading rows: one row against three columns.
            for (; row < rowPeelEnd; ++row) {
                final int aRow = offseta + row * lda;
                float dot0 = 0.0f;
                float dot1 = 0.0f;
                float dot2 = 0.0f;
                for (int p = is; p < ie; ++p) {
                    final float av = a[aRow + p];
                    final float bv0 = b[bCol0 + p];
                    final float bv1 = b[bCol1 + p];
                    final float bv2 = b[bCol2 + p];
                    dot0 = Math.fma(av, bv0, dot0);
                    dot1 = Math.fma(av, bv1, dot1);
                    dot2 = Math.fma(av, bv2, dot2);
                }
                c[cCol0 + row] = Math.fma(alpha, dot0, c[cCol0 + row]);
                c[cCol1 + row] = Math.fma(alpha, dot1, c[cCol1 + row]);
                c[cCol2 + row] = Math.fma(alpha, dot2, c[cCol2 + row]);
            }

            // Full 3x3 tiles go to the dedicated kernel.
            for (; row < rowBlockEnd; row += 3) {
                this.sgepdotTN(m, row, row + 3, n, col, col + 3, k, is, ie, alpha, a, offseta, lda, b, offsetb, ldb, beta, c, offsetc, ldc);
            }

            // Trailing rows: one row against three columns.
            for (; row < rowe; ++row) {
                final int aRow = offseta + row * lda;
                float dot0 = 0.0f;
                float dot1 = 0.0f;
                float dot2 = 0.0f;
                for (int p = is; p < ie; ++p) {
                    final float av = a[aRow + p];
                    final float bv0 = b[bCol0 + p];
                    final float bv1 = b[bCol1 + p];
                    final float bv2 = b[bCol2 + p];
                    dot0 = Math.fma(av, bv0, dot0);
                    dot1 = Math.fma(av, bv1, dot1);
                    dot2 = Math.fma(av, bv2, dot2);
                }
                c[cCol0 + row] = Math.fma(alpha, dot0, c[cCol0 + row]);
                c[cCol1 + row] = Math.fma(alpha, dot1, c[cCol1 + row]);
                c[cCol2 + row] = Math.fma(alpha, dot2, c[cCol2 + row]);
            }
        }

        // Phase 3: trailing columns, again one column at a time.
        for (; col < cole; ++col) {
            final int bCol = offsetb + col * ldb;
            final int cCol = offsetc + col * ldc;
            int row = rows;

            // Leading rows: 1x1 dot products.
            for (; row < rowPeelEnd; ++row) {
                final int aRow = offseta + row * lda;
                float dot = 0.0f;
                for (int p = is; p < ie; ++p) {
                    dot = Math.fma(a[aRow + p], b[bCol + p], dot);
                }
                c[cCol + row] = Math.fma(alpha, dot, c[cCol + row]);
            }

            // Groups of three rows sharing the same b element.
            for (; row < rowBlockEnd; row += 3) {
                final int aRow0 = offseta + row * lda;
                final int aRow1 = aRow0 + lda;
                final int aRow2 = aRow1 + lda;
                float dot0 = 0.0f;
                float dot1 = 0.0f;
                float dot2 = 0.0f;
                for (int p = is; p < ie; ++p) {
                    final float av0 = a[aRow0 + p];
                    final float av1 = a[aRow1 + p];
                    final float av2 = a[aRow2 + p];
                    final float bv = b[bCol + p];
                    dot0 = Math.fma(av0, bv, dot0);
                    dot1 = Math.fma(av1, bv, dot1);
                    dot2 = Math.fma(av2, bv, dot2);
                }
                c[cCol + row] = Math.fma(alpha, dot0, c[cCol + row]);
                c[cCol + row + 1] = Math.fma(alpha, dot1, c[cCol + row + 1]);
                c[cCol + row + 2] = Math.fma(alpha, dot2, c[cCol + row + 2]);
            }

            // Trailing rows: 1x1 dot products.
            for (; row < rowe; ++row) {
                final int aRow = offseta + row * lda;
                float dot = 0.0f;
                for (int p = is; p < ie; ++p) {
                    dot = Math.fma(a[aRow + p], b[bCol + p], dot);
                }
                c[cCol + row] = Math.fma(alpha, dot, c[cCol + row]);
            }
        }
    }

    /**
     * 3x3 tile kernel: accumulates nine dot products over {@code [is, ie)} into {@code c}.
     *
     * <p>For {@code r} and {@code q} in {@code 0..2} the update is
     * {@code c[offsetc + (rows + r) + (cols + q) * ldc] += alpha * sum}, where {@code sum} is the
     * sum over {@code i} in {@code [is, ie)} of
     * {@code a[offseta + i + (rows + r) * lda] * b[offsetb + i + (cols + q) * ldb]}.
     *
     * <p>The tile must be exactly 3 rows by 3 columns; this is checked with {@code assert} only.
     * {@code m}, {@code n}, {@code k} and {@code beta} are not read by this method.
     *
     * <p>The {@code i} range is processed as single steps up to {@code loopAlign(is, ie, 2)},
     * then two steps per iteration up to {@code loopBound(ie, 2)}, then single steps up to
     * {@code ie}. NOTE(review): both helpers are defined outside this block; presumably they
     * round to multiples of two.
     */
    @Override
    protected void sgepdotTN(int m, int rows, int rowe, int n, int cols, int cole, int k, int is, int ie, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        assert rowe - rows == 3;
        assert cole - cols == 3;

        // Start offsets of the three a-rows and three b-columns that make up the tile.
        final int aBase0 = offseta + rows * lda;
        final int aBase1 = aBase0 + lda;
        final int aBase2 = aBase1 + lda;
        final int bBase0 = offsetb + cols * ldb;
        final int bBase1 = bBase0 + ldb;
        final int bBase2 = bBase1 + ldb;

        // accRQ accumulates the product of tile row R with tile column Q.
        float acc00 = 0.0f;
        float acc01 = 0.0f;
        float acc02 = 0.0f;
        float acc10 = 0.0f;
        float acc11 = 0.0f;
        float acc12 = 0.0f;
        float acc20 = 0.0f;
        float acc21 = 0.0f;
        float acc22 = 0.0f;

        final int peelEnd = this.loopAlign(is, ie, 2);
        final int pairEnd = this.loopBound(ie, 2);
        int p = is;

        // Leading single steps.
        for (; p < peelEnd; ++p) {
            final float r0 = a[aBase0 + p];
            final float r1 = a[aBase1 + p];
            final float r2 = a[aBase2 + p];
            final float q0 = b[bBase0 + p];
            acc00 = Math.fma(r0, q0, acc00);
            acc10 = Math.fma(r1, q0, acc10);
            acc20 = Math.fma(r2, q0, acc20);
            final float q1 = b[bBase1 + p];
            acc01 = Math.fma(r0, q1, acc01);
            acc11 = Math.fma(r1, q1, acc11);
            acc21 = Math.fma(r2, q1, acc21);
            final float q2 = b[bBase2 + p];
            acc02 = Math.fma(r0, q2, acc02);
            acc12 = Math.fma(r1, q2, acc12);
            acc22 = Math.fma(r2, q2, acc22);
        }

        // Main loop, two consecutive steps (p and p + 1) per iteration.
        for (; p < pairEnd; p += 2) {
            final float r0 = a[aBase0 + p];
            final float r1 = a[aBase1 + p];
            final float r2 = a[aBase2 + p];
            final float q0 = b[bBase0 + p];
            acc00 = Math.fma(r0, q0, acc00);
            acc10 = Math.fma(r1, q0, acc10);
            acc20 = Math.fma(r2, q0, acc20);
            final float q1 = b[bBase1 + p];
            acc01 = Math.fma(r0, q1, acc01);
            acc11 = Math.fma(r1, q1, acc11);
            acc21 = Math.fma(r2, q1, acc21);
            final float q2 = b[bBase2 + p];
            acc02 = Math.fma(r0, q2, acc02);
            acc12 = Math.fma(r1, q2, acc12);
            acc22 = Math.fma(r2, q2, acc22);

            final int pn = p + 1;
            final float s0 = a[aBase0 + pn];
            final float s1 = a[aBase1 + pn];
            final float s2 = a[aBase2 + pn];
            final float t0 = b[bBase0 + pn];
            acc00 = Math.fma(s0, t0, acc00);
            acc10 = Math.fma(s1, t0, acc10);
            acc20 = Math.fma(s2, t0, acc20);
            final float t1 = b[bBase1 + pn];
            acc01 = Math.fma(s0, t1, acc01);
            acc11 = Math.fma(s1, t1, acc11);
            acc21 = Math.fma(s2, t1, acc21);
            final float t2 = b[bBase2 + pn];
            acc02 = Math.fma(s0, t2, acc02);
            acc12 = Math.fma(s1, t2, acc12);
            acc22 = Math.fma(s2, t2, acc22);
        }

        // Trailing single steps.
        for (; p < ie; ++p) {
            final float r0 = a[aBase0 + p];
            final float r1 = a[aBase1 + p];
            final float r2 = a[aBase2 + p];
            final float q0 = b[bBase0 + p];
            acc00 = Math.fma(r0, q0, acc00);
            acc10 = Math.fma(r1, q0, acc10);
            acc20 = Math.fma(r2, q0, acc20);
            final float q1 = b[bBase1 + p];
            acc01 = Math.fma(r0, q1, acc01);
            acc11 = Math.fma(r1, q1, acc11);
            acc21 = Math.fma(r2, q1, acc21);
            final float q2 = b[bBase2 + p];
            acc02 = Math.fma(r0, q2, acc02);
            acc12 = Math.fma(r1, q2, acc12);
            acc22 = Math.fma(r2, q2, acc22);
        }

        // Positions in c of the first tile row within each of the three tile columns.
        final int cCol0 = offsetc + rows + cols * ldc;
        final int cCol1 = cCol0 + ldc;
        final int cCol2 = cCol1 + ldc;

        c[cCol0] = Math.fma(alpha, acc00, c[cCol0]);
        c[cCol1] = Math.fma(alpha, acc01, c[cCol1]);
        c[cCol2] = Math.fma(alpha, acc02, c[cCol2]);
        c[cCol0 + 1] = Math.fma(alpha, acc10, c[cCol0 + 1]);
        c[cCol1 + 1] = Math.fma(alpha, acc11, c[cCol1 + 1]);
        c[cCol2 + 1] = Math.fma(alpha, acc12, c[cCol2 + 1]);
        c[cCol0 + 2] = Math.fma(alpha, acc20, c[cCol0 + 2]);
        c[cCol1 + 2] = Math.fma(alpha, acc21, c[cCol1 + 2]);
        c[cCol2 + 2] = Math.fma(alpha, acc22, c[cCol2 + 2]);
    }

    /**
     * Single-precision matrix product with neither operand transposed.
     *
     * <p>For every {@code row} in {@code [0, m)} and {@code col} in {@code [0, n)} this computes
     * {@code sum}, the sum over {@code i} in {@code [0, k)} of
     * {@code a[offseta + row + i * lda] * b[offsetb + i + col * ldb]}, and stores
     * {@code alpha * sum + beta * c[...]} at {@code c[offsetc + row + col * ldc]}.
     * When {@code beta == 0} the previous contents of {@code c} are not read and the result is
     * just {@code alpha * sum}.
     *
     * <p>The output is processed in 3x3 tiles with the inner dimension unrolled by two; leftover
     * rows, columns and inner steps are handled by the remainder loops.
     *
     * <p>NOTE(review): tile boundaries come from {@code loopBound}, defined outside this block;
     * presumably it rounds its first argument down to a multiple of the second.
     *
     * @param m       number of rows of the result
     * @param n       number of columns of the result
     * @param k       length of the inner (summed) dimension
     * @param alpha   scale applied to the product
     * @param a       storage of the left operand
     * @param offseta index of the first element of the left operand in {@code a}
     * @param lda     stride between consecutive inner-index steps in {@code a}
     * @param b       storage of the right operand
     * @param offsetb index of the first element of the right operand in {@code b}
     * @param ldb     stride between consecutive columns in {@code b}
     * @param beta    scale applied to the existing contents of {@code c}
     * @param c       storage of the result, updated in place
     * @param offsetc index of the first element of the result in {@code c}
     * @param ldc     stride between consecutive columns in {@code c}
     */
    @Override
    protected void sgemmNN(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        final int colLimit = this.loopBound(n, 3);
        final int rowLimit = this.loopBound(m, 3);
        final int innerLimit = this.loopBound(k, 2);

        int col;
        // Panels of three columns.
        for (col = 0; col < colLimit; col += 3) {
            int row;
            // Full 3x3 tiles; sumRQ accumulates tile row R against tile column Q.
            for (row = 0; row < rowLimit; row += 3) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                int i;
                for (i = 0; i < innerLimit; i += 2) {
                    float a00 = a[offseta + row + i * lda];
                    float a10 = a[offseta + (row + 1) + i * lda];
                    float a20 = a[offseta + (row + 2) + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    float b01 = b[offsetb + i + (col + 1) * ldb];
                    float b02 = b[offsetb + i + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    // Previously assigned to sum11, which clobbered it and left sum12 at zero.
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a01 = a[offseta + row + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + col * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                // Leftover inner step when k is not covered by the unrolled loop.
                for (; i < k; ++i) {
                    float a00 = a[offseta + row + i * lda];
                    float a10 = a[offseta + (row + 1) + i * lda];
                    float a20 = a[offseta + (row + 2) + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    float b01 = b[offsetb + i + (col + 1) * ldb];
                    float b02 = b[offsetb + i + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                }
                if (beta != 0.0f) {
                    c[offsetc + row + col * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + row + col * ldc]);
                    c[offsetc + row + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + row + (col + 1) * ldc]);
                    c[offsetc + row + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + row + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + col * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + col * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + col * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + col * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                } else {
                    // beta == 0: overwrite without reading the old contents of c.
                    c[offsetc + row + col * ldc] = alpha * sum00;
                    c[offsetc + row + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + row + (col + 2) * ldc] = alpha * sum02;
                    c[offsetc + (row + 1) + col * ldc] = alpha * sum10;
                    c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                    c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                    c[offsetc + (row + 2) + col * ldc] = alpha * sum20;
                    c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                    c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
                }
            }
            // Leftover rows of this column panel: one row against three columns.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                int i;
                for (i = 0; i < innerLimit; i += 2) {
                    float a00 = a[offseta + row + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    float b01 = b[offsetb + i + (col + 1) * ldb];
                    float b02 = b[offsetb + i + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    float a01 = a[offseta + row + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + col * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + row + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    float b01 = b[offsetb + i + (col + 1) * ldb];
                    float b02 = b[offsetb + i + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                }
                if (beta != 0.0f) {
                    c[offsetc + row + col * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + row + col * ldc]);
                    c[offsetc + row + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + row + (col + 1) * ldc]);
                    c[offsetc + row + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + row + (col + 2) * ldc]);
                } else {
                    c[offsetc + row + col * ldc] = alpha * sum00;
                    c[offsetc + row + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + row + (col + 2) * ldc] = alpha * sum02;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            int row;
            // Three rows against one column.
            for (row = 0; row < rowLimit; row += 3) {
                float sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                int i;
                for (i = 0; i < innerLimit; i += 2) {
                    float a00 = a[offseta + row + i * lda];
                    float a10 = a[offseta + (row + 1) + i * lda];
                    float a20 = a[offseta + (row + 2) + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    float a01 = a[offseta + row + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + col * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + row + i * lda];
                    float a10 = a[offseta + (row + 1) + i * lda];
                    float a20 = a[offseta + (row + 2) + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                }
                if (beta != 0.0f) {
                    c[offsetc + row + col * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + row + col * ldc]);
                    c[offsetc + (row + 1) + col * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + col * ldc]);
                    c[offsetc + (row + 2) + col * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + col * ldc]);
                } else {
                    c[offsetc + row + col * ldc] = alpha * sum00;
                    c[offsetc + (row + 1) + col * ldc] = alpha * sum10;
                    c[offsetc + (row + 2) + col * ldc] = alpha * sum20;
                }
            }
            // Single remaining elements.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                int i;
                for (i = 0; i < innerLimit; i += 2) {
                    float a00 = a[offseta + row + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    float a01 = a[offseta + row + (i + 1) * lda];
                    float b10 = b[offsetb + (i + 1) + col * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + row + i * lda];
                    float b00 = b[offsetb + i + col * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                }
                if (beta != 0.0f) {
                    c[offsetc + row + col * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + row + col * ldc]);
                } else {
                    c[offsetc + row + col * ldc] = alpha * sum00;
                }
            }
        }
    }

    @Override
    protected void sgemmNT(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        float a01;
        float a00;
        int i;
        float sum00;
        int row;
        int col;
        int Trow = 3;
        int Tcol = 3;
        int Ti = 2;
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            float sum02;
            float sum01;
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float b02;
                float b01;
                float b00;
                float a20;
                float a10;
                float a002;
                sum00 = 0.0f;
                sum01 = 0.0f;
                sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a002 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a002, b00, sum00);
                    sum01 = Math.fma(a002, b01, sum01);
                    sum02 = Math.fma(a002, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum11 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a012 = a[offseta + (row + 0) + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a012, b10, sum00);
                    sum01 = Math.fma(a012, b11, sum01);
                    sum02 = Math.fma(a012, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum11 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                while (i < k) {
                    a002 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a002, b00, sum00);
                    sum01 = Math.fma(a002, b01, sum01);
                    sum02 = Math.fma(a002, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum11 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    ++i;
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                    continue;
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
            }
            while (row < m) {
                float b02;
                float b01;
                float b00;
                sum00 = 0.0f;
                sum01 = 0.0f;
                sum02 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                while (i < k) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    ++i;
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
                ++row;
            }
        }
        while (col < n) {
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float b00;
                float a20;
                float a10;
                sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    a01 = a[offseta + (row + 0) + (i + 1) * lda];
                    float a11 = a[offseta + (row + 1) + (i + 1) * lda];
                    float a21 = a[offseta + (row + 2) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                while (i < k) {
                    a00 = a[offseta + (row + 0) + (i + 0) * lda];
                    a10 = a[offseta + (row + 1) + (i + 0) * lda];
                    a20 = a[offseta + (row + 2) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    ++i;
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    continue;
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
            }
            while (row < m) {
                float b00;
                float a003;
                sum00 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    a003 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a003, b00, sum00);
                    float a013 = a[offseta + (row + 0) + (i + 1) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a013, b10, sum00);
                }
                while (i < k) {
                    a003 = a[offseta + (row + 0) + (i + 0) * lda];
                    b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a003, b00, sum00);
                    ++i;
                }
                c[offsetc + (row + 0) + (col + 0) * ldc] = beta != 0.0f ? Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]) : alpha * sum00;
                ++row;
            }
            ++col;
        }
    }

    /**
     * Single-precision matrix-multiply kernel for the case where {@code a} is read
     * transposed and {@code b} is read as stored.
     *
     * <p>For every {@code 0 <= row < m} and {@code 0 <= col < n} this stores
     * <pre>
     *   c[offsetc + row + col * ldc] =
     *         alpha * SUM(i = 0 .. k-1) a[offseta + i + row * lda] * b[offsetb + i + col * ldb]
     *       + beta  * c[offsetc + row + col * ldc]
     * </pre>
     * When {@code beta == 0} the previous contents of {@code c} are never read; the
     * entry is simply overwritten with {@code alpha * sum}.
     *
     * <p>The output is produced in 3x3 tiles with the {@code k} loop unrolled by 2.
     * Rows, columns and {@code k} iterations that do not fill a whole tile / unroll
     * step are handled by the remainder loops that follow each unrolled loop.
     * Every tile entry has its own accumulator ({@code sumRC}); each accumulator is
     * only ever updated from itself so the nine dot products stay independent.
     *
     * @param m       number of rows of the result written to {@code c}
     * @param n       number of columns of the result written to {@code c}
     * @param k       length of each dot product
     * @param alpha   scale applied to each dot product
     * @param a       left operand storage
     * @param offseta index of the first element of the left operand in {@code a}
     * @param lda     stride between consecutive {@code row} slices of {@code a}
     * @param b       right operand storage
     * @param offsetb index of the first element of the right operand in {@code b}
     * @param ldb     stride between consecutive {@code col} slices of {@code b}
     * @param beta    scale applied to the previous contents of {@code c}
     * @param c       result storage, updated in place
     * @param offsetc index of the first element of the result in {@code c}
     * @param ldc     stride between consecutive columns of {@code c}
     */
    @Override
    protected void sgemmTN(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        int col;
        int row;
        int i;
        // Panels of three full output columns.
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            // 3x3 output tiles.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                // Leftover k iteration when k is not a multiple of the unroll factor.
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                } else {
                    // beta == 0: overwrite without reading the old value of c.
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                    c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                    c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                    c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
                }
            }
            // Leftover rows of this column panel: 1x3 strips.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    float b11 = b[offsetb + (i + 1) + (col + 1) * ldb];
                    float b12 = b[offsetb + (i + 1) + (col + 2) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    float b01 = b[offsetb + (i + 0) + (col + 1) * ldb];
                    float b02 = b[offsetb + (i + 0) + (col + 2) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            // 3x1 strips.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                }
            }
            // Single remaining entries.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (i + 1) + (col + 0) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (i + 0) + (col + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                }
            }
        }
    }

    /**
     * Single-precision matrix-multiply kernel for the case where both {@code a} and
     * {@code b} are read transposed.
     *
     * <p>For every {@code 0 <= row < m} and {@code 0 <= col < n} this stores
     * <pre>
     *   c[offsetc + row + col * ldc] =
     *         alpha * SUM(i = 0 .. k-1) a[offseta + i + row * lda] * b[offsetb + col + i * ldb]
     *       + beta  * c[offsetc + row + col * ldc]
     * </pre>
     * When {@code beta == 0} the previous contents of {@code c} are never read; the
     * entry is simply overwritten with {@code alpha * sum}.
     *
     * <p>The output is produced in 3x3 tiles with the {@code k} loop unrolled by 2.
     * Rows, columns and {@code k} iterations that do not fill a whole tile / unroll
     * step are handled by the remainder loops that follow each unrolled loop.
     * Every tile entry has its own accumulator ({@code sumRC}); each accumulator is
     * only ever updated from itself so the nine dot products stay independent.
     *
     * @param m       number of rows of the result written to {@code c}
     * @param n       number of columns of the result written to {@code c}
     * @param k       length of each dot product
     * @param alpha   scale applied to each dot product
     * @param a       left operand storage
     * @param offseta index of the first element of the left operand in {@code a}
     * @param lda     stride between consecutive {@code row} slices of {@code a}
     * @param b       right operand storage
     * @param offsetb index of the first element of the right operand in {@code b}
     * @param ldb     stride between consecutive {@code i} slices of {@code b}
     * @param beta    scale applied to the previous contents of {@code c}
     * @param c       result storage, updated in place
     * @param offsetc index of the first element of the result in {@code c}
     * @param ldc     stride between consecutive columns of {@code c}
     */
    @Override
    protected void sgemmTT(int m, int n, int k, float alpha, float[] a, int offseta, int lda, float[] b, int offsetb, int ldb, float beta, float[] c, int offsetc, int ldc) {
        int col;
        int row;
        int i;
        // Panels of three full output columns.
        for (col = 0; col < this.loopBound(n, 3); col += 3) {
            // 3x3 output tiles.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                float sum10 = 0.0f;
                float sum11 = 0.0f;
                float sum12 = 0.0f;
                float sum20 = 0.0f;
                float sum21 = 0.0f;
                float sum22 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum11 = Math.fma(a11, b11, sum11);
                    sum12 = Math.fma(a11, b12, sum12);
                    sum20 = Math.fma(a21, b10, sum20);
                    sum21 = Math.fma(a21, b11, sum21);
                    sum22 = Math.fma(a21, b12, sum22);
                }
                // Leftover k iteration when k is not a multiple of the unroll factor.
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum11 = Math.fma(a10, b01, sum11);
                    sum12 = Math.fma(a10, b02, sum12);
                    sum20 = Math.fma(a20, b00, sum20);
                    sum21 = Math.fma(a20, b01, sum21);
                    sum22 = Math.fma(a20, b02, sum22);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 1) * ldc] = Math.fma(alpha, sum11, beta * c[offsetc + (row + 1) + (col + 1) * ldc]);
                    c[offsetc + (row + 1) + (col + 2) * ldc] = Math.fma(alpha, sum12, beta * c[offsetc + (row + 1) + (col + 2) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 1) * ldc] = Math.fma(alpha, sum21, beta * c[offsetc + (row + 2) + (col + 1) * ldc]);
                    c[offsetc + (row + 2) + (col + 2) * ldc] = Math.fma(alpha, sum22, beta * c[offsetc + (row + 2) + (col + 2) * ldc]);
                } else {
                    // beta == 0: overwrite without reading the old value of c.
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 1) + (col + 1) * ldc] = alpha * sum11;
                    c[offsetc + (row + 1) + (col + 2) * ldc] = alpha * sum12;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                    c[offsetc + (row + 2) + (col + 1) * ldc] = alpha * sum21;
                    c[offsetc + (row + 2) + (col + 2) * ldc] = alpha * sum22;
                }
            }
            // Leftover rows of this column panel: 1x3 strips.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                float sum01 = 0.0f;
                float sum02 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    float b11 = b[offsetb + (col + 1) + (i + 1) * ldb];
                    float b12 = b[offsetb + (col + 2) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum01 = Math.fma(a01, b11, sum01);
                    sum02 = Math.fma(a01, b12, sum02);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    float b01 = b[offsetb + (col + 1) + (i + 0) * ldb];
                    float b02 = b[offsetb + (col + 2) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum01 = Math.fma(a00, b01, sum01);
                    sum02 = Math.fma(a00, b02, sum02);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 0) + (col + 1) * ldc] = Math.fma(alpha, sum01, beta * c[offsetc + (row + 0) + (col + 1) * ldc]);
                    c[offsetc + (row + 0) + (col + 2) * ldc] = Math.fma(alpha, sum02, beta * c[offsetc + (row + 0) + (col + 2) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 0) + (col + 1) * ldc] = alpha * sum01;
                    c[offsetc + (row + 0) + (col + 2) * ldc] = alpha * sum02;
                }
            }
        }
        // Leftover columns, one at a time.
        for (; col < n; ++col) {
            // 3x1 strips.
            for (row = 0; row < this.loopBound(m, 3); row += 3) {
                float sum00 = 0.0f;
                float sum10 = 0.0f;
                float sum20 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float a11 = a[offseta + (i + 1) + (row + 1) * lda];
                    float a21 = a[offseta + (i + 1) + (row + 2) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                    sum10 = Math.fma(a11, b10, sum10);
                    sum20 = Math.fma(a21, b10, sum20);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float a10 = a[offseta + (i + 0) + (row + 1) * lda];
                    float a20 = a[offseta + (i + 0) + (row + 2) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    sum10 = Math.fma(a10, b00, sum10);
                    sum20 = Math.fma(a20, b00, sum20);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                    c[offsetc + (row + 1) + (col + 0) * ldc] = Math.fma(alpha, sum10, beta * c[offsetc + (row + 1) + (col + 0) * ldc]);
                    c[offsetc + (row + 2) + (col + 0) * ldc] = Math.fma(alpha, sum20, beta * c[offsetc + (row + 2) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                    c[offsetc + (row + 1) + (col + 0) * ldc] = alpha * sum10;
                    c[offsetc + (row + 2) + (col + 0) * ldc] = alpha * sum20;
                }
            }
            // Single remaining entries.
            for (; row < m; ++row) {
                float sum00 = 0.0f;
                for (i = 0; i < this.loopBound(k, 2); i += 2) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                    float a01 = a[offseta + (i + 1) + (row + 0) * lda];
                    float b10 = b[offsetb + (col + 0) + (i + 1) * ldb];
                    sum00 = Math.fma(a01, b10, sum00);
                }
                for (; i < k; ++i) {
                    float a00 = a[offseta + (i + 0) + (row + 0) * lda];
                    float b00 = b[offsetb + (col + 0) + (i + 0) * ldb];
                    sum00 = Math.fma(a00, b00, sum00);
                }
                if (beta != 0.0f) {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = Math.fma(alpha, sum00, beta * c[offsetc + (row + 0) + (col + 0) * ldc]);
                } else {
                    c[offsetc + (row + 0) + (col + 0) * ldc] = alpha * sum00;
                }
            }
        }
    }

    /**
     * Double-precision matrix-vector kernel that reads {@code a} as stored.
     *
     * <p>First, unless {@code beta == 1}, each of the {@code m} visited entries of
     * {@code y} is replaced by {@code beta * y} (or by {@code 0.0} when
     * {@code beta == 0}, without reading the old value). Then, for every column
     * {@code col} of {@code a}, {@code alpha * x[col] * a[row + col * lda]} is
     * accumulated into the entry of {@code y} for {@code row} using fused
     * multiply-adds. Columns are processed four at a time, followed by a
     * one-column remainder loop.
     *
     * <p>A negative {@code incx} / {@code incy} makes the walk over {@code x} /
     * {@code y} start at the far end, {@code (count - 1) * -inc}, and step backwards.
     *
     * @param m       number of entries of {@code y} that are updated
     * @param n       number of entries of {@code x} that are consumed
     * @param alpha   scale applied to every entry of {@code x}
     * @param a       matrix storage, element (row, col) at {@code offseta + row + col * lda}
     * @param offseta index of the first matrix element in {@code a}
     * @param lda     stride between consecutive columns of {@code a}
     * @param x       input vector storage
     * @param offsetx base index into {@code x}
     * @param incx    stride between consecutive entries of {@code x}
     * @param beta    scale applied to the previous contents of {@code y}
     * @param y       output vector storage, updated in place
     * @param offsety base index into {@code y}
     * @param incy    stride between consecutive entries of {@code y}
     */
    @Override
    protected void dgemvN(int m, int n, double alpha, double[] a, int offseta, int lda, double[] x, int offsetx, int incx, double beta, double[] y, int offsety, int incy) {
        // Every pass over y starts from the same position.
        final int yStart = incy < 0 ? (m - 1) * -incy : 0;

        // Pre-scale y by beta so the column sweeps below only have to accumulate.
        if (beta != 1.0) {
            for (int r = 0, yPos = yStart; r < m; ++r, yPos += incy) {
                if (beta != 0.0) {
                    y[offsety + yPos] = beta * y[offsety + yPos];
                } else {
                    y[offsety + yPos] = 0.0;
                }
            }
        }

        int col = 0;
        int xPos = incx < 0 ? (n - 1) * -incx : 0;

        // Four columns per sweep over y.
        for (; col < this.loopBound(n, 4); col += 4, xPos += incx * 4) {
            final double scaled0 = alpha * x[offsetx + xPos + incx * 0];
            final double scaled1 = alpha * x[offsetx + xPos + incx * 1];
            final double scaled2 = alpha * x[offsetx + xPos + incx * 2];
            final double scaled3 = alpha * x[offsetx + xPos + incx * 3];
            for (int r = 0, yPos = yStart; r < m; ++r, yPos += incy) {
                final double e0 = a[offseta + r + (col + 0) * lda];
                final double e1 = a[offseta + r + (col + 1) * lda];
                final double e2 = a[offseta + r + (col + 2) * lda];
                final double e3 = a[offseta + r + (col + 3) * lda];
                // Fold from the last column inwards so the rounding sequence is
                // fma(s0, e0, fma(s1, e1, fma(s2, e2, fma(s3, e3, y)))).
                double acc = y[offsety + yPos];
                acc = Math.fma(scaled3, e3, acc);
                acc = Math.fma(scaled2, e2, acc);
                acc = Math.fma(scaled1, e1, acc);
                acc = Math.fma(scaled0, e0, acc);
                y[offsety + yPos] = acc;
            }
        }

        // Remaining columns, one sweep each.
        for (; col < n; ++col, xPos += incx) {
            final double scaled = alpha * x[offsetx + xPos];
            for (int r = 0, yPos = yStart; r < m; ++r, yPos += incy) {
                y[offsety + yPos] = Math.fma(scaled, a[offseta + r + col * lda], y[offsety + yPos]);
            }
        }
    }

    /**
     * Double-precision matrix-vector kernel that reads {@code a} transposed.
     *
     * <p>For every column {@code col} of {@code a} a dot product
     * {@code sum = SUM(row = 0 .. m-1) x[row] * a[row + col * lda]} is accumulated
     * with fused multiply-adds, and the matching entry of {@code y} becomes
     * {@code alpha * sum + beta * y}. When {@code beta == 0} the old value of
     * {@code y} is not read and the entry is set to {@code alpha * sum}.
     * Columns are processed four at a time, followed by a one-column remainder loop.
     *
     * <p>A negative {@code incx} / {@code incy} makes the walk over {@code x} /
     * {@code y} start at the far end, {@code (count - 1) * -inc}, and step backwards.
     *
     * @param m       number of entries of {@code x} that are consumed
     * @param n       number of entries of {@code y} that are updated
     * @param alpha   scale applied to each dot product
     * @param a       matrix storage, element (row, col) at {@code offseta + row + col * lda}
     * @param offseta index of the first matrix element in {@code a}
     * @param lda     stride between consecutive columns of {@code a}
     * @param x       input vector storage
     * @param offsetx base index into {@code x}
     * @param incx    stride between consecutive entries of {@code x}
     * @param beta    scale applied to the previous contents of {@code y}
     * @param y       output vector storage, updated in place
     * @param offsety base index into {@code y}
     * @param incy    stride between consecutive entries of {@code y}
     */
    @Override
    protected void dgemvT(int m, int n, double alpha, double[] a, int offseta, int lda, double[] x, int offsetx, int incx, double beta, double[] y, int offsety, int incy) {
        // Every pass over x starts from the same position.
        final int xStart = incx < 0 ? (m - 1) * -incx : 0;

        int col = 0;
        int yPos = incy < 0 ? (n - 1) * -incy : 0;

        // Four dot products per sweep over x.
        for (; col < this.loopBound(n, 4); col += 4, yPos += incy * 4) {
            double dot0 = 0.0;
            double dot1 = 0.0;
            double dot2 = 0.0;
            double dot3 = 0.0;
            for (int r = 0, xPos = xStart; r < m; ++r, xPos += incx) {
                final double xv = x[offsetx + xPos];
                dot0 = Math.fma(xv, a[offseta + r + (col + 0) * lda], dot0);
                dot1 = Math.fma(xv, a[offseta + r + (col + 1) * lda], dot1);
                dot2 = Math.fma(xv, a[offseta + r + (col + 2) * lda], dot2);
                dot3 = Math.fma(xv, a[offseta + r + (col + 3) * lda], dot3);
            }
            final int out0 = offsety + yPos + incy * 0;
            final int out1 = offsety + yPos + incy * 1;
            final int out2 = offsety + yPos + incy * 2;
            final int out3 = offsety + yPos + incy * 3;
            if (beta != 0.0) {
                y[out0] = alpha * dot0 + beta * y[out0];
                y[out1] = alpha * dot1 + beta * y[out1];
                y[out2] = alpha * dot2 + beta * y[out2];
                y[out3] = alpha * dot3 + beta * y[out3];
            } else {
                // beta == 0: overwrite without reading the old value of y.
                y[out0] = alpha * dot0;
                y[out1] = alpha * dot1;
                y[out2] = alpha * dot2;
                y[out3] = alpha * dot3;
            }
        }

        // Remaining columns, one dot product each.
        for (; col < n; ++col, yPos += incy) {
            double dot = 0.0;
            for (int r = 0, xPos = xStart; r < m; ++r, xPos += incx) {
                dot = Math.fma(x[offsetx + xPos], a[offseta + r + col * lda], dot);
            }
            if (beta != 0.0) {
                y[offsety + yPos] = alpha * dot + beta * y[offsety + yPos];
            } else {
                y[offsety + yPos] = alpha * dot;
            }
        }
    }

    /**
     * Single-precision kernel for the non-transposed matrix-vector update.
     *
     * <p>First every one of the {@code m} strided entries of {@code y} is replaced by
     * {@code beta * y} (or by {@code 0.0f} when {@code beta} equals zero, without using
     * the old value). Then, column by column, {@code alpha * x[j]} times column
     * {@code j} of {@code a} is added into {@code y} with fused multiply-adds.
     *
     * <p>Columns are handled eight at a time while the column index is below
     * {@code this.loopBound(n, 8)}; the remaining columns are handled one by one.
     * A negative increment makes the corresponding vector start at its far end
     * ({@code (len - 1) * -inc}) and walk backwards.
     *
     * @param m number of rows of {@code a}, i.e. number of entries updated in {@code y}
     * @param n number of columns of {@code a}, i.e. number of entries read from {@code x}
     * @param alpha scale applied to each entry of {@code x}
     * @param a matrix storage, addressed as {@code offseta + row + col * lda}
     * @param offseta index of the first matrix entry in {@code a}
     * @param lda distance between the starts of consecutive columns in {@code a}
     * @param x input vector storage
     * @param offsetx base index into {@code x}
     * @param incx stride between consecutive entries of {@code x}
     * @param beta scale applied to the previous content of {@code y}
     * @param y vector storage updated in place
     * @param offsety base index into {@code y}
     * @param incy stride between consecutive entries of {@code y}
     */
    @Override
    protected void sgemvN(int m, int n, float alpha, float[] a, int offseta, int lda, float[] x, int offsetx, int incx, float beta, float[] y, int offsety, int incy) {
        // Relative start of y; a negative stride begins at the far end of the vector.
        final int yFirst = incy < 0 ? (m - 1) * -incy : 0;

        // Pass 1: y := beta * y (explicit zero when beta is zero).
        for (int i = 0, yPos = yFirst; i < m; ++i, yPos += incy) {
            final int yi = offsety + yPos;
            if (beta != 0.0f) {
                y[yi] = beta * y[yi];
            } else {
                y[yi] = 0.0f;
            }
        }

        int j = 0;
        int xPos = incx < 0 ? (n - 1) * -incx : 0;

        // Pass 2a: eight columns per sweep over y.
        for (; j < this.loopBound(n, 8); j += 8, xPos += incx * 8) {
            final int xi = offsetx + xPos;
            final float s0 = alpha * x[xi + incx * 0];
            final float s1 = alpha * x[xi + incx * 1];
            final float s2 = alpha * x[xi + incx * 2];
            final float s3 = alpha * x[xi + incx * 3];
            final float s4 = alpha * x[xi + incx * 4];
            final float s5 = alpha * x[xi + incx * 5];
            final float s6 = alpha * x[xi + incx * 6];
            final float s7 = alpha * x[xi + incx * 7];
            final int c0 = offseta + (j + 0) * lda;
            final int c1 = offseta + (j + 1) * lda;
            final int c2 = offseta + (j + 2) * lda;
            final int c3 = offseta + (j + 3) * lda;
            final int c4 = offseta + (j + 4) * lda;
            final int c5 = offseta + (j + 5) * lda;
            final int c6 = offseta + (j + 6) * lda;
            final int c7 = offseta + (j + 7) * lda;
            for (int i = 0, yPos = yFirst; i < m; ++i, yPos += incy) {
                final int yi = offsety + yPos;
                // Nested so that column j+7 is folded into y first and column j last.
                y[yi] = Math.fma(s0, a[c0 + i],
                        Math.fma(s1, a[c1 + i],
                        Math.fma(s2, a[c2 + i],
                        Math.fma(s3, a[c3 + i],
                        Math.fma(s4, a[c4 + i],
                        Math.fma(s5, a[c5 + i],
                        Math.fma(s6, a[c6 + i],
                        Math.fma(s7, a[c7 + i], y[yi]))))))));
            }
        }

        // Pass 2b: leftover columns, one at a time.
        for (; j < n; ++j, xPos += incx) {
            final float s = alpha * x[offsetx + xPos];
            final int c = offseta + j * lda;
            for (int i = 0, yPos = yFirst; i < m; ++i, yPos += incy) {
                final int yi = offsety + yPos;
                y[yi] = Math.fma(s, a[c + i], y[yi]);
            }
        }
    }

    /**
     * Single-precision kernel for the transposed matrix-vector update.
     *
     * <p>For every column {@code j} in {@code [0, n)} the FMA-accumulated dot product of
     * the {@code m} entries {@code a[offseta + i + j * lda]} with the strided vector
     * {@code x} is formed, and the matching strided entry of {@code y} is overwritten
     * with {@code alpha * dot + beta * y}. When {@code beta} equals zero the previous
     * content of {@code y} is not read at all.
     *
     * <p>Columns are handled eight at a time while the column index is below
     * {@code this.loopBound(n, 8)}; the remaining columns are handled one by one.
     * A negative increment makes the corresponding vector start at its far end
     * ({@code (len - 1) * -inc}) and walk backwards.
     *
     * @param m number of rows of {@code a}, i.e. number of entries read from {@code x}
     * @param n number of columns of {@code a}, i.e. number of entries written to {@code y}
     * @param alpha scale applied to each dot product
     * @param a matrix storage, addressed as {@code offseta + row + col * lda}
     * @param offseta index of the first matrix entry in {@code a}
     * @param lda distance between the starts of consecutive columns in {@code a}
     * @param x input vector storage
     * @param offsetx base index into {@code x}
     * @param incx stride between consecutive entries of {@code x}
     * @param beta scale applied to the previous content of {@code y}
     * @param y vector storage updated in place
     * @param offsety base index into {@code y}
     * @param incy stride between consecutive entries of {@code y}
     */
    @Override
    protected void sgemvT(int m, int n, float alpha, float[] a, int offseta, int lda, float[] x, int offsetx, int incx, float beta, float[] y, int offsety, int incy) {
        // Relative start positions; a negative stride begins at the far end of the vector.
        final int xFirst = incx < 0 ? (m - 1) * -incx : 0;
        int yPos = incy < 0 ? (n - 1) * -incy : 0;
        int j = 0;

        // Blocked part: eight columns share every load of x.
        for (; j < this.loopBound(n, 8); j += 8, yPos += incy * 8) {
            final int c0 = offseta + (j + 0) * lda;
            final int c1 = offseta + (j + 1) * lda;
            final int c2 = offseta + (j + 2) * lda;
            final int c3 = offseta + (j + 3) * lda;
            final int c4 = offseta + (j + 4) * lda;
            final int c5 = offseta + (j + 5) * lda;
            final int c6 = offseta + (j + 6) * lda;
            final int c7 = offseta + (j + 7) * lda;
            float dot0 = 0.0f;
            float dot1 = 0.0f;
            float dot2 = 0.0f;
            float dot3 = 0.0f;
            float dot4 = 0.0f;
            float dot5 = 0.0f;
            float dot6 = 0.0f;
            float dot7 = 0.0f;
            for (int i = 0, xPos = xFirst; i < m; ++i, xPos += incx) {
                final float xi = x[offsetx + xPos];
                dot0 = Math.fma(xi, a[c0 + i], dot0);
                dot1 = Math.fma(xi, a[c1 + i], dot1);
                dot2 = Math.fma(xi, a[c2 + i], dot2);
                dot3 = Math.fma(xi, a[c3 + i], dot3);
                dot4 = Math.fma(xi, a[c4 + i], dot4);
                dot5 = Math.fma(xi, a[c5 + i], dot5);
                dot6 = Math.fma(xi, a[c6 + i], dot6);
                dot7 = Math.fma(xi, a[c7 + i], dot7);
            }
            final int y0 = offsety + yPos;
            final int y1 = y0 + incy;
            final int y2 = y0 + incy * 2;
            final int y3 = y0 + incy * 3;
            final int y4 = y0 + incy * 4;
            final int y5 = y0 + incy * 5;
            final int y6 = y0 + incy * 6;
            final int y7 = y0 + incy * 7;
            if (beta == 0.0f) {
                // beta of zero: plain overwrite, old y is never read.
                y[y0] = alpha * dot0;
                y[y1] = alpha * dot1;
                y[y2] = alpha * dot2;
                y[y3] = alpha * dot3;
                y[y4] = alpha * dot4;
                y[y5] = alpha * dot5;
                y[y6] = alpha * dot6;
                y[y7] = alpha * dot7;
            } else {
                y[y0] = alpha * dot0 + beta * y[y0];
                y[y1] = alpha * dot1 + beta * y[y1];
                y[y2] = alpha * dot2 + beta * y[y2];
                y[y3] = alpha * dot3 + beta * y[y3];
                y[y4] = alpha * dot4 + beta * y[y4];
                y[y5] = alpha * dot5 + beta * y[y5];
                y[y6] = alpha * dot6 + beta * y[y6];
                y[y7] = alpha * dot7 + beta * y[y7];
            }
        }

        // Remainder: one column per iteration.
        for (; j < n; ++j, yPos += incy) {
            final int c = offseta + j * lda;
            float dot = 0.0f;
            for (int i = 0, xPos = xFirst; i < m; ++i, xPos += incx) {
                dot = Math.fma(x[offsetx + xPos], a[c + i], dot);
            }
            final int yi = offsety + yPos;
            if (beta == 0.0f) {
                y[yi] = alpha * dot;
            } else {
                y[yi] = alpha * dot + beta * y[yi];
            }
        }
    }

    /**
     * Double-precision kernel for the rank-one update
     * {@code a[i][j] += alpha * x[i] * y[j]} over an {@code m}-by-{@code n} matrix
     * stored as {@code a[offseta + i + j * lda]}.
     *
     * <p>Columns are handled four at a time while the column index is below
     * {@code this.loopBound(n, 4)}; the remaining columns are handled one by one.
     * Each matrix entry is updated with a single fused multiply-add.
     *
     * <p>A negative increment makes the corresponding vector start at its far end and
     * walk backwards. {@code x} is traversed once per row, so it holds {@code m}
     * entries and its backwards start is {@code (m - 1) * -incx}; {@code y} is
     * traversed once per column and starts at {@code (n - 1) * -incy}.
     *
     * @param m number of rows of {@code a}, i.e. number of entries read from {@code x}
     * @param n number of columns of {@code a}, i.e. number of entries read from {@code y}
     * @param alpha scale applied to the outer product
     * @param x row-side vector storage
     * @param offsetx base index into {@code x}
     * @param incx stride between consecutive entries of {@code x}
     * @param y column-side vector storage
     * @param offsety base index into {@code y}
     * @param incy stride between consecutive entries of {@code y}
     * @param a matrix storage updated in place
     * @param offseta index of the first matrix entry in {@code a}
     * @param lda distance between the starts of consecutive columns in {@code a}
     */
    @Override
    protected void dgerK(int m, int n, double alpha, double[] x, int offsetx, int incx, double[] y, int offsety, int incy, double[] a, int offseta, int lda) {
        // x is indexed by row, so its length (and hence its backwards start) is governed by m, not n.
        final int xFirst = incx < 0 ? (m - 1) * -incx : 0;
        int iy = incy < 0 ? (n - 1) * -incy : 0;
        int col = 0;

        // Blocked part: four columns share every load of x.
        for (; col < this.loopBound(n, 4); col += 4, iy += incy * 4) {
            final double alphay0 = alpha * y[offsety + iy + incy * 0];
            final double alphay1 = alpha * y[offsety + iy + incy * 1];
            final double alphay2 = alpha * y[offsety + iy + incy * 2];
            final double alphay3 = alpha * y[offsety + iy + incy * 3];
            final int c0 = offseta + (col + 0) * lda;
            final int c1 = offseta + (col + 1) * lda;
            final int c2 = offseta + (col + 2) * lda;
            final int c3 = offseta + (col + 3) * lda;
            for (int row = 0, jx = xFirst; row < m; ++row, jx += incx) {
                final double xjx = x[offsetx + jx];
                a[c0 + row] = Math.fma(alphay0, xjx, a[c0 + row]);
                a[c1 + row] = Math.fma(alphay1, xjx, a[c1 + row]);
                a[c2 + row] = Math.fma(alphay2, xjx, a[c2 + row]);
                a[c3 + row] = Math.fma(alphay3, xjx, a[c3 + row]);
            }
        }

        // Remainder: one column per iteration.
        for (; col < n; ++col, iy += incy) {
            final double alphay = alpha * y[offsety + iy];
            final int c = offseta + col * lda;
            for (int row = 0, jx = xFirst; row < m; ++row, jx += incx) {
                a[c + row] = Math.fma(alphay, x[offsetx + jx], a[c + row]);
            }
        }
    }

    /**
     * Single-precision kernel for the rank-one update
     * {@code a[i][j] += alpha * x[i] * y[j]} over an {@code m}-by-{@code n} matrix
     * stored as {@code a[offseta + i + j * lda]}.
     *
     * <p>Columns are handled four at a time while the column index is below
     * {@code this.loopBound(n, 4)}; the remaining columns are handled one by one.
     * Each matrix entry is updated with a single fused multiply-add.
     *
     * <p>A negative increment makes the corresponding vector start at its far end and
     * walk backwards. {@code x} is traversed once per row, so it holds {@code m}
     * entries and its backwards start is {@code (m - 1) * -incx}; {@code y} is
     * traversed once per column and starts at {@code (n - 1) * -incy}.
     *
     * @param m number of rows of {@code a}, i.e. number of entries read from {@code x}
     * @param n number of columns of {@code a}, i.e. number of entries read from {@code y}
     * @param alpha scale applied to the outer product
     * @param x row-side vector storage
     * @param offsetx base index into {@code x}
     * @param incx stride between consecutive entries of {@code x}
     * @param y column-side vector storage
     * @param offsety base index into {@code y}
     * @param incy stride between consecutive entries of {@code y}
     * @param a matrix storage updated in place
     * @param offseta index of the first matrix entry in {@code a}
     * @param lda distance between the starts of consecutive columns in {@code a}
     */
    @Override
    protected void sgerK(int m, int n, float alpha, float[] x, int offsetx, int incx, float[] y, int offsety, int incy, float[] a, int offseta, int lda) {
        // x is indexed by row, so its length (and hence its backwards start) is governed by m, not n.
        final int xFirst = incx < 0 ? (m - 1) * -incx : 0;
        int iy = incy < 0 ? (n - 1) * -incy : 0;
        int col = 0;

        // Blocked part: four columns share every load of x.
        for (; col < this.loopBound(n, 4); col += 4, iy += incy * 4) {
            final float alphay0 = alpha * y[offsety + iy + incy * 0];
            final float alphay1 = alpha * y[offsety + iy + incy * 1];
            final float alphay2 = alpha * y[offsety + iy + incy * 2];
            final float alphay3 = alpha * y[offsety + iy + incy * 3];
            final int c0 = offseta + (col + 0) * lda;
            final int c1 = offseta + (col + 1) * lda;
            final int c2 = offseta + (col + 2) * lda;
            final int c3 = offseta + (col + 3) * lda;
            for (int row = 0, jx = xFirst; row < m; ++row, jx += incx) {
                final float xjx = x[offsetx + jx];
                a[c0 + row] = Math.fma(alphay0, xjx, a[c0 + row]);
                a[c1 + row] = Math.fma(alphay1, xjx, a[c1 + row]);
                a[c2 + row] = Math.fma(alphay2, xjx, a[c2 + row]);
                a[c3 + row] = Math.fma(alphay3, xjx, a[c3 + row]);
            }
        }

        // Remainder: one column per iteration.
        for (; col < n; ++col, iy += incy) {
            final float alphay = alpha * y[offsety + iy];
            final int c = offseta + col * lda;
            for (int row = 0, jx = xFirst; row < m; ++row, jx += incx) {
                a[c + row] = Math.fma(alphay, x[offsetx + jx], a[c + row]);
            }
        }
    }

    /**
     * Double-precision kernel returning the square root of the sum of squares of
     * the strided entries {@code x[offsetx + k * incx]}.
     *
     * <p>Entries are squared and accumulated with fused multiply-adds into four
     * independent partial sums, four entries per iteration, while the running index is
     * below {@code this.loopBound(n, 4) * incx}; the partial sums are then added
     * together and the remaining entries (up to {@code n * incx}) are folded into
     * that total one by one. The unit-stride case has its own loop without the
     * stride multiplications.
     *
     * <p>The squares are accumulated directly, without any rescaling of the inputs.
     *
     * @param n number of entries to visit
     * @param x vector storage
     * @param offsetx base index into {@code x}
     * @param incx stride between consecutive entries of {@code x}
     * @return {@code Math.sqrt} of the accumulated sum of squares
     */
    @Override
    protected double dnrm2K(int n, double[] x, int offsetx, int incx) {
        // Must start at zero on every path: both the strided loop and the tail loop read it.
        int ix = 0;
        double sum0 = 0.0;
        double sum1 = 0.0;
        double sum2 = 0.0;
        double sum3 = 0.0;
        if (incx == 1) {
            for (; ix < this.loopBound(n, 4); ix += 4) {
                final double x0 = x[offsetx + ix + 0];
                final double x1 = x[offsetx + ix + 1];
                final double x2 = x[offsetx + ix + 2];
                final double x3 = x[offsetx + ix + 3];
                sum0 = Math.fma(x0, x0, sum0);
                sum1 = Math.fma(x1, x1, sum1);
                sum2 = Math.fma(x2, x2, sum2);
                sum3 = Math.fma(x3, x3, sum3);
            }
        } else {
            for (; ix < this.loopBound(n, 4) * incx; ix += 4 * incx) {
                final double x0 = x[offsetx + ix + 0 * incx];
                final double x1 = x[offsetx + ix + 1 * incx];
                final double x2 = x[offsetx + ix + 2 * incx];
                final double x3 = x[offsetx + ix + 3 * incx];
                sum0 = Math.fma(x0, x0, sum0);
                sum1 = Math.fma(x1, x1, sum1);
                sum2 = Math.fma(x2, x2, sum2);
                sum3 = Math.fma(x3, x3, sum3);
            }
        }
        double sum = sum0 + sum1 + sum2 + sum3;
        // Tail: entries left over after the four-wide loop.
        for (; ix < n * incx; ix += incx) {
            final double xi = x[offsetx + ix];
            sum = Math.fma(xi, xi, sum);
        }
        return Math.sqrt(sum);
    }

    /**
     * Single-precision kernel returning the square root of the sum of squares of
     * the strided entries {@code x[offsetx + k * incx]}.
     *
     * <p>Entries are squared and accumulated with fused multiply-adds into four
     * independent {@code float} partial sums, four entries per iteration, while the
     * running index is below {@code this.loopBound(n, 4) * incx}; the partial sums are
     * then added together and the remaining entries (up to {@code n * incx}) are
     * folded into that total one by one. The unit-stride case has its own loop without
     * the stride multiplications.
     *
     * <p>The squares are accumulated directly, without any rescaling of the inputs.
     *
     * @param n number of entries to visit
     * @param x vector storage
     * @param offsetx base index into {@code x}
     * @param incx stride between consecutive entries of {@code x}
     * @return {@code Math.sqrt} of the accumulated sum of squares, narrowed to {@code float}
     */
    @Override
    protected float snrm2K(int n, float[] x, int offsetx, int incx) {
        // Must start at zero on every path: both the strided loop and the tail loop read it.
        int ix = 0;
        float sum0 = 0.0f;
        float sum1 = 0.0f;
        float sum2 = 0.0f;
        float sum3 = 0.0f;
        if (incx == 1) {
            for (; ix < this.loopBound(n, 4); ix += 4) {
                final float x0 = x[offsetx + ix + 0];
                final float x1 = x[offsetx + ix + 1];
                final float x2 = x[offsetx + ix + 2];
                final float x3 = x[offsetx + ix + 3];
                sum0 = Math.fma(x0, x0, sum0);
                sum1 = Math.fma(x1, x1, sum1);
                sum2 = Math.fma(x2, x2, sum2);
                sum3 = Math.fma(x3, x3, sum3);
            }
        } else {
            for (; ix < this.loopBound(n, 4) * incx; ix += 4 * incx) {
                final float x0 = x[offsetx + ix + 0 * incx];
                final float x1 = x[offsetx + ix + 1 * incx];
                final float x2 = x[offsetx + ix + 2 * incx];
                final float x3 = x[offsetx + ix + 3 * incx];
                sum0 = Math.fma(x0, x0, sum0);
                sum1 = Math.fma(x1, x1, sum1);
                sum2 = Math.fma(x2, x2, sum2);
                sum3 = Math.fma(x3, x3, sum3);
            }
        }
        float sum = sum0 + sum1 + sum2 + sum3;
        // Tail: entries left over after the four-wide loop.
        for (; ix < n * incx; ix += incx) {
            final float xi = x[offsetx + ix];
            sum = Math.fma(xi, xi, sum);
        }
        return (float)Math.sqrt(sum);
    }
}

