使用x86 SIMD指令对应用程序进行优化并分析

图像格式转化

YUV2RGB:

$$ \begin{bmatrix} R\\ G\\ B \end{bmatrix} = \begin{bmatrix} 1.164383 & 0 & 1.596027 \\ 1.164383 & -0.391762 & -0.812968 \\ 1.164383 & 2.017232 & 0 \end{bmatrix} \begin{bmatrix} Y-16\\ U-128\\ V-128 \end{bmatrix} $$

Alpha 混合:

$$ \begin{bmatrix} R^\prime\\G^\prime\\B^\prime \end{bmatrix} = \begin{bmatrix} A/256 & 0 & 0 \\ 0 & A/256 & 0 \\ 0 & 0 & A/256 \\ \end{bmatrix} \begin{bmatrix} R\\G\\B \end{bmatrix} $$

RGB2YUV:

$$ \begin{bmatrix} Y\\U\\V \end{bmatrix} = \begin{bmatrix} 0.256788 & 0.504129 & 0.097906 \\ -0.148223 & -0.290993 & 0.439216 \\ 0.439216 & -0.367788 & -0.071427 \end{bmatrix} \begin{bmatrix} R^\prime\\G^\prime\\B^\prime \end{bmatrix} + \begin{bmatrix} 16\\128\\128 \end{bmatrix} $$

基础ISA

直接实现上述矩阵乘法即可。源代码如下:

// Scalar reference implementation: decode the YUV420 input to RGB once,
// then for every output frame apply alpha blending (A = 3*num + 1) and
// convert back to YUV420.
// Reads the planar globals y/u/v, writes `result`; `between` clamps a
// value to [0, 255]. Interface unchanged.
void BasicProcessor() {
    auto *data = new Pixel[size];
    // --- Stage 1: YUV -> RGB (BT.601, studio swing) ---
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            int offset = i * width + j;
            int yi = (uint8_t) y[i][j];
            // 4:2:0 chroma: one U/V sample covers a 2x2 luma block.
            int ui = (uint8_t) u[i / 2][j / 2];
            int vi = (uint8_t) v[i / 2][j / 2];
            // Hoist the shared luma term instead of recomputing it for
            // every channel; the value (and the truncated per-channel
            // results) are bit-identical to the original expressions.
            double luma = 1.164383 * (yi - 16);
            int r = luma + 1.596027 * (vi - 128);
            int b = luma + 2.017232 * (ui - 128);
            int g = luma - 0.391762 * (ui - 128) - 0.812968 * (vi - 128);
            data[offset].r = between(r);
            data[offset].b = between(b);
            data[offset].g = between(g);
        }
    }

    // --- Stage 2: per-frame alpha blend + RGB -> YUV ---
    for (int num = 0; num < frame_num; num++) {
        int a = num * 3 + 1;       // alpha ramps up across frames
        double alpha = a / 256.0;  // loop-invariant; /256 is exact, so
                                   // r*alpha == (r*a)/256.0 bit-for-bit
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                int offset = i * width + j;
                int r = data[offset].r * alpha;
                int b = data[offset].b * alpha;
                int g = data[offset].g * alpha;
                int yi = 0.256788 * r + 0.504129 * g + 0.097906 * b + 16;
                int ui = -0.148223 * r - 0.290993 * g + 0.439216 * b + 128;
                int vi = 0.439216 * r - 0.367788 * g - 0.071427 * b + 128;
                // NOTE(review): Y/U/V are stored unclamped; with valid
                // 8-bit RGB they should stay in range, but between()
                // would be safer — confirm intent.
                result[num][0][offset] = yi;
                // 4:2:0 output: each chroma slot is written 4 times
                // (once per pixel of its 2x2 block); last write wins.
                result[num][1][(i / 2) * (width / 2) + (j / 2)] = ui;
                result[num][2][(i / 2) * (width / 2) + (j / 2)] = vi;
            }
        }
    }
    delete[] data;
}

MMX扩展指令

MMX是一个比较老的指令集,只支持整数运算,使用64位寄存器,在C++中可以通过引用mmintrin.h头文件来使用。由于原本运算都是浮点数运算,这里我参考stackoverflow给出的算法,给出了使用整数并行运算(定点数乘加)的实现。源代码如下:

// Same pipeline as BasicProcessor, but both color-space conversions are
// done in 16-bit fixed-point arithmetic with MMX intrinsics.
// The integer coefficients are the float matrices scaled by 256
// (8.8 fixed point): 298 ≈ 1.164383*256, 409 ≈ 1.596027*256, etc.
void MMXProcessor() {
    auto *data = new __m64[size];
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            auto yi = (uint8_t) y[i][j];
            // 4:2:0 input: one U/V sample per 2x2 luma block.
            auto ui = (uint8_t) u[i / 2][j / 2];
            auto vi = (uint8_t) v[i / 2][j / 2];
            // yuv = (Y-16, U-128, V-128, 0) as packed 16-bit lanes.
            __m64 yuv = _mm_setr_pi16(16, 128, 128, 0);
            yuv = _mm_sub_pi16(_mm_setr_pi16(yi, ui, vi, 0), yuv);
            // madd multiplies the four 16-bit lanes pairwise and adds
            // adjacent products, leaving two 32-bit partial sums.
            __m64 r = _mm_madd_pi16(yuv, _mm_setr_pi16(298, 0, 409, 0));
            __m64 g = _mm_madd_pi16(yuv, _mm_setr_pi16(298, -100, -208, 0));
            __m64 b = _mm_madd_pi16(yuv, _mm_setr_pi16(298, 516, 0, 0));
            // Fold the low/high 32-bit halves to finish each dot product.
            int rr = (_mm_cvtsi64_si32(r) + int(_mm_cvtm64_si64(r) >> 32));
            int gg = (_mm_cvtsi64_si32(g) + int(_mm_cvtm64_si64(g) >> 32));
            int bb = (_mm_cvtsi64_si32(b) + int(_mm_cvtm64_si64(b) >> 32));
            // (+128) >> 8 rounds the 8.8 fixed-point value back to an
            // integer; R and G are handled together in packed 32-bit lanes.
            __m64 val = _mm_setr_pi32(rr, gg);
            val = _mm_add_pi32(val, _mm_setr_pi32(128, 128));
            val = _mm_srai_pi32(val, 8);
            int ri = between(_mm_cvtsi64_si32(val));
            int gi = between(int(_mm_cvtm64_si64(val)>>32));
            int bi = between((bb + 128) >> 8);
            // Cache the clamped RGB pixel as packed 16-bit lanes.
            data[i * width + j] = _mm_setr_pi16(ri, gi, bi, 0);
        }
    }
    // Alpha blend + RGB -> YUV with the same fixed-point scheme
    // (66 ≈ 0.256788*256, 129 ≈ 0.504129*256, ...).
    for (int num = 0; num < frame_num; num++) {
        int a = num * 3 + 1;  // per-frame alpha
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                int offset = i * width + j;
                // rgb = (channel * a) >> 8 per 16-bit lane.
                // NOTE(review): _mm_mullo_pi16 keeps only the low 16 bits,
                // so 255*a overflows once a grows past ~257 — confirm
                // frame_num keeps `a` small enough.
                __m64 rgb = _mm_srli_pi16(_mm_mullo_pi16(data[offset], _mm_set1_pi16(a)), 8);
                __m64 yi = (_mm_madd_pi16(rgb, _mm_setr_pi16(66, 129, 25, 0)));
                __m64 ui = (_mm_madd_pi16(rgb, _mm_setr_pi16(-38, -74, 112, 0)));
                __m64 vi = (_mm_madd_pi16(rgb, _mm_setr_pi16(112, -94, -18, 0)));
                int yy = (_mm_cvtsi64_si32(yi) + int(_mm_cvtm64_si64(yi) >> 32));
                int uu = (_mm_cvtsi64_si32(ui) + int(_mm_cvtm64_si64(ui) >> 32));
                int vv = (_mm_cvtsi64_si32(vi) + int(_mm_cvtm64_si64(vi) >> 32));
                // Round Y and U in parallel, then add the 16/128 offsets.
                __m64 val = _mm_setr_pi32(yy, uu);
                val = _mm_add_pi32(val, _mm_setr_pi32(128, 128));
                val = _mm_srai_pi32(val, 8);
                val = _mm_add_pi32(val, _mm_setr_pi32(16, 128));
                // ri/gi/bi actually hold Y/U/V here — the names are
                // leftovers from the first loop.
                int ri = between(_mm_cvtsi64_si32(val));
                int gi = between(int(_mm_cvtm64_si64(val)>>32));
                int bi = between(((vv + 128) >> 8) + 128);
                result[num][0][offset] = ri;
                result[num][1][(i / 2) * (width / 2) + j / 2] = gi;
                result[num][2][(i / 2) * (width / 2) + j / 2] = bi;
            }
        }
    }
    delete[] data;
};

SSE2扩展指令

SSE和SSE2指令集使用128位寄存器,可同时处理4个单精度浮点数或2个双精度浮点数。由于图像转化过程精度要求不高,所以可以简单地将程序并行化。在C++中可以引用emmintrin.h头文件;需要说明的是,其中用到的点积指令_mm_dp_ps实际属于SSE4.1扩展,需要额外引用smmintrin.h头文件。源代码如下:

// Same pipeline in single-precision floats on 128-bit registers, one
// pixel per vector (lane 3 is unused padding).
// NOTE(review): _mm_dp_ps is an SSE4.1 instruction (smmintrin.h), not
// SSE2 — the article labels this the "SSE2" version, but it needs
// SSE4.1 to compile and run.
void SSE2Processor() {
    auto *data = new __m128[size];
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            auto yi = (uint8_t) y[i][j];
            // 4:2:0 input: one U/V sample per 2x2 luma block.
            auto ui = (uint8_t) u[i / 2][j / 2];
            auto vi = (uint8_t) v[i / 2][j / 2];
            __m128 rgb = _mm_setr_ps(yi, ui, vi, 0);
            float r, g, b;
            // Subtract the (16, 128, 128) offsets in one shot.
            rgb = _mm_sub_ps(rgb, _mm_setr_ps(16, 128, 128, 0));
            // Dot-product mask 0b01110001: multiply lanes 0..2 (lane 3
            // ignored) and broadcast the sum into lane 0 only, which
            // _mm_store_ss then extracts.
            _mm_store_ss(&r, _mm_dp_ps(rgb, _mm_setr_ps(1.164383, 0, 1.596027, 0), 0b01110001));
            _mm_store_ss(&g, _mm_dp_ps(rgb, _mm_setr_ps(1.164383, -0.391762, -0.812968, 0), 0b01110001));
            _mm_store_ss(&b, _mm_dp_ps(rgb, _mm_setr_ps(1.164383, 2.017232, 0, 0), 0b01110001));
            r = between(r);
            g = between(g);
            b = between(b);
            // Cache the clamped RGB pixel for the per-frame loop below.
            data[i * width + j] = _mm_setr_ps(r, g, b, 0);
        }
    }

    // Alpha blend + RGB -> YUV, again via SSE4.1 dot products.
    for (int num = 0; num < frame_num; num++) {
        int a = num * 3 + 1;  // per-frame alpha
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                int offset = i * width + j;
                // Scale all three channels by a/256 at once.
                __m128 rgba = _mm_mul_ps(data[offset], _mm_set1_ps(a / 256.0));
                float yi, ui, vi;
                _mm_store_ss(&yi, _mm_dp_ps(rgba, _mm_setr_ps(0.256788, 0.504129, 0.097906, 0), 0b01110001));
                _mm_store_ss(&ui, _mm_dp_ps(rgba, _mm_setr_ps(-0.148223, -0.290993, 0.439216, 0), 0b01110001));
                _mm_store_ss(&vi, _mm_dp_ps(rgba, _mm_setr_ps(0.439216, -0.367788, -0.071427, 0), 0b01110001));
                // Add the 16/128 offsets; chroma is written once per
                // pixel of its 2x2 block (last write wins).
                result[num][0][offset] = yi + 16;
                result[num][1][(i / 2) * (width / 2) + j / 2] = ui + 128;
                result[num][2][(i / 2) * (width / 2) + j / 2] = vi + 128;
            }
        }
    }
    delete[] data;
};

AVX扩展指令

AVX扩展指令使用256位寄存器,可同时处理8个单精度浮点数,这里我每次使用3个256位寄存器,一次处理8个数据点,极大的提高了并行效率。在C++中可以引用immintrin.h头文件。源代码如下:

// Eight pixels per iteration on 256-bit registers: three __m256 vectors
// (R, G, B) hold one channel for 8 pixels each. The snippet is abridged
// by the author — the "..." sections (loading yi0..yi7 / ui0..ui7 /
// vi0..vi7 and storing the converted results) are elided.
void AVXProcessor() {
    // 3 channel-vectors per 8 pixels => size * 3 / 8 vectors total.
    auto *data = new __m256[size * 3 / 8];
    int cont = 0;
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            ...
            
            // luma term 1.164383 * (Y - 16) for all 8 pixels at once
            __m256 y0 = _mm256_setr_ps(yi0, yi1, yi2, yi3, yi4, yi5, yi6, yi7);
            y0 = _mm256_sub_ps(y0, _mm256_set1_ps(16));
            y0 = _mm256_mul_ps(y0, _mm256_set1_ps(1.164383));

            __m256 u0 = _mm256_setr_ps(ui0, ui1, ui2, ui3, ui4, ui5, ui6, ui7);
            u0 = _mm256_sub_ps(u0, _mm256_set1_ps(128));

            __m256 v0 = _mm256_setr_ps(vi0, vi1, vi2, vi3, vi4, vi5, vi6, vi7);
            v0 = _mm256_sub_ps(v0, _mm256_set1_ps(128));

            // YUV -> RGB matrix rows, expanded as mul/add chains.
            __m256 r = _mm256_add_ps(y0, _mm256_mul_ps(v0, _mm256_set1_ps(1.596027)));
            __m256 g = _mm256_add_ps(y0, _mm256_add_ps(_mm256_mul_ps(u0, _mm256_set1_ps(-0.391762)),
                                                       _mm256_mul_ps(v0, _mm256_set1_ps(-0.812968))));
            __m256 b = _mm256_add_ps(y0, _mm256_mul_ps(u0, _mm256_set1_ps(2.017232)));
            __m256 zero = _mm256_set1_ps(0);
            __m256 max = _mm256_set1_ps(255);

            // Vectorized clamp to [0, 255] (replaces between()).
            r = _mm256_max_ps(r, zero);
            r = _mm256_min_ps(r, max);
            g = _mm256_max_ps(g, zero);
            g = _mm256_min_ps(g, max);
            b = _mm256_max_ps(b, zero);
            b = _mm256_min_ps(b, max);

            // Store R, G, B vectors consecutively for the next stage.
            data[cont++] = r;
            data[cont++] = g;
            data[cont++] = b;
        }
    }


    // Alpha blend + RGB -> YUV, still 8 pixels per iteration.
    for (int num = 0; num < frame_num; num++) {
        int a = num * 3 + 1;  // per-frame alpha
        cont = 0;
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {
                __m256 r = data[cont++];
                __m256 g = data[cont++];
                __m256 b = data[cont++];

                r = _mm256_mul_ps(r, _mm256_set1_ps(a / 256.0));
                g = _mm256_mul_ps(g, _mm256_set1_ps(a / 256.0));
                b = _mm256_mul_ps(b, _mm256_set1_ps(a / 256.0));

                // RGB -> YUV matrix rows plus the 16/128 offsets.
                __m256 yi = _mm256_add_ps(_mm256_set1_ps(16),
                                          _mm256_add_ps(_mm256_mul_ps(r, _mm256_set1_ps(0.256788)),
                                                        _mm256_add_ps(_mm256_mul_ps(g, _mm256_set1_ps(0.504129)),
                                                                      _mm256_mul_ps(b, _mm256_set1_ps(0.097906)))));
                __m256 ui = _mm256_add_ps(_mm256_set1_ps(128),
                                          _mm256_add_ps(_mm256_mul_ps(r, _mm256_set1_ps(-0.148223)),
                                                        _mm256_add_ps(_mm256_mul_ps(g, _mm256_set1_ps(-0.290993)),
                                                                      _mm256_mul_ps(b, _mm256_set1_ps(0.439216)))));
                __m256 vi = _mm256_add_ps(_mm256_set1_ps(128),
                                          _mm256_add_ps(_mm256_mul_ps(r, _mm256_set1_ps(0.439216)),
                                                        _mm256_add_ps(_mm256_mul_ps(g, _mm256_set1_ps(-0.367788)),
                                                                      _mm256_mul_ps(b, _mm256_set1_ps(-0.071427)))));

                // NOTE(review): _mm256_cvtps_epi32 returns __m256i, which
                // cannot be assigned to __m256 — as written this would not
                // compile; presumably the author's real code used __m256i
                // temporaries here. Confirm against the original source.
                yi = _mm256_cvtps_epi32(yi);
                ui = _mm256_cvtps_epi32(ui);
                vi = _mm256_cvtps_epi32(vi);

                ...
            }
        }
    }
    delete[] data;
};

运行结果

使用ffplay播放生成文件,仅凭肉眼看不出四个指令集生成文件的差异。对四个指令集运行时间统计如下表:

| 源文件 | BASIC | MMX | SSE2 | AVX |
| --- | --- | --- | --- | --- |
| dem1.yuv | 1230ms | 1243ms | 648ms | 375ms |
| dem2.yuv | 1274ms | 1281ms | 635ms | 332ms |

从上表中可以看出,SSE2和AVX指令对图像处理速度有很大提升,而MMX指令集相对基础指令集并没有太大改进。下面对这个结果做一个原因分析:

  1. MMX使用整数运算,相对基础指令集并没有有效减少指令数。
  2. SSE2相对BASIC实现了2倍的性能提升,这是因为SSE2中采用SIMD将点积运算并行化了。
  3. AVX相对SSE2也有2倍性能提升,这是因为AVX中利用循环展开的方法,将原本的8次循环转化为1次循环,一次处理8个数据点,极大的实现了并行计算技术。

设计自定义扩展指令对SIMD应用优化并分析

需求

设计若干32位宽的扩展指令,支持8个宽度为256位的SIMD指令专用寄存器,支持8/16/32位pack、unpack计算,支持加/减/乘法,支持饱和计算,支持必要的数据传输指令。

自设计指令的编码、助记符,以及语义

一共有8个SIMD寄存器,分别标号m0~m7,则需要3位标示一个SIMD寄存器。为尽可能与RISCV其他扩展指令集兼容,OPCODE域仍取7位。为了满足SIMD的扩展性,使用5位标示一个SIMD寄存器,则可扩展至32个SIMD寄存器,个数与x、f寄存器相同。

| 编码 | 助记符 | 语义 |
| --- | --- | --- |
| 0000000 mrs2 mrs1 000 mrd 0000100 | addri8p | mrs1和mrs2按packed 8位int相加放入mrd |
| 0000000 mrs2 mrs1 001 mrd 0000100 | addri8ps | mrs1和mrs2按packed 8位int饱和相加放入mrd |
| 0000001 mrs2 mrs1 000 mrd 0000100 | addri16p | mrs1和mrs2按packed 16位int相加放入mrd |
| 0000001 mrs2 mrs1 001 mrd 0000100 | addri16ps | mrs1和mrs2按packed 16位int饱和相加放入mrd |
| 0000010 mrs2 mrs1 000 mrd 0000100 | addri32p | mrs1和mrs2按packed 32位int相加放入mrd |
| 0000010 mrs2 mrs1 001 mrd 0000100 | addri32ps | mrs1和mrs2按packed 32位int饱和相加放入mrd |
| 0000011 mrs2 mrs1 000 mrd 0000100 | addri64p | mrs1和mrs2按packed 64位int相加放入mrd |
| 0000011 mrs2 mrs1 001 mrd 0000100 | addri64ps | mrs1和mrs2按packed 64位int饱和相加放入mrd |
| 0000110 mrs2 mrs1 000 mrd 0000100 | addrf32p | mrs1和mrs2按packed 32位float相加放入mrd |
| 0000111 mrs2 mrs1 000 mrd 0000100 | addrf64p | mrs1和mrs2按packed 64位double相加放入mrd |
| 0001000 mrs2 mrs1 000 mrd 0000100 | subri8p | mrs1和mrs2按packed 8位int相减放入mrd |
| 0001000 mrs2 mrs1 001 mrd 0000100 | subri8ps | mrs1和mrs2按packed 8位int饱和相减放入mrd |
| 0001001 mrs2 mrs1 000 mrd 0000100 | subri16p | mrs1和mrs2按packed 16位int相减放入mrd |
| 0001001 mrs2 mrs1 001 mrd 0000100 | subri16ps | mrs1和mrs2按packed 16位int饱和相减放入mrd |
| 0001010 mrs2 mrs1 000 mrd 0000100 | subri32p | mrs1和mrs2按packed 32位int相减放入mrd |
| 0001010 mrs2 mrs1 001 mrd 0000100 | subri32ps | mrs1和mrs2按packed 32位int饱和相减放入mrd |
| 0001011 mrs2 mrs1 000 mrd 0000100 | subri64p | mrs1和mrs2按packed 64位int相减放入mrd |
| 0001011 mrs2 mrs1 001 mrd 0000100 | subri64ps | mrs1和mrs2按packed 64位int饱和相减放入mrd |
| 0001110 mrs2 mrs1 000 mrd 0000100 | subrf32p | mrs1和mrs2按packed 32位float相减放入mrd |
| 0001111 mrs2 mrs1 000 mrd 0000100 | subrf64p | mrs1和mrs2按packed 64位double相减放入mrd |
| 0010000 mrs2 mrs1 000 mrd 0000100 | mulri8p | mrs1和mrs2按packed 8位int相乘放入mrd |
| 0010001 mrs2 mrs1 000 mrd 0000100 | mulri16p | mrs1和mrs2按packed 16位int相乘放入mrd |
| 0010010 mrs2 mrs1 000 mrd 0000100 | mulri32p | mrs1和mrs2按packed 32位int相乘放入mrd |
| 0010011 mrs2 mrs1 000 mrd 0000100 | mulri64p | mrs1和mrs2按packed 64位int相乘放入mrd |
| 0010110 mrs2 mrs1 000 mrd 0000100 | mulrf32p | mrs1和mrs2按packed 32位float相乘放入mrd |
| 0010111 mrs2 mrs1 000 mrd 0000100 | mulrf64p | mrs1和mrs2按packed 64位double相乘放入mrd |
| 0100000 mrs2 mrs1 000 mrd 0000100 | maxri8p | mrs1和mrs2按packed 8位int比较,大的放入mrd |
| 0100001 mrs2 mrs1 000 mrd 0000100 | maxri16p | mrs1和mrs2按packed 16位int比较,大的放入mrd |
| 0100010 mrs2 mrs1 000 mrd 0000100 | maxri32p | mrs1和mrs2按packed 32位int比较,大的放入mrd |
| 0100011 mrs2 mrs1 000 mrd 0000100 | maxri64p | mrs1和mrs2按packed 64位int比较,大的放入mrd |
| 0100110 mrs2 mrs1 000 mrd 0000100 | maxrf32p | mrs1和mrs2按packed 32位float比较,大的放入mrd |
| 0100111 mrs2 mrs1 000 mrd 0000100 | maxrf64p | mrs1和mrs2按packed 64位double比较,大的放入mrd |
| 1000000 mrs2 mrs1 000 mrd 0000100 | minri8p | mrs1和mrs2按packed 8位int比较,小的放入mrd |
| 1000001 mrs2 mrs1 000 mrd 0000100 | minri16p | mrs1和mrs2按packed 16位int比较,小的放入mrd |
| 1000010 mrs2 mrs1 000 mrd 0000100 | minri32p | mrs1和mrs2按packed 32位int比较,小的放入mrd |
| 1000011 mrs2 mrs1 000 mrd 0000100 | minri64p | mrs1和mrs2按packed 64位int比较,小的放入mrd |
| 1000110 mrs2 mrs1 000 mrd 0000100 | minrf32p | mrs1和mrs2按packed 32位float比较,小的放入mrd |
| 1000111 mrs2 mrs1 000 mrd 0000100 | minrf64p | mrs1和mrs2按packed 64位double比较,小的放入mrd |
| imm[11:0] xrs1 000 mrd 0001000 | addii8p | xrs1和imm按packed 8位int相加复制多次放入mrd |
| imm[11:0] xrs1 001 mrd 0001000 | addii16p | xrs1和imm按packed 16位int相加复制多次放入mrd |
| imm[11:0] xrs1 010 mrd 0001000 | addii32p | xrs1和imm按packed 32位int相加复制多次放入mrd |
| imm[11:0] xrs1 011 mrd 0001000 | addii64p | xrs1和imm按packed 64位int相加复制多次放入mrd |
| imm[11:0] frs1 110 mrd 0001000 | addif32p | frs1和imm按packed 32位float相加复制多次放入mrd |
| imm[11:0] frs1 111 mrd 0001000 | addif64p | frs1和imm按packed 64位double相加复制多次放入mrd |
| 0000000 xrs2 mrs1 000 mrd 0100000 | sumi8 | 按8位整型计算mrs1中的和,结果存入mrd的第xrs2处(mrd视为64位分组) |
| 0000001 xrs2 mrs1 000 mrd 0100000 | sumi16 | 按16位整型计算mrs1中的和,结果存入mrd的第xrs2处(mrd视为64位分组) |
| 0000010 xrs2 mrs1 000 mrd 0100000 | sumi32 | 按32位整型计算mrs1中的和,结果存入mrd的第xrs2处(mrd视为64位分组) |
| 0000011 xrs2 mrs1 000 mrd 0100000 | sumi64 | 按64位整型计算mrs1中的和,结果存入mrd的第xrs2处(mrd视为64位分组) |
| 0000110 xrs2 mrs1 000 mrd 0100000 | sumf32 | 按32位浮点计算mrs1中的和,结果存入mrd的第xrs2处(mrd视为64位分组) |
| 0000111 xrs2 mrs1 000 mrd 0100000 | sumf64 | 按64位浮点计算mrs1中的和,结果存入mrd的第xrs2处(mrd视为64位分组) |
| imm[11:0] xrs1 000 mrd 0010000 | lv | xrs1和imm相加作为内存地址,从该地址读取32字节数据存入mrd |
| imm[11:5] mrs2 xrs1 001 imm[4:0] 0010000 | sv | xrs1和imm相加作为内存地址,从mrs2中向该地址写入32字节数据 |

使用mySIMD重写核心程序

基本上是改写AVX指令集的程序,m0~m7为8个SIMD寄存器

// Pseudo-code: the AVX pipeline rewritten with the custom mySIMD ISA
// (m0~m7 are the SIMD registers; lv/sv load/store 32 bytes). Each
// register holds 4 doubles; the 4th lane is unused padding. The mixed
// C++/assembly lines are illustrative, not compilable.
// Fixed vs. the earlier draft: the coefficients now match the article's
// matrices (1.164383, -0.290993, 0.439216 — previously mistyped as
// 1.164384, 0.290993, 0.439126).
void AVsXProcessor() {
    auto **data = new double *[size];
    for (int i = 0; i < size; i++) {
        data[i] = new double[4];
    }

    // --- Stage 1: YUV -> RGB ---
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < width; j++) {
            auto yi0 = (uint8_t) y[i][j];
            auto ui0 = (uint8_t) u[i / 2][j / 2];
            auto vi0 = (uint8_t) v[i / 2][j / 2];

            // m0 = (Y, U, V, 0) - (16, 128, 128, 0)
            double tmp[4] = {yi0, ui0, vi0, 0};
            m0 = lv tmp, 0
            tmp = {16.0, 128.0, 128.0, 0.0};
            m1 = lv tmp, 0
            m0 = subrf64p m0, m1

            // Element-wise products for the three matrix rows;
            // sumf64 then reduces each into one lane of m5.
            tmp = {1.164383, 0.0, 1.596027, 0.0};
            m1 = lv tmp, 0
            m2 = mulrf64p m0, m1

            tmp = {1.164383, -0.391762, -0.812968, 0.0};
            m1 = lv  tmp, 0
            m3 = mulrf64p m0, m1

            tmp = {1.164383, 2.017232, 0.0, 0.0};
            m1 = lv tmp, 0
            m4 = mulrf64p m0, m1

            // m5 = (R, G, B, -) via horizontal sums
            m5 = sumf64 m2, 0
            m5 = sumf64 m3, 1
            m5 = sumf64 m4, 2

            // Clamp to [0, 255] with packed min/max.
            tmp = {0, 0, 0, 0};
            m1 = lv tmp, 0
            m5 = maxrf64p m5, m1

            tmp = {255, 255, 255, 255};
            m1 = lv tmp, 0
            m5 = minrf64p m5, m1

            sv m5, data[i * width + j]
        }
    }


    // --- Stage 2: per-frame alpha blend + RGB -> YUV ---
    for (int num = 0; num < frame_num; num++) {
        int a = num * 3 + 1;  // per-frame alpha
        for (int i = 0; i < height; i++) {
            for (int j = 0; j < width; j++) {

                // m0 = cached RGB * (a/256)
                double tmp[4] = {a / 256.0, a / 256.0, a / 256.0, a / 256.0};
                m0 = lv data[i * width + j], 0
                m1 = lv tmp, 0
                m0 = mulrf64p m0, m1

                // RGB -> YUV matrix rows
                tmp = {0.256788, 0.504129, 0.097906, 0.0};
                m1 = lv tmp, 0
                m2 = mulrf64p m0, m1

                tmp = {-0.148223, -0.290993, 0.439216, 0.0};
                m1 = lv tmp, 0
                m3 = mulrf64p m0, m1

                tmp = {0.439216, -0.367788, -0.071427, 0.0};
                m1 = lv tmp, 0
                m4 = mulrf64p m0, m1

                // m5 = (Y, U, V, -) via horizontal sums
                m5 = sumf64 m2, 0
                m5 = sumf64 m3, 1
                m5 = sumf64 m4, 2

                // Add the (16, 128, 128) offsets.
                tmp = {16, 128, 128, 0};
                m1 = lv tmp, 0
                m5 = addrf64p m5, m1

                sv m5, tmp

                // Chroma is written once per pixel of its 2x2 block.
                result[num][0][i * width + j] = tmp[0];
                result[num][1][(i / 2) * (width / 2) + j / 2] = tmp[1];
                result[num][2][(i / 2) * (width / 2) + j / 2] = tmp[2];

            }
        }
    }
    for (int i = 0; i < size; i++) {
        delete[] data[i];
    }
    delete[] data;
};

性能提升

使用mySIMD指令集后,主程序共包含32条SIMD指令,由于每条指令处理4个数据,但有一个数据是无效的,所以相对于一个像素点,其减少的指令数为32*2=64条。如果进行优化,则每次循环只需要4个SIMD寄存器,那么由于图像处理出现数据冒险是小概率的,所以可以同时进行两个循环。

综上所述,减少的指令数大约为64*1920*1080=1.3x10^8条,执行速度会是原来的三倍左右。

最后修改:2020 年 11 月 20 日
如果觉得我的文章对你有用,请随意赞赏