From: DRC Date: Thu, 31 Jan 2019 04:41:57 +0000 (-0600) Subject: Improve readability of Loongson MMI code X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=bb0d1702885182a9489d7f4f21d528105727153b;p=libjpeg-turbo Improve readability of Loongson MMI code We have more than eight registers to work with, as well as three-operand intrinsics, so there's no need for the implementation to be such a literal port of the MMX code. --- diff --git a/simd/loongson/jccolext-mmi.c b/simd/loongson/jccolext-mmi.c index a94b53b..fdbfb50 100644 --- a/simd/loongson/jccolext-mmi.c +++ b/simd/loongson/jccolext-mmi.c @@ -34,59 +34,59 @@ #if RGB_RED == 0 -#define mmA mm0 -#define mmB mm1 +#define mmA re +#define mmB ro #elif RGB_GREEN == 0 -#define mmA mm2 -#define mmB mm3 +#define mmA ge +#define mmB go #elif RGB_BLUE == 0 -#define mmA mm4 -#define mmB mm5 +#define mmA be +#define mmB bo #else -#define mmA mm6 -#define mmB mm7 +#define mmA xe +#define mmB xo #endif #if RGB_RED == 1 -#define mmC mm0 -#define mmD mm1 +#define mmC re +#define mmD ro #elif RGB_GREEN == 1 -#define mmC mm2 -#define mmD mm3 +#define mmC ge +#define mmD go #elif RGB_BLUE == 1 -#define mmC mm4 -#define mmD mm5 +#define mmC be +#define mmD bo #else -#define mmC mm6 -#define mmD mm7 +#define mmC xe +#define mmD xo #endif #if RGB_RED == 2 -#define mmE mm0 -#define mmF mm1 +#define mmE re +#define mmF ro #elif RGB_GREEN == 2 -#define mmE mm2 -#define mmF mm3 +#define mmE ge +#define mmF go #elif RGB_BLUE == 2 -#define mmE mm4 -#define mmF mm5 +#define mmE be +#define mmF bo #else -#define mmE mm6 -#define mmF mm7 +#define mmE xe +#define mmF xo #endif #if RGB_RED == 3 -#define mmG mm0 -#define mmH mm1 +#define mmG re +#define mmH ro #elif RGB_GREEN == 3 -#define mmG mm2 -#define mmH mm3 +#define mmG ge +#define mmH go #elif RGB_BLUE == 3 -#define mmG mm4 -#define mmH mm5 +#define mmG be +#define mmH bo #else -#define mmG mm6 -#define mmH mm7 +#define mmG xe +#define mmH xo #endif @@ -96,9 +96,17 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, { JSAMPROW inptr, outptr0, outptr1, outptr2; int num_cols, col; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - __m64 wk[7]; - __m64 Y_BG, Cb_RG, Cr_BG; + __m64 re, ro, ge, go, be, bo, xe; +#if RGB_PIXELSIZE == 4 + __m64 xo; +#endif + __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho; + __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho; + __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho; + __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye; + __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y; + __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb; + __m64 crle, crhe, cre, crlo, crho, cro, cr; while (--num_rows >= 0) { inptr = *input_buf++; @@ -196,9 +204,8 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, } inptr += RGB_PIXELSIZE * 8; } - mmD = mmA; + mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT); mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); - mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT); mmA = _mm_unpackhi_pi8(mmA, mmG); mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT); @@ -206,26 +213,22 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, mmD = _mm_unpacklo_pi8(mmD, mmF); mmG = _mm_unpackhi_pi8(mmG, mmF); - mmE = mmA; + mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT); mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT); - mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT); mmA = _mm_unpackhi_pi8(mmA, mmD); mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT); mmE = _mm_unpacklo_pi8(mmE, mmG); mmD = _mm_unpackhi_pi8(mmD, mmG); - mmC = mmA; + mmC = 
_mm_loadhi_pi8_f(mmA); mmA = _mm_loadlo_pi8_f(mmA); - mmC = _mm_loadhi_pi8_f(mmC); - mmB = mmE; + mmB = _mm_loadhi_pi8_f(mmE); mmE = _mm_loadlo_pi8_f(mmE); - mmB = _mm_loadhi_pi8_f(mmB); - mmF = mmD; + mmF = _mm_loadhi_pi8_f(mmD); mmD = _mm_loadlo_pi8_f(mmD); - mmF = _mm_loadhi_pi8_f(mmF); #else /* RGB_PIXELSIZE == 4 */ @@ -281,187 +284,156 @@ void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf, mmC = _mm_load_si64((__m64 *)&inptr[24]); inptr += RGB_PIXELSIZE * 8; } - mmB = mmA; + mmB = _mm_unpackhi_pi8(mmA, mmF); mmA = _mm_unpacklo_pi8(mmA, mmF); - mmB = _mm_unpackhi_pi8(mmB, mmF); - mmG = mmD; + mmG = _mm_unpackhi_pi8(mmD, mmC); mmD = _mm_unpacklo_pi8(mmD, mmC); - mmG = _mm_unpackhi_pi8(mmG, mmC); - mmE = mmA; + mmE = _mm_unpackhi_pi16(mmA, mmD); mmA = _mm_unpacklo_pi16(mmA, mmD); - mmE = _mm_unpackhi_pi16(mmE, mmD); - mmH = mmB; + mmH = _mm_unpackhi_pi16(mmB, mmG); mmB = _mm_unpacklo_pi16(mmB, mmG); - mmH = _mm_unpackhi_pi16(mmH, mmG); - mmC = mmA; + mmC = _mm_loadhi_pi8_f(mmA); mmA = _mm_loadlo_pi8_f(mmA); - mmC = _mm_loadhi_pi8_f(mmC); - mmD = mmB; + mmD = _mm_loadhi_pi8_f(mmB); mmB = _mm_loadlo_pi8_f(mmB); - mmD = _mm_loadhi_pi8_f(mmD); - mmG = mmE; + mmG = _mm_loadhi_pi8_f(mmE); mmE = _mm_loadlo_pi8_f(mmE); - mmG = _mm_loadhi_pi8_f(mmG); - mmF = mmH; - mmF = _mm_unpacklo_pi8(mmF, mmH); + mmF = _mm_unpacklo_pi8(mmH, mmH); mmH = _mm_unpackhi_pi8(mmH, mmH); mmF = _mm_srli_pi16(mmF, BYTE_BIT); mmH = _mm_srli_pi16(mmH, BYTE_BIT); #endif - wk[0] = mm0; - wk[1] = mm1; - wk[2] = mm4; - wk[3] = mm5; - - mm6 = mm1; - mm1 = _mm_unpacklo_pi16(mm1, mm3); - mm6 = _mm_unpackhi_pi16(mm6, mm3); - mm7 = mm1; - mm4 = mm6; - mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337); - mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337); - mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033); - mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033); - - wk[4] = mm1; - wk[5] = mm6; - - mm1 = _mm_loadlo_pi16_f(mm5); - mm6 = _mm_loadhi_pi16_f(mm5); - mm1 = _mm_srli_pi32(mm1, 1); - mm6 = _mm_srli_pi32(mm6, 1); - - mm5 = PD_ONEHALFM1_CJ; - mm7 = _mm_add_pi32(mm7, mm1); - mm4 = _mm_add_pi32(mm4, mm6); - mm7 = _mm_add_pi32(mm7, mm5); - mm4 = _mm_add_pi32(mm4, mm5); - mm7 = _mm_srli_pi32(mm7, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm7 = _mm_packs_pi32(mm7, mm4); - - mm1 = wk[2]; - mm6 = mm0; - mm0 = _mm_unpacklo_pi16(mm0, mm2); - mm6 = _mm_unpackhi_pi16(mm6, mm2); - mm5 = mm0; - mm4 = mm6; - mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337); - mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337); - mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033); - mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033); - - wk[6] = mm0; - wk[7] = mm6; - mm0 = _mm_loadlo_pi16_f(mm1); - mm6 = _mm_loadhi_pi16_f(mm1); - mm0 = _mm_srli_pi32(mm0, 1); - mm6 = _mm_srli_pi32(mm6, 1); - - mm1 = PD_ONEHALFM1_CJ; - mm5 = _mm_add_pi32(mm5, mm0); - mm4 = _mm_add_pi32(mm4, mm6); - mm5 = _mm_add_pi32(mm5, mm1); - mm4 = _mm_add_pi32(mm4, mm1); - mm5 = _mm_srli_pi32(mm5, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm5 = _mm_packs_pi32(mm5, mm4); - - mm7 = _mm_slli_pi16(mm7, BYTE_BIT); - mm5 = _mm_or_si64(mm5, mm7); - Cb_RG = mm5; - - mm0 = wk[3]; - mm6 = wk[2]; - mm1 = wk[1]; - - mm4 = mm0; - mm0 = _mm_unpacklo_pi16(mm0, mm3); - mm4 = _mm_unpackhi_pi16(mm4, mm3); - mm7 = mm0; - mm5 = mm4; - mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250); - mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250); - mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041); - mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041); - - mm3 = PD_ONEHALF; - mm0 = _mm_add_pi32(mm0, wk[4]); - mm4 = _mm_add_pi32(mm4, wk[5]); - mm0 = _mm_add_pi32(mm0, mm3); - mm4 = 
_mm_add_pi32(mm4, mm3); - mm0 = _mm_srli_pi32(mm0, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm0 = _mm_packs_pi32(mm0, mm4); - - mm3 = _mm_loadlo_pi16_f(mm1); - mm4 = _mm_loadhi_pi16_f(mm1); - mm3 = _mm_srli_pi32(mm3, 1); - mm4 = _mm_srli_pi32(mm4, 1); - - mm1 = PD_ONEHALFM1_CJ; - mm7 = _mm_add_pi32(mm7, mm3); - mm5 = _mm_add_pi32(mm5, mm4); - mm7 = _mm_add_pi32(mm7, mm1); - mm5 = _mm_add_pi32(mm5, mm1); - mm7 = _mm_srli_pi32(mm7, SCALEBITS); - mm5 = _mm_srli_pi32(mm5, SCALEBITS); - mm7 = _mm_packs_pi32(mm7, mm5); - - mm3 = wk[0]; - mm4 = mm6; - mm6 = _mm_unpacklo_pi16(mm6, mm2); - mm4 = _mm_unpackhi_pi16(mm4, mm2); - mm1 = mm6; - mm5 = mm4; - mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250); - mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250); - mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041); - mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041); - - mm2 = PD_ONEHALF; - mm6 = _mm_add_pi32(mm6, wk[6]); - mm4 = _mm_add_pi32(mm4, wk[7]); - mm6 = _mm_add_pi32(mm6, mm2); - mm4 = _mm_add_pi32(mm4, mm2); - mm6 = _mm_srli_pi32(mm6, SCALEBITS); - mm4 = _mm_srli_pi32(mm4, SCALEBITS); - mm6 = _mm_packs_pi32(mm6, mm4); - - mm0 = _mm_slli_pi16(mm0, BYTE_BIT); - mm6 = _mm_or_si64(mm6, mm0); - Y_BG = mm6; - - mm2 = _mm_loadlo_pi16_f(mm3); - mm4 = _mm_loadhi_pi16_f(mm3); - mm2 = _mm_srli_pi32(mm2, 1); - mm4 = _mm_srli_pi32(mm4, 1); - - mm0 = PD_ONEHALFM1_CJ; - mm1 = _mm_add_pi32(mm1, mm2); - mm5 = _mm_add_pi32(mm5, mm4); - mm1 = _mm_add_pi32(mm1, mm0); - mm5 = _mm_add_pi32(mm5, mm0); - mm1 = _mm_srli_pi32(mm1, SCALEBITS); - mm5 = _mm_srli_pi32(mm5, SCALEBITS); - mm1 = _mm_packs_pi32(mm1, mm5); - - mm7 = _mm_slli_pi16(mm7, BYTE_BIT); - mm1 = _mm_or_si64(mm1, mm7); - Cr_BG = mm1; - - _mm_store_si64((__m64 *)&outptr0[0], Y_BG); - _mm_store_si64((__m64 *)&outptr1[0], Cb_RG); - _mm_store_si64((__m64 *)&outptr2[0], Cr_BG); + /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6) + * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7) + * + * (Original) + * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + * + * (This implementation) + * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + */ + + rglo = _mm_unpacklo_pi16(ro, go); + rgho = _mm_unpackhi_pi16(ro, go); + ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337); + yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337); + cblo = _mm_madd_pi16(rglo, PW_MF016_MF033); + cbho = _mm_madd_pi16(rgho, PW_MF016_MF033); + + blo = _mm_loadlo_pi16_f(bo); + bho = _mm_loadhi_pi16_f(bo); + halfblo = _mm_srli_pi32(blo, 1); + halfbho = _mm_srli_pi32(bho, 1); + + cblo = _mm_add_pi32(cblo, halfblo); + cbho = _mm_add_pi32(cbho, halfbho); + cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ); + cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ); + cblo = _mm_srli_pi32(cblo, SCALEBITS); + cbho = _mm_srli_pi32(cbho, SCALEBITS); + cbo = _mm_packs_pi32(cblo, cbho); + + rgle = _mm_unpacklo_pi16(re, ge); + rghe = _mm_unpackhi_pi16(re, ge); + yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337); + yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337); + cble = _mm_madd_pi16(rgle, PW_MF016_MF033); + cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033); + + ble = _mm_loadlo_pi16_f(be); + bhe = _mm_loadhi_pi16_f(be); + halfble = _mm_srli_pi32(ble, 1); + halfbhe = _mm_srli_pi32(bhe, 1); + + cble = _mm_add_pi32(cble, halfble); + cbhe = _mm_add_pi32(cbhe, halfbhe); + cble = _mm_add_pi32(cble, 
PD_ONEHALFM1_CJ); + cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ); + cble = _mm_srli_pi32(cble, SCALEBITS); + cbhe = _mm_srli_pi32(cbhe, SCALEBITS); + cbe = _mm_packs_pi32(cble, cbhe); + + cbo = _mm_slli_pi16(cbo, BYTE_BIT); + cb = _mm_or_si64(cbe, cbo); + + bglo = _mm_unpacklo_pi16(bo, go); + bgho = _mm_unpackhi_pi16(bo, go); + ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250); + yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250); + crlo = _mm_madd_pi16(bglo, PW_MF008_MF041); + crho = _mm_madd_pi16(bgho, PW_MF008_MF041); + + ylo = _mm_add_pi32(ylo_bg, ylo_rg); + yho = _mm_add_pi32(yho_bg, yho_rg); + ylo = _mm_add_pi32(ylo, PD_ONEHALF); + yho = _mm_add_pi32(yho, PD_ONEHALF); + ylo = _mm_srli_pi32(ylo, SCALEBITS); + yho = _mm_srli_pi32(yho, SCALEBITS); + yo = _mm_packs_pi32(ylo, yho); + + rlo = _mm_loadlo_pi16_f(ro); + rho = _mm_loadhi_pi16_f(ro); + halfrlo = _mm_srli_pi32(rlo, 1); + halfrho = _mm_srli_pi32(rho, 1); + + crlo = _mm_add_pi32(crlo, halfrlo); + crho = _mm_add_pi32(crho, halfrho); + crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ); + crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ); + crlo = _mm_srli_pi32(crlo, SCALEBITS); + crho = _mm_srli_pi32(crho, SCALEBITS); + cro = _mm_packs_pi32(crlo, crho); + + bgle = _mm_unpacklo_pi16(be, ge); + bghe = _mm_unpackhi_pi16(be, ge); + yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250); + yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250); + crle = _mm_madd_pi16(bgle, PW_MF008_MF041); + crhe = _mm_madd_pi16(bghe, PW_MF008_MF041); + + yle = _mm_add_pi32(yle_bg, yle_rg); + yhe = _mm_add_pi32(yhe_bg, yhe_rg); + yle = _mm_add_pi32(yle, PD_ONEHALF); + yhe = _mm_add_pi32(yhe, PD_ONEHALF); + yle = _mm_srli_pi32(yle, SCALEBITS); + yhe = _mm_srli_pi32(yhe, SCALEBITS); + ye = _mm_packs_pi32(yle, yhe); + + yo = _mm_slli_pi16(yo, BYTE_BIT); + y = _mm_or_si64(ye, yo); + + rle = _mm_loadlo_pi16_f(re); + rhe = _mm_loadhi_pi16_f(re); + halfrle = _mm_srli_pi32(rle, 1); + halfrhe = _mm_srli_pi32(rhe, 1); + + crle = _mm_add_pi32(crle, halfrle); + crhe = _mm_add_pi32(crhe, halfrhe); + crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ); + crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ); + crle = _mm_srli_pi32(crle, SCALEBITS); + crhe = _mm_srli_pi32(crhe, SCALEBITS); + cre = _mm_packs_pi32(crle, crhe); + + cro = _mm_slli_pi16(cro, BYTE_BIT); + cr = _mm_or_si64(cre, cro); + + _mm_store_si64((__m64 *)&outptr0[0], y); + _mm_store_si64((__m64 *)&outptr1[0], cb); + _mm_store_si64((__m64 *)&outptr2[0], cr); } } } diff --git a/simd/loongson/jcsample-mmi.c b/simd/loongson/jcsample-mmi.c index 2f2d851..0354dac 100644 --- a/simd/loongson/jcsample-mmi.c +++ b/simd/loongson/jcsample-mmi.c @@ -1,7 +1,7 @@ /* * Loongson MMI optimizations for libjpeg-turbo * - * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved. * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. * All Rights Reserved. 
* Authors: ZhuChen @@ -39,18 +39,20 @@ void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor, JDIMENSION width_in_blocks, JSAMPARRAY input_data, JSAMPARRAY output_data) { - int inrow, outrow, outcol, bias; + int inrow, outrow, outcol; JDIMENSION output_cols = width_in_blocks * DCTSIZE; JSAMPROW inptr0, inptr1, outptr; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7; + __m64 bias, mask = 0.0, thisavg, nextavg, avg; + __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum; + __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum; expand_right_edge(input_data, max_v_samp_factor, image_width, output_cols * 2); - bias = (1 << 17) + 1; /* 0x00020001 (bias pattern) */ - mm7 = _mm_set1_pi32(bias); /* mm7={1, 2, 1, 2} */ - mm6 = _mm_cmpeq_pi16(mm6, mm6); - mm6 = _mm_srli_pi16(mm6, BYTE_BIT); /* mm6={0xFF 0x00 0xFF 0x00 ..} */ + bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */ + /* bias={1, 2, 1, 2} (16-bit) */ + mask = _mm_cmpeq_pi16(mask, mask); + mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ for (inrow = 0, outrow = 0; outrow < v_samp_factor; inrow += 2, outrow++) { @@ -62,39 +64,35 @@ void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor, for (outcol = output_cols; outcol > 0; outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) { - mm0 = _mm_load_si64((__m64 *)&inptr0[0]); - mm1 = _mm_load_si64((__m64 *)&inptr1[0]); - mm2 = _mm_load_si64((__m64 *)&inptr0[8]); - mm3 = _mm_load_si64((__m64 *)&inptr1[8]); + this0 = _mm_load_si64((__m64 *)&inptr0[0]); + this1 = _mm_load_si64((__m64 *)&inptr1[0]); + next0 = _mm_load_si64((__m64 *)&inptr0[8]); + next1 = _mm_load_si64((__m64 *)&inptr1[8]); - mm4 = mm0; - mm5 = mm1; - mm0 = _mm_and_si64(mm0, mm6); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); - mm1 = _mm_and_si64(mm1, mm6); - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); - mm0 = _mm_add_pi16(mm0, mm4); - mm1 = _mm_add_pi16(mm1, mm5); + this0o = _mm_and_si64(this0, mask); + this0e = _mm_srli_pi16(this0, BYTE_BIT); + this1o = _mm_and_si64(this1, mask); + this1e = _mm_srli_pi16(this1, BYTE_BIT); + this0sum = _mm_add_pi16(this0o, this0e); + this1sum = _mm_add_pi16(this1o, this1e); - mm4 = mm2; - mm5 = mm3; - mm2 = _mm_and_si64(mm2, mm6); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); - mm3 = _mm_and_si64(mm3, mm6); - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); - mm2 = _mm_add_pi16(mm2, mm4); - mm3 = _mm_add_pi16(mm3, mm5); + next0o = _mm_and_si64(next0, mask); + next0e = _mm_srli_pi16(next0, BYTE_BIT); + next1o = _mm_and_si64(next1, mask); + next1e = _mm_srli_pi16(next1, BYTE_BIT); + next0sum = _mm_add_pi16(next0o, next0e); + next1sum = _mm_add_pi16(next1o, next1e); - mm0 = _mm_add_pi16(mm0, mm1); - mm2 = _mm_add_pi16(mm2, mm3); - mm0 = _mm_add_pi16(mm0, mm7); - mm2 = _mm_add_pi16(mm2, mm7); - mm0 = _mm_srli_pi16(mm0, 2); - mm2 = _mm_srli_pi16(mm2, 2); + thisavg = _mm_add_pi16(this0sum, this1sum); + nextavg = _mm_add_pi16(next0sum, next1sum); + thisavg = _mm_add_pi16(thisavg, bias); + nextavg = _mm_add_pi16(nextavg, bias); + thisavg = _mm_srli_pi16(thisavg, 2); + nextavg = _mm_srli_pi16(nextavg, 2); - mm0 = _mm_packs_pu16(mm0, mm2); + avg = _mm_packs_pu16(thisavg, nextavg); - _mm_store_si64((__m64 *)&outptr[0], mm0); + _mm_store_si64((__m64 *)&outptr[0], avg); } } } diff --git a/simd/loongson/jdcolext-mmi.c b/simd/loongson/jdcolext-mmi.c index 7f44649..982fa76 100644 --- a/simd/loongson/jdcolext-mmi.c +++ b/simd/loongson/jdcolext-mmi.c @@ -33,59 +33,59 @@ #if RGB_RED == 0 -#define mmA mm0 -#define mmB mm1 
+#define mmA re +#define mmB ro #elif RGB_GREEN == 0 -#define mmA mm2 -#define mmB mm3 +#define mmA ge +#define mmB go #elif RGB_BLUE == 0 -#define mmA mm4 -#define mmB mm5 +#define mmA be +#define mmB bo #else -#define mmA mm6 -#define mmB mm7 +#define mmA xe +#define mmB xo #endif #if RGB_RED == 1 -#define mmC mm0 -#define mmD mm1 +#define mmC re +#define mmD ro #elif RGB_GREEN == 1 -#define mmC mm2 -#define mmD mm3 +#define mmC ge +#define mmD go #elif RGB_BLUE == 1 -#define mmC mm4 -#define mmD mm5 +#define mmC be +#define mmD bo #else -#define mmC mm6 -#define mmD mm7 +#define mmC xe +#define mmD xo #endif #if RGB_RED == 2 -#define mmE mm0 -#define mmF mm1 +#define mmE re +#define mmF ro #elif RGB_GREEN == 2 -#define mmE mm2 -#define mmF mm3 +#define mmE ge +#define mmF go #elif RGB_BLUE == 2 -#define mmE mm4 -#define mmF mm5 +#define mmE be +#define mmF bo #else -#define mmE mm6 -#define mmF mm7 +#define mmE xe +#define mmF xo #endif #if RGB_RED == 3 -#define mmG mm0 -#define mmH mm1 +#define mmG re +#define mmH ro #elif RGB_GREEN == 3 -#define mmG mm2 -#define mmH mm3 +#define mmG ge +#define mmH go #elif RGB_BLUE == 3 -#define mmG mm4 -#define mmH mm5 +#define mmG be +#define mmH bo #else -#define mmG mm6 -#define mmH mm7 +#define mmG xe +#define mmH xo #endif @@ -95,8 +95,9 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, { JSAMPROW outptr, inptr0, inptr1, inptr2; int num_cols, col; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - __m64 mm8, wk[2]; + __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr; + __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0; + __m64 decenter, mask; while (--num_rows >= 0) { inptr0 = input_buf[0][input_row]; @@ -108,25 +109,24 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, for (num_cols = out_width; num_cols > 0; num_cols -= 8, inptr0 += 8, inptr1 += 8, inptr2 += 8) { - mm5 = _mm_load_si64((__m64 *)inptr1); - mm1 = _mm_load_si64((__m64 *)inptr2); - mm8 = _mm_load_si64((__m64 *)inptr0); - mm4 = 0; - mm7 = 0; - mm4 = _mm_cmpeq_pi16(mm4, mm4); - mm7 = _mm_cmpeq_pi16(mm7, mm7); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); - mm7 = _mm_slli_pi16(mm7, 7); /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */ - mm0 = mm4; /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */ - - mm4 = _mm_and_si64(mm4, mm5); /* mm4=Cb(0246)=CbE */ - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Cb(1357)=CbO */ - mm0 = _mm_and_si64(mm0, mm1); /* mm0=Cr(0246)=CrE */ - mm1 = _mm_srli_pi16(mm1, BYTE_BIT); /* mm1=Cr(1357)=CrO */ - mm4 = _mm_add_pi16(mm4, mm7); - mm5 = _mm_add_pi16(mm5, mm7); - mm0 = _mm_add_pi16(mm0, mm7); - mm1 = _mm_add_pi16(mm1, mm7); + cb = _mm_load_si64((__m64 *)inptr1); + cr = _mm_load_si64((__m64 *)inptr2); + y = _mm_load_si64((__m64 *)inptr0); + + mask = decenter = 0.0; + mask = _mm_cmpeq_pi16(mask, mask); + decenter = _mm_cmpeq_pi16(decenter, decenter); + mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */ + decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */ + + cbe = _mm_and_si64(mask, cb); /* Cb(0246) */ + cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */ + cre = _mm_and_si64(mask, cr); /* Cr(0246) */ + cro = _mm_srli_pi16(cr, BYTE_BIT); /* Cr(1357) */ + cbe = _mm_add_pi16(cbe, decenter); + cbo = _mm_add_pi16(cbo, decenter); + cre = _mm_add_pi16(cre, decenter); + cro = _mm_add_pi16(cro, decenter); /* (Original) * R = Y + 1.40200 * Cr @@ -139,116 +139,98 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, * B = Y - 0.22800 * Cb + Cb 
+ Cb */ - mm2 = mm4; /* mm2 = CbE */ - mm3 = mm5; /* mm3 = CbO */ - mm4 = _mm_add_pi16(mm4, mm4); /* mm4 = 2*CbE */ - mm5 = _mm_add_pi16(mm5, mm5); /* mm5 = 2*CbO */ - mm6 = mm0; /* mm6 = CrE */ - mm7 = mm1; /* mm7 = CrO */ - mm0 = _mm_add_pi16(mm0, mm0); /* mm0 = 2*CrE */ - mm1 = _mm_add_pi16(mm1, mm1); /* mm1 = 2*CrO */ - - mm4 = _mm_mulhi_pi16(mm4, PW_MF0228); /* mm4=(2*CbE * -FIX(0.22800) */ - mm5 = _mm_mulhi_pi16(mm5, PW_MF0228); /* mm5=(2*CbO * -FIX(0.22800) */ - mm0 = _mm_mulhi_pi16(mm0, PW_F0402); /* mm0=(2*CrE * FIX(0.40200)) */ - mm1 = _mm_mulhi_pi16(mm1, PW_F0402); /* mm1=(2*CrO * FIX(0.40200)) */ - - mm4 = _mm_add_pi16(mm4, PW_ONE); - mm5 = _mm_add_pi16(mm5, PW_ONE); - mm4 = _mm_srai_pi16(mm4, 1); /* mm4=(CbE * -FIX(0.22800)) */ - mm5 = _mm_srai_pi16(mm5, 1); /* mm5=(CbO * -FIX(0.22800)) */ - mm0 = _mm_add_pi16(mm0, PW_ONE); - mm1 = _mm_add_pi16(mm1, PW_ONE); - mm0 = _mm_srai_pi16(mm0, 1); /* mm0=(CrE * FIX(0.40200)) */ - mm1 = _mm_srai_pi16(mm1, 1); /* mm1=(CrO * FIX(0.40200)) */ - - mm4 = _mm_add_pi16(mm4, mm2); - mm5 = _mm_add_pi16(mm5, mm3); - mm4 = _mm_add_pi16(mm4, mm2); /* mm4=(CbE * FIX(1.77200))=(B-Y)E */ - mm5 = _mm_add_pi16(mm5, mm3); /* mm5=(CbO * FIX(1.77200))=(B-Y)O */ - mm0 = _mm_add_pi16(mm0, mm6); /* mm0=(CrE * FIX(1.40200))=(R-Y)E */ - mm1 = _mm_add_pi16(mm1, mm7); /* mm1=(CrO * FIX(1.40200))=(R-Y)O */ - - wk[0] = mm4; /* wk(0)=(B-Y)E */ - wk[1] = mm5; /* wk(1)=(B-Y)O */ - - mm4 = mm2; - mm5 = mm3; - mm2 = _mm_unpacklo_pi16(mm2, mm6); - mm4 = _mm_unpackhi_pi16(mm4, mm6); - mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285); - mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285); - mm3 = _mm_unpacklo_pi16(mm3, mm7); - mm5 = _mm_unpackhi_pi16(mm5, mm7); - mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285); - mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285); - - mm2 = _mm_add_pi32(mm2, PD_ONEHALF); - mm4 = _mm_add_pi32(mm4, PD_ONEHALF); - mm2 = _mm_srai_pi32(mm2, SCALEBITS); - mm4 = _mm_srai_pi32(mm4, SCALEBITS); - mm3 = _mm_add_pi32(mm3, PD_ONEHALF); - mm5 = _mm_add_pi32(mm5, PD_ONEHALF); - mm3 = _mm_srai_pi32(mm3, SCALEBITS); - mm5 = _mm_srai_pi32(mm5, SCALEBITS); - - mm2 = _mm_packs_pi32(mm2, mm4); /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */ - mm3 = _mm_packs_pi32(mm3, mm5); /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */ - mm2 = _mm_sub_pi16(mm2, mm6); /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */ - mm3 = _mm_sub_pi16(mm3, mm7); /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */ - - mm5 = mm8; /* mm5=Y(01234567) */ - - mm4 = _mm_cmpeq_pi16(mm4, mm4); - mm4 = _mm_srli_pi16(mm4, BYTE_BIT); /* mm4={0xFF 0x00 0xFF 0x00 ..} */ - mm4 = _mm_and_si64(mm4, mm5); /* mm4=Y(0246)=YE */ - mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Y(1357)=YO */ - - mm0 = _mm_add_pi16(mm0, mm4); /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */ - mm1 = _mm_add_pi16(mm1, mm5); /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */ - mm0 = _mm_packs_pu16(mm0, mm0); /* mm0=(R0 R2 R4 R6 ** ** ** **) */ - mm1 = _mm_packs_pu16(mm1, mm1); /* mm1=(R1 R3 R5 R7 ** ** ** **) */ - - mm2 = _mm_add_pi16(mm2, mm4); /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */ - mm3 = _mm_add_pi16(mm3, mm5); /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */ - mm2 = _mm_packs_pu16(mm2, mm2); /* mm2=(G0 G2 G4 G6 ** ** ** **) */ - mm3 = _mm_packs_pu16(mm3, mm3); /* mm3=(G1 G3 G5 G7 ** ** ** **) */ - - mm4 = _mm_add_pi16(mm4, wk[0]); /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */ - mm5 = _mm_add_pi16(mm5, wk[1]); /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */ - mm4 = _mm_packs_pu16(mm4, mm4); /* mm4=(B0 B2 B4 B6 ** ** ** **) */ - mm5 = _mm_packs_pu16(mm5, mm5); /* mm5=(B1 B3 B5 B7 ** ** ** **) */ + cbe2 = _mm_add_pi16(cbe, cbe); 
/* 2*CbE */
+      cbo2 = _mm_add_pi16(cbo, cbo);            /* 2*CbO */
+      cre2 = _mm_add_pi16(cre, cre);            /* 2*CrE */
+      cro2 = _mm_add_pi16(cro, cro);            /* 2*CrO */
+
+      be = _mm_mulhi_pi16(cbe2, PW_MF0228);     /* (2*CbE * -FIX(0.22800)) */
+      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);     /* (2*CbO * -FIX(0.22800)) */
+      re = _mm_mulhi_pi16(cre2, PW_F0402);      /* (2*CrE * FIX(0.40200)) */
+      ro = _mm_mulhi_pi16(cro2, PW_F0402);      /* (2*CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, PW_ONE);
+      bo = _mm_add_pi16(bo, PW_ONE);
+      be = _mm_srai_pi16(be, 1);                /* (CbE * -FIX(0.22800)) */
+      bo = _mm_srai_pi16(bo, 1);                /* (CbO * -FIX(0.22800)) */
+      re = _mm_add_pi16(re, PW_ONE);
+      ro = _mm_add_pi16(ro, PW_ONE);
+      re = _mm_srai_pi16(re, 1);                /* (CrE * FIX(0.40200)) */
+      ro = _mm_srai_pi16(ro, 1);                /* (CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, cbe);
+      bo = _mm_add_pi16(bo, cbo);
+      be = _mm_add_pi16(be, cbe);               /* (CbE * FIX(1.77200))=(B-Y)E */
+      bo = _mm_add_pi16(bo, cbo);               /* (CbO * FIX(1.77200))=(B-Y)O */
+      re = _mm_add_pi16(re, cre);               /* (CrE * FIX(1.40200))=(R-Y)E */
+      ro = _mm_add_pi16(ro, cro);               /* (CrO * FIX(1.40200))=(R-Y)O */
+
+      gle = _mm_unpacklo_pi16(cbe, cre);
+      ghe = _mm_unpackhi_pi16(cbe, cre);
+      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+      glo = _mm_unpacklo_pi16(cbo, cro);
+      gho = _mm_unpackhi_pi16(cbo, cro);
+      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+      gle = _mm_add_pi32(gle, PD_ONEHALF);
+      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+      gle = _mm_srai_pi32(gle, SCALEBITS);
+      ghe = _mm_srai_pi32(ghe, SCALEBITS);
+      glo = _mm_add_pi32(glo, PD_ONEHALF);
+      gho = _mm_add_pi32(gho, PD_ONEHALF);
+      glo = _mm_srai_pi32(glo, SCALEBITS);
+      gho = _mm_srai_pi32(gho, SCALEBITS);
+
+      ge = _mm_packs_pi32(gle, ghe);        /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      go = _mm_packs_pi32(glo, gho);        /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      ye = _mm_and_si64(mask, y);           /* Y(0246) */
+      yo = _mm_srli_pi16(y, BYTE_BIT);      /* Y(1357) */
+
+      re = _mm_add_pi16(re, ye);            /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+      ro = _mm_add_pi16(ro, yo);            /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+      re = _mm_packs_pu16(re, re);          /* (R0 R2 R4 R6 ** ** ** **) */
+      ro = _mm_packs_pu16(ro, ro);          /* (R1 R3 R5 R7 ** ** ** **) */
+
+      ge = _mm_add_pi16(ge, ye);            /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+      go = _mm_add_pi16(go, yo);            /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+      ge = _mm_packs_pu16(ge, ge);          /* (G0 G2 G4 G6 ** ** ** **) */
+      go = _mm_packs_pu16(go, go);          /* (G1 G3 G5 G7 ** ** ** **) */
+
+      be = _mm_add_pi16(be, ye);            /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+      bo = _mm_add_pi16(bo, yo);            /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+      be = _mm_packs_pu16(be, be);          /* (B0 B2 B4 B6 ** ** ** **) */
+      bo = _mm_packs_pu16(bo, bo);          /* (B1 B3 B5 B7 ** ** ** **) */
 
 #if RGB_PIXELSIZE == 3
 
     /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
     /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
-    mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
-    mmE = _mm_unpacklo_pi8(mmE, mmB);     /* mmE=(20 01 22 03 24 05 26 07) */
-    mmD = _mm_unpacklo_pi8(mmD, mmF);     /* mmD=(11 21 13 23 15 25 17 27) */
+    mmA = _mm_unpacklo_pi8(mmA, mmC);     /* (00 10 02 12 04 14 06 16) */
+    mmE = _mm_unpacklo_pi8(mmE, mmB);     /* (20 01 22 03 24 05 26 07) */
+    mmD = _mm_unpacklo_pi8(mmD, mmF);     /* (11 21 13 23 15 25 17 27) */
 
-    mmG = mmA;
-    mmH = mmA;
-    mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 01 02 12 22 03) 
*/ - mmG = _mm_unpackhi_pi16(mmG, mmE); /* mmG=(04 14 24 05 06 16 26 07) */ + mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT); - mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT); - mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT); + mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */ + mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */ - mmC = mmD; - mmB = mmD; - mmD = _mm_unpacklo_pi16(mmD, mmH); /* mmD=(11 21 02 12 13 23 04 14) */ - mmC = _mm_unpackhi_pi16(mmC, mmH); /* mmC=(15 25 06 16 17 27 -- --) */ + mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT); + mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */ - mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */ + mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */ + mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */ - mmF = mmE; - mmE = _mm_unpacklo_pi16(mmE, mmB); /* mmE=(22 03 13 23 24 05 15 25) */ - mmF = _mm_unpackhi_pi16(mmF, mmB); /* mmF=(26 07 17 27 -- -- -- --) */ + mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */ + mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */ - mmA = _mm_unpacklo_pi32(mmA, mmD); /* mmA=(00 10 20 01 11 21 02 12) */ - mmE = _mm_unpacklo_pi32(mmE, mmG); /* mmE=(22 03 13 23 04 14 24 05) */ - mmC = _mm_unpacklo_pi32(mmC, mmF); /* mmC=(15 25 06 16 26 07 17 27) */ + mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */ + mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */ + mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */ if (num_cols >= 8) { if (!(((long)outptr) & 7)) { @@ -332,35 +314,31 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf, #else /* RGB_PIXELSIZE == 4 */ #ifdef RGBX_FILLER_0XFF - mm6 = _mm_cmpeq_pi8(mm6, mm6); - mm7 = _mm_cmpeq_pi8(mm7, mm7); + xe = _mm_cmpeq_pi8(xe, xe); + xo = _mm_cmpeq_pi8(xo, xo); #else - mm6 = _mm_xor_si64(mm6, mm6); - mm7 = _mm_xor_si64(mm7, mm7); + xe = _mm_xor_si64(xe, xe); + xo = _mm_xor_si64(xo, xo); #endif /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */ /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */ /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */ /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */ - mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */ - mmE = _mm_unpacklo_pi8(mmE, mmG); /* mmE=(20 30 22 32 24 34 26 36) */ - mmB = _mm_unpacklo_pi8(mmB, mmD); /* mmB=(01 11 03 13 05 15 07 17) */ - mmF = _mm_unpacklo_pi8(mmF, mmH); /* mmF=(21 31 23 33 25 35 27 37) */ - - mmC = mmA; - mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 30 02 12 22 32) */ - mmC = _mm_unpackhi_pi16(mmC, mmE); /* mmC=(04 14 24 34 06 16 26 36) */ - mmG = mmB; - mmB = _mm_unpacklo_pi16(mmB, mmF); /* mmB=(01 11 21 31 03 13 23 33) */ - mmG = _mm_unpackhi_pi16(mmG, mmF); /* mmG=(05 15 25 35 07 17 27 37) */ - - mmD = mmA; - mmA = _mm_unpacklo_pi32(mmA, mmB); /* mmA=(00 10 20 30 01 11 21 31) */ - mmD = _mm_unpackhi_pi32(mmD, mmB); /* mmD=(02 12 22 32 03 13 23 33) */ - mmH = mmC; - mmC = _mm_unpacklo_pi32(mmC, mmG); /* mmC=(04 14 24 34 05 15 25 35) */ - mmH = _mm_unpackhi_pi32(mmH, mmG); /* mmH=(06 16 26 36 07 17 27 37) */ + mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */ + mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */ + mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */ + mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */ + + mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 
06 16 26 36) */ + mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */ + mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */ + mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */ + + mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */ + mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */ + mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */ + mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */ if (num_cols >= 8) { _mm_store_si64((__m64 *)outptr, mmA); diff --git a/simd/loongson/jdsample-mmi.c b/simd/loongson/jdsample-mmi.c index 00a6265..d7e10db 100644 --- a/simd/loongson/jdsample-mmi.c +++ b/simd/loongson/jdsample-mmi.c @@ -1,7 +1,7 @@ /* * Loongson MMI optimizations for libjpeg-turbo * - * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved. + * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved. * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing. * All Rights Reserved. * Authors: ZhuChen @@ -51,56 +51,53 @@ static uint64_t const_value[] = { #define PROCESS_ROW(r) { \ - mm7 = _mm_load_si64((__m64 *)outptr##r); /* mm7=IntrL=( 0 1 2 3) */ \ - mm3 = _mm_load_si64((__m64 *)outptr##r + 1); /* mm3=IntrH=( 4 5 6 7) */ \ + __m64 samp0123, samp123X, samp3XXX, samp1234, sampX012, samp_1012; \ + __m64 samp4567, sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \ + __m64 outle, outhe, outlo, outho, outl, outh; \ \ - mm0 = mm7; \ - mm4 = mm3; \ - mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT); /* mm0=( 1 2 3 -) */ \ - mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \ - mm5 = mm7; \ - mm6 = mm3; \ - mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \ - mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT); /* mm6=( - 4 5 6) */ \ + samp0123 = _mm_load_si64((__m64 *)outptr##r); /* ( 0 1 2 3) */ \ + samp4567 = _mm_load_si64((__m64 *)outptr##r + 1); /* ( 4 5 6 7) */ \ \ - mm0 = _mm_or_si64(mm0, mm4); /* mm0=( 1 2 3 4) */ \ - mm5 = _mm_or_si64(mm5, mm6); /* mm5=( 3 4 5 6) */ \ + samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT); /* ( 1 2 3 -) */ \ + sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 4) */ \ + samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 3 - - -) */ \ + sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT); /* ( - 4 5 6) */ \ \ - mm1 = mm7; \ - mm2 = mm3; \ - mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT); /* mm1=( - 0 1 2) */ \ - mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT); /* mm2=( 5 6 7 -) */ \ - mm4 = mm3; \ - mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \ + samp1234 = _mm_or_si64(samp123X, sampXXX4); /* ( 1 2 3 4) */ \ + samp3456 = _mm_or_si64(samp3XXX, sampX456); /* ( 3 4 5 6) */ \ \ - mm1 = _mm_or_si64(mm1, wk[r]); /* mm1=(-1 0 1 2) */ \ - mm2 = _mm_or_si64(mm2, wk[r + 2]); /* mm2=( 5 6 6 8) */ \ + sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT); /* ( - 0 1 2) */ \ + samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT); /* ( 5 6 7 -) */ \ + samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 7 - - -) */ \ \ - wk[r] = mm4; \ + samp_1012 = _mm_or_si64(sampX012, wk[r]); /* (-1 0 1 2) */ \ + samp5678 = _mm_or_si64(samp567X, wk[r + 2]); /* ( 5 6 7 8) */ \ \ - mm7 = _mm_mullo_pi16(mm7, PW_THREE); \ - mm3 = _mm_mullo_pi16(mm3, PW_THREE); \ - mm1 = _mm_add_pi16(mm1, PW_EIGHT); \ - mm5 = _mm_add_pi16(mm5, PW_EIGHT); \ - mm0 = _mm_add_pi16(mm0, PW_SEVEN); \ - mm2 = _mm_add_pi16(mm2, PW_SEVEN); \ + wk[r] = 
samp7XXX; \ \ - mm1 = _mm_add_pi16(mm1, mm7); \ - mm5 = _mm_add_pi16(mm5, mm3); \ - mm1 = _mm_srli_pi16(mm1, 4); /* mm1=OutrLE=( 0 2 4 6) */ \ - mm5 = _mm_srli_pi16(mm5, 4); /* mm5=OutrHE=( 8 10 12 14) */ \ - mm0 = _mm_add_pi16(mm0, mm7); \ - mm2 = _mm_add_pi16(mm2, mm3); \ - mm0 = _mm_srli_pi16(mm0, 4); /* mm0=OutrLO=( 1 3 5 7) */ \ - mm2 = _mm_srli_pi16(mm2, 4); /* mm2=OutrHO=( 9 11 13 15) */ \ + samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \ + samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \ + samp_1012 = _mm_add_pi16(samp_1012, PW_EIGHT); \ + samp3456 = _mm_add_pi16(samp3456, PW_EIGHT); \ + samp1234 = _mm_add_pi16(samp1234, PW_SEVEN); \ + samp5678 = _mm_add_pi16(samp5678, PW_SEVEN); \ \ - mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \ - mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \ - mm1 = _mm_or_si64(mm1, mm0); /* mm1=OutrL=( 0 1 2 3 4 5 6 7) */ \ - mm5 = _mm_or_si64(mm5, mm2); /* mm5=OutrH=( 8 9 10 11 12 13 14 15) */ \ + outle = _mm_add_pi16(samp_1012, samp0123); \ + outhe = _mm_add_pi16(samp3456, samp4567); \ + outle = _mm_srli_pi16(outle, 4); /* ( 0 2 4 6) */ \ + outhe = _mm_srli_pi16(outhe, 4); /* ( 8 10 12 14) */ \ + outlo = _mm_add_pi16(samp1234, samp0123); \ + outho = _mm_add_pi16(samp5678, samp4567); \ + outlo = _mm_srli_pi16(outlo, 4); /* ( 1 3 5 7) */ \ + outho = _mm_srli_pi16(outho, 4); /* ( 9 11 13 15) */ \ \ - _mm_store_si64((__m64 *)outptr##r, mm1); \ - _mm_store_si64((__m64 *)outptr##r + 1, mm5); \ + outlo = _mm_slli_pi16(outlo, BYTE_BIT); \ + outho = _mm_slli_pi16(outho, BYTE_BIT); \ + outl = _mm_or_si64(outle, outlo); /* ( 0 1 2 3 4 5 6 7) */ \ + outh = _mm_or_si64(outhe, outho); /* ( 8 9 10 11 12 13 14 15) */ \ + \ + _mm_store_si64((__m64 *)outptr##r, outl); \ + _mm_store_si64((__m64 *)outptr##r + 1, outh); \ } void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, @@ -111,8 +108,17 @@ void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, JSAMPARRAY output_data = *output_data_ptr; JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1; int inrow, outrow, incol, tmp, tmp1; - __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0; - __m64 wk[4], mm_tmp; + __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h; + __m64 this0l, this0h, this0; + __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h; + __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h; + __m64 next0l, next0h, next0; + __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h; + __m64 zero = 0.0, mask0 = 0.0, masklast, wk[4]; + + mask0 = _mm_cmpeq_pi8(mask0, mask0); + masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT); + mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT); for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) { @@ -124,7 +130,7 @@ void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, if (downsampled_width & 7) { tmp = (downsampled_width - 1) * sizeof(JSAMPLE); - tmp1 = downsampled_width * sizeof(JSAMPLE); + tmp1 = downsampled_width * sizeof(JSAMPLE); asm("daddu $8, %3, %6\r\n" "lb $9, ($8)\r\n" "daddu $8, %3, %7\r\n" @@ -144,42 +150,33 @@ void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, } /* process the first column block */ - mm0 = _mm_load_si64((__m64 *)inptr0); /* mm0 = row[ 0][0] */ - mm1 = _mm_load_si64((__m64 *)inptr_1); /* mm1 = row[-1][0] */ - mm2 = _mm_load_si64((__m64 *)inptr1); /* mm2 = row[ 1][0] */ - - mm3 = _mm_xor_si64(mm3, mm3); /* mm3 = (all 0's) */ - mm4 = mm0; - mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][0]( 0 1 2 3) */ - mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][0]( 4 5 6 7) */ - mm5 = mm1; - mm1 = 
_mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][0]( 0 1 2 3) */ - mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][0]( 4 5 6 7) */ - mm6 = mm2; - mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][0]( 0 1 2 3) */ - mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][0]( 4 5 6 7) */ - - mm0 = _mm_mullo_pi16(mm0, PW_THREE); - mm4 = _mm_mullo_pi16(mm4, PW_THREE); - - mm7 = _mm_cmpeq_pi8(mm7, mm7); - mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT); - - mm1 = _mm_add_pi16(mm1, mm0); /* mm1=Int0L=( 0 1 2 3) */ - mm5 = _mm_add_pi16(mm5, mm4); /* mm5=Int0H=( 4 5 6 7) */ - mm2 = _mm_add_pi16(mm2, mm0); /* mm2=Int1L=( 0 1 2 3) */ - mm6 = _mm_add_pi16(mm6, mm4); /* mm6=Int1H=( 4 5 6 7) */ - - _mm_store_si64((__m64 *)outptr0, mm1); /* temporarily save */ - _mm_store_si64((__m64 *)outptr0 + 1, mm5); /* the intermediate data */ - _mm_store_si64((__m64 *)outptr1, mm2); - _mm_store_si64((__m64 *)outptr1 + 1, mm6); - - mm1 = _mm_and_si64(mm1, mm7); /* mm1=( 0 - - -) */ - mm2 = _mm_and_si64(mm2, mm7); /* mm2=( 0 - - -) */ - - wk[0] = mm1; - wk[1] = mm2; + this0 = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */ + this_1 = _mm_load_si64((__m64 *)inptr_1); /* row[-1][0] */ + this1 = _mm_load_si64((__m64 *)inptr1); /* row[ 1][0] */ + + this0l = _mm_unpacklo_pi8(this0, zero); /* row[ 0][0]( 0 1 2 3) */ + this0h = _mm_unpackhi_pi8(this0, zero); /* row[ 0][0]( 4 5 6 7) */ + this_1l = _mm_unpacklo_pi8(this_1, zero); /* row[-1][0]( 0 1 2 3) */ + this_1h = _mm_unpackhi_pi8(this_1, zero); /* row[-1][0]( 4 5 6 7) */ + this1l = _mm_unpacklo_pi8(this1, zero); /* row[+1][0]( 0 1 2 3) */ + this1h = _mm_unpackhi_pi8(this1, zero); /* row[+1][0]( 4 5 6 7) */ + + this0l = _mm_mullo_pi16(this0l, PW_THREE); + this0h = _mm_mullo_pi16(this0h, PW_THREE); + + thiscolsum_1l = _mm_add_pi16(this_1l, this0l); /* ( 0 1 2 3) */ + thiscolsum_1h = _mm_add_pi16(this_1h, this0h); /* ( 4 5 6 7) */ + thiscolsum1l = _mm_add_pi16(this0l, this1l); /* ( 0 1 2 3) */ + thiscolsum1h = _mm_add_pi16(this0h, this1h); /* ( 4 5 6 7) */ + + /* temporarily save the intermediate data */ + _mm_store_si64((__m64 *)outptr0, thiscolsum_1l); + _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h); + _mm_store_si64((__m64 *)outptr1, thiscolsum1l); + _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h); + + wk[0] = _mm_and_si64(thiscolsum_1l, mask0); /* ( 0 - - -) */ + wk[1] = _mm_and_si64(thiscolsum1l, mask0); /* ( 0 - - -) */ for (incol = downsampled_width; incol > 0; incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8, @@ -187,52 +184,41 @@ void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor, if (incol > 8) { /* process the next column block */ - mm0 = _mm_load_si64((__m64 *)inptr0 + 1); /* mm0 = row[ 0][1] */ - mm1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* mm1 = row[-1][1] */ - mm2 = _mm_load_si64((__m64 *)inptr1 + 1); /* mm2 = row[+1][1] */ - - mm3 = _mm_setzero_si64(); /* mm3 = (all 0's) */ - mm4 = mm0; - mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][1]( 0 1 2 3) */ - mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][1]( 4 5 6 7) */ - mm5 = mm1; - mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][1]( 0 1 2 3) */ - mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][1]( 4 5 6 7) */ - mm6 = mm2; - mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][1]( 0 1 2 3) */ - mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][1]( 4 5 6 7) */ - - mm0 = _mm_mullo_pi16(mm0, PW_THREE); - mm4 = _mm_mullo_pi16(mm4, PW_THREE); - - mm1 = _mm_add_pi16(mm1, mm0); /* mm1 = Int0L = ( 0 1 2 3) */ - mm5 = _mm_add_pi16(mm5, mm4); /* mm5 = Int0H = ( 4 5 6 7) */ - mm2 = 
_mm_add_pi16(mm2, mm0); /* mm2 = Int1L = ( 0 1 2 3) */ - mm6 = _mm_add_pi16(mm6, mm4); /* mm6 = Int1H = ( 4 5 6 7) */ - - _mm_store_si64((__m64 *)outptr0 + 2, mm1); /* temporarily save */ - _mm_store_si64((__m64 *)outptr0 + 3, mm5); /* the intermediate data */ - _mm_store_si64((__m64 *)outptr1 + 2, mm2); - _mm_store_si64((__m64 *)outptr1 + 3, mm6); - - mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */ - mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */ - - wk[2] = mm1; - wk[3] = mm2; + next0 = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */ + next_1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* row[-1][1] */ + next1 = _mm_load_si64((__m64 *)inptr1 + 1); /* row[+1][1] */ + + next0l = _mm_unpacklo_pi8(next0, zero); /* row[ 0][1]( 0 1 2 3) */ + next0h = _mm_unpackhi_pi8(next0, zero); /* row[ 0][1]( 4 5 6 7) */ + next_1l = _mm_unpacklo_pi8(next_1, zero); /* row[-1][1]( 0 1 2 3) */ + next_1h = _mm_unpackhi_pi8(next_1, zero); /* row[-1][1]( 4 5 6 7) */ + next1l = _mm_unpacklo_pi8(next1, zero); /* row[+1][1]( 0 1 2 3) */ + next1h = _mm_unpackhi_pi8(next1, zero); /* row[+1][1]( 4 5 6 7) */ + + next0l = _mm_mullo_pi16(next0l, PW_THREE); + next0h = _mm_mullo_pi16(next0h, PW_THREE); + + nextcolsum_1l = _mm_add_pi16(next_1l, next0l); /* ( 0 1 2 3) */ + nextcolsum_1h = _mm_add_pi16(next_1h, next0h); /* ( 4 5 6 7) */ + nextcolsum1l = _mm_add_pi16(next0l, next1l); /* ( 0 1 2 3) */ + nextcolsum1h = _mm_add_pi16(next0h, next1h); /* ( 4 5 6 7) */ + + /* temporarily save the intermediate data */ + _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l); + _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h); + _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l); + _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h); + + wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */ + wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */ } else { - /* process the last column block */ - mm1 = _mm_cmpeq_pi8(mm1, mm1); - mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); - mm2 = mm1; + __m64 tmp; - mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1); - mm1 = _mm_and_si64(mm1, mm_tmp); /* mm1=( - - - 7) */ - mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1); - mm2 = _mm_and_si64(mm2, mm_tmp); /* mm2=( - - - 7) */ - - wk[2] = mm1; - wk[3] = mm2; + /* process the last column block */ + tmp = _mm_load_si64((__m64 *)outptr0 + 1); + wk[2] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */ + tmp = _mm_load_si64((__m64 *)outptr1 + 1); + wk[3] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */ } /* process the upper row */ diff --git a/simd/loongson/jquanti-mmi.c b/simd/loongson/jquanti-mmi.c index f9a3f81..339002f 100644 --- a/simd/loongson/jquanti-mmi.c +++ b/simd/loongson/jquanti-mmi.c @@ -6,7 +6,7 @@ * Authors: ZhuChen * CaiWanwei * SunZhangzhi - * Copyright (C) 2018, D. R. Commander. All Rights Reserved. + * Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved. * * Based on the x86 SIMD extension for IJG JPEG library * Copyright (C) 1999-2006, MIYASAKA Masaru. 
@@ -34,77 +34,73 @@ #define DO_QUANT() { \ - mm2 = _mm_load_si64((__m64 *)&workspace[0]); \ - mm3 = _mm_load_si64((__m64 *)&workspace[4]); \ + __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \ + __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \ \ - mm0 = mm2; \ - mm1 = mm3; \ + rowl = _mm_load_si64((__m64 *)&workspace[0]); \ + rowh = _mm_load_si64((__m64 *)&workspace[4]); \ \ - mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1)); /* -1 if value < 0, */ \ - /* 0 otherwise */ \ - mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \ + /* Branch-less absolute value */ \ + rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1)); /* -1 if value < 0, */ \ + /* 0 otherwise */ \ + rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \ \ - mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \ - mm1 = _mm_xor_si64(mm1, mm3); \ - mm0 = _mm_sub_pi16(mm0, mm2); \ - mm1 = _mm_sub_pi16(mm1, mm3); \ + rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \ + rowh = _mm_xor_si64(rowh, rowhs); \ + rowl = _mm_sub_pi16(rowl, rowls); \ + rowh = _mm_sub_pi16(rowh, rowhs); \ \ - corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \ - corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \ + corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \ + corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \ \ - mm0 = _mm_add_pi16(mm0, corr0); /* correction + roundfactor */ \ - mm1 = _mm_add_pi16(mm1, corr1); \ + rowlsave = rowl = _mm_add_pi16(rowl, corrl); /* correction + roundfactor */ \ + rowhsave = rowh = _mm_add_pi16(rowh, corrh); \ \ - mm4 = mm0; \ - mm5 = mm1; \ + recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \ + reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \ \ - recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \ - recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \ + rowl = _mm_mulhi_pi16(rowl, recipl); \ + rowh = _mm_mulhi_pi16(rowh, reciph); \ \ - mm0 = _mm_mulhi_pi16(mm0, recip0); \ - mm1 = _mm_mulhi_pi16(mm1, recip1); \ + /* reciprocal is always negative (MSB=1), so we always need to add the */ \ + /* initial value (input value is never negative as we inverted it at the */ \ + /* start of this routine) */ \ + rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \ + rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \ \ - mm0 = _mm_add_pi16(mm0, mm4); /* reciprocal is always negative */ \ - mm1 = _mm_add_pi16(mm1, mm5); /* (MSB=1), so we always need to add the */ \ - /* initial value (input value is never */ \ - /* negative as we inverted it at the */ \ - /* start of this routine) */ \ + scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \ + scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \ \ - scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \ - scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \ + rowl = _mm_mulhi_pi16(rowl, scalel); \ + rowh = _mm_mulhi_pi16(rowh, scaleh); \ \ - mm6 = scale0; \ - mm7 = scale1; \ - mm4 = mm0; \ - mm5 = mm1; \ + /* determine if scale is negative */ \ + scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \ + scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \ \ - mm0 = _mm_mulhi_pi16(mm0, mm6); \ - mm1 = _mm_mulhi_pi16(mm1, mm7); \ + /* and add input if it is */ \ + scalel = _mm_and_si64(scalel, rowlsave); \ + scaleh = _mm_and_si64(scaleh, rowhsave); \ + rowl = _mm_add_pi16(rowl, scalel); \ + rowh = _mm_add_pi16(rowh, scaleh); \ \ - mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1)); /* determine if scale... 
*/ \ - /* is negative */ \ - mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \ + /* then check if negative input */ \ + rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \ + rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \ \ - mm6 = _mm_and_si64(mm6, mm4); /* and add input if it is */ \ - mm7 = _mm_and_si64(mm7, mm5); \ - mm0 = _mm_add_pi16(mm0, mm6); \ - mm1 = _mm_add_pi16(mm1, mm7); \ + /* and add scale if it is */ \ + rowlsave = _mm_and_si64(rowlsave, scalel); \ + rowhsave = _mm_and_si64(rowhsave, scaleh); \ + rowl = _mm_add_pi16(rowl, rowlsave); \ + rowh = _mm_add_pi16(rowh, rowhsave); \ \ - mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1)); /* then check if... */ \ - mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1)); /* negative input */ \ + rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \ + rowh = _mm_xor_si64(rowh, rowhs); \ + rowl = _mm_sub_pi16(rowl, rowls); \ + rowh = _mm_sub_pi16(rowh, rowhs); \ \ - mm4 = _mm_and_si64(mm4, scale0); /* and add scale if it is */ \ - mm5 = _mm_and_si64(mm5, scale1); \ - mm0 = _mm_add_pi16(mm0, mm4); \ - mm1 = _mm_add_pi16(mm1, mm5); \ - \ - mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \ - mm1 = _mm_xor_si64(mm1, mm3); \ - mm0 = _mm_sub_pi16(mm0, mm2); \ - mm1 = _mm_sub_pi16(mm1, mm3); \ - \ - _mm_store_si64((__m64 *)&output_ptr[0], mm0); \ - _mm_store_si64((__m64 *)&output_ptr[4], mm1); \ + _mm_store_si64((__m64 *)&output_ptr[0], rowl); \ + _mm_store_si64((__m64 *)&output_ptr[4], rowh); \ \ workspace += DCTSIZE; \ divisors += DCTSIZE; \ @@ -116,8 +112,6 @@ void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) { JCOEFPTR output_ptr = coef_block; - __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7; - __m64 corr0, corr1, recip0, recip1, scale0, scale1; DO_QUANT() DO_QUANT()
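
A note on the commit message's point about three-operand intrinsics: MMX
instructions are two-operand (the destination is also a source), so the
original port copied a register before every destructive operation, which is
exactly the `mmD = mmA; mmD = _mm_srli_si64(mmD, ...)` pattern this patch
collapses into `mmD = _mm_srli_si64(mmA, ...)`. A minimal standalone sketch of
the two styles, using a plain uint64_t as a stand-in for __m64 (illustrative
only, not the Loongson intrinsics themselves):

#include <stdint.h>
#include <stdio.h>

/* stand-in for _mm_srli_si64: whole-register logical shift right */
static uint64_t srli_si64(uint64_t v, unsigned bits) { return v >> bits; }

int main(void)
{
  uint64_t mmA = 0x0706050403020100ULL;

  /* literal MMX port: two-operand style forces a register copy first */
  uint64_t mmD = mmA;
  mmD = srli_si64(mmD, 32);

  /* style used by this patch: the source operand stays intact, so the
   * extra copy disappears */
  uint64_t mmD2 = srli_si64(mmA, 32);

  printf("%016llx %016llx\n", (unsigned long long)mmD,
         (unsigned long long)mmD2);
  return 0;
}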
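
The formula comment added to jccolext-mmi.c splits 0.58700 * G into
0.33700 * G + 0.25000 * G. The patch does not say why, but the usual reason
(an assumption stated here, not spelled out above) is that _mm_madd_pi16
multiplies signed 16-bit words: with libjpeg's SCALEBITS = 16,
FIX(0.58700) = 38470 does not fit in a signed word, while FIX(0.33700) = 22086
and FIX(0.25000) = 16384 do, so the pairs (0.29900, 0.33700) and
(0.11400, 0.25000) each ride one madd. The 0.50000 * B and 0.50000 * R terms
of Cb and Cr are cheaper still: the loadlo_pi16_f/srli_pi32 sequences above
compute B * FIX(0.5) as (B << 16) >> 1. A scalar check that the split is
exact:

#include <stdio.h>

#define SCALEBITS  16
#define ONE_HALF   (1L << (SCALEBITS - 1))
#define FIX(x)     ((long)((x) * (1L << SCALEBITS) + 0.5))

int main(void)
{
  for (int r = 0; r < 256; r += 51)
    for (int g = 0; g < 256; g += 51)
      for (int b = 0; b < 256; b += 51) {
        long direct = (FIX(0.29900) * r + FIX(0.58700) * g +
                       FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
        long split  = (FIX(0.29900) * r + FIX(0.33700) * g +   /* madd pair 1 */
                       FIX(0.11400) * b + FIX(0.25000) * g +   /* madd pair 2 */
                       ONE_HALF) >> SCALEBITS;
        if (direct != split)
          printf("mismatch at %d %d %d\n", r, g, b);
      }
  printf("FIX(0.587)=%ld (too wide), FIX(0.337)=%ld, FIX(0.250)=%ld\n",
         FIX(0.58700), FIX(0.33700), FIX(0.25000));
  return 0;
}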
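
jdcolext-mmi.c's comment block documents the mirror-image trick:
FIX(1.40200) = 91881 and FIX(1.77200) = 116131 are too wide for 16-bit
multipliers, so the code multiplies by FIX(0.40200) = 26345 or -FIX(0.22800)
and adds Cr (or 2 * Cb) back in; likewise -0.71414 = 0.28586 - 1.00000, which
is why ge/go are built with PW_MF0344_F0285 and then have cre/cro subtracted.
Doubling the operand before _mm_mulhi_pi16 and then doing (+1) >> 1 recovers,
as far as the arithmetic goes, a rounded product from the truncating
high-half multiply. A rough scalar model of the R channel (values and the ±1
tolerance are illustrative; assumes arithmetic right shifts, which the MMI
operations guarantee):

#include <stdio.h>

static short mulhi(short a, short b) { return (short)(((int)a * (int)b) >> 16); }

int main(void)
{
  const short PW_F0402 = 26345;                 /* FIX(1.40200) - FIX(1.00000) */
  for (int cr = -128; cr <= 127; cr++) {
    short t = mulhi((short)(2 * cr), PW_F0402); /* (2*Cr * FIX(0.402)) >> 16 */
    t = (short)((t + 1) >> 1);                  /* rounding step from the patch */
    int r_minus_y = t + cr;                     /* + Cr  =>  ~1.402 * Cr */
    double ref = 1.40200 * cr;
    if (r_minus_y < ref - 1.0 || r_minus_y > ref + 1.0) {
      printf("off by >1 at cr=%d: %d vs %.2f\n", cr, r_minus_y, ref);
      return 1;
    }
  }
  puts("R-Y stays within 1 of 1.402*Cr for all Cr in [-128,127]");
  return 0;
}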
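
The {1, 2, 1, 2} bias in jsimd_h2v2_downsample_mmi vectorizes the alternating
rounding term from jcsample.c's scalar h2v2_downsample (the `bias ^= 3` loop
there): add 1 before the >> 2 on even output columns and 2 on odd ones, so
the truncation error averages out instead of skewing the image. Scalar
equivalent for one pair of 2x2 blocks (sample values are made up):

#include <stdio.h>

static unsigned char avg2x2(unsigned char a, unsigned char b,
                            unsigned char c, unsigned char d, int bias)
{
  return (unsigned char)((a + b + c + d + bias) >> 2);
}

int main(void)
{
  unsigned char row0[4] = { 10, 11, 12, 13 }, row1[4] = { 20, 21, 22, 23 };
  int bias = 1;                       /* alternates 1, 2, 1, 2, ... */
  for (int col = 0; col < 4; col += 2) {
    printf("%u ", avg2x2(row0[col], row0[col + 1],
                         row1[col], row1[col + 1], bias));
    bias ^= 3;                        /* 1 <-> 2 */
  }
  putchar('\n');
  return 0;
}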
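
PROCESS_ROW in jdsample-mmi.c is the horizontal half of the h2v2 "fancy"
(triangular) upsampling filter from jdsample.c: each stored column sum is
already 3 * this_row + adjacent_row, and each sum is then blended 3:1 with
its left or right neighbor, with the asymmetric rounding constants 8 and 7
taken straight from the scalar code (PW_EIGHT for the even output sample,
PW_SEVEN for the odd one). In scalar form, with made-up column sums:

#include <stdio.h>

int main(void)
{
  /* hypothetical column sums; each is already 3*this_row + adjacent_row */
  int lastcolsum = 52, thiscolsum = 60, nextcolsum = 72;

  int even = (thiscolsum * 3 + lastcolsum + 8) >> 4;  /* over the input pixel */
  int odd  = (thiscolsum * 3 + nextcolsum + 7) >> 4;  /* between pixels */

  printf("%d %d\n", even, odd);
  return 0;
}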
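
Two of the comments the patch adds to DO_QUANT in jquanti-mmi.c are worth
unpacking. "Branch-less absolute value" is the sign-mask idiom: an arithmetic
shift by 15 turns each word into -1 (negative) or 0, xor with that mask flips
the bits of negative values, and subtracting the mask adds the missing 1; the
identical xor/sub at the end restores the sign. The reciprocal note follows
from _mm_mulhi_pi16 being a signed multiply: a reciprocal with its MSB set
acts as (recip - 65536), so the product comes out short by exactly one copy
of the (by then non-negative) input, which the add puts back. Scalar form of
the absolute-value round trip (again assuming arithmetic right shift, as the
pi16 srai operation provides):

#include <stdio.h>

int main(void)
{
  short vals[] = { -12345, -1, 0, 1, 12345 };
  for (int i = 0; i < 5; i++) {
    short x = vals[i];
    short s = (short)(x >> 15);         /* all ones if x < 0, else all zeros */
    short a = (short)((x ^ s) - s);     /* |x|, no branch */
    short back = (short)((a ^ s) - s);  /* same idiom restores the sign */
    printf("x=%6d  |x|=%5d  restored=%6d\n", x, a, back);
  }
  return 0;
}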