#if RGB_RED == 0
-#define mmA mm0
-#define mmB mm1
+#define mmA re
+#define mmB ro
#elif RGB_GREEN == 0
-#define mmA mm2
-#define mmB mm3
+#define mmA ge
+#define mmB go
#elif RGB_BLUE == 0
-#define mmA mm4
-#define mmB mm5
+#define mmA be
+#define mmB bo
#else
-#define mmA mm6
-#define mmB mm7
+#define mmA xe
+#define mmB xo
#endif
#if RGB_RED == 1
-#define mmC mm0
-#define mmD mm1
+#define mmC re
+#define mmD ro
#elif RGB_GREEN == 1
-#define mmC mm2
-#define mmD mm3
+#define mmC ge
+#define mmD go
#elif RGB_BLUE == 1
-#define mmC mm4
-#define mmD mm5
+#define mmC be
+#define mmD bo
#else
-#define mmC mm6
-#define mmD mm7
+#define mmC xe
+#define mmD xo
#endif
#if RGB_RED == 2
-#define mmE mm0
-#define mmF mm1
+#define mmE re
+#define mmF ro
#elif RGB_GREEN == 2
-#define mmE mm2
-#define mmF mm3
+#define mmE ge
+#define mmF go
#elif RGB_BLUE == 2
-#define mmE mm4
-#define mmF mm5
+#define mmE be
+#define mmF bo
#else
-#define mmE mm6
-#define mmF mm7
+#define mmE xe
+#define mmF xo
#endif
#if RGB_RED == 3
-#define mmG mm0
-#define mmH mm1
+#define mmG re
+#define mmH ro
#elif RGB_GREEN == 3
-#define mmG mm2
-#define mmH mm3
+#define mmG ge
+#define mmH go
#elif RGB_BLUE == 3
-#define mmG mm4
-#define mmH mm5
+#define mmG be
+#define mmH bo
#else
-#define mmG mm6
-#define mmH mm7
+#define mmG xe
+#define mmH xo
#endif
{
JSAMPROW inptr, outptr0, outptr1, outptr2;
int num_cols, col;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- __m64 wk[7];
- __m64 Y_BG, Cb_RG, Cr_BG;
+ __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+ __m64 xo;
+#endif
+ __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+ __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+ __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+ __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+ __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+ __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+ __m64 crle, crhe, cre, crlo, crho, cro, cr;
while (--num_rows >= 0) {
inptr = *input_buf++;
}
inptr += RGB_PIXELSIZE * 8;
}
- mmD = mmA;
+ mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
- mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
mmA = _mm_unpackhi_pi8(mmA, mmG);
mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
mmD = _mm_unpacklo_pi8(mmD, mmF);
mmG = _mm_unpackhi_pi8(mmG, mmF);
- mmE = mmA;
+ mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
- mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
mmA = _mm_unpackhi_pi8(mmA, mmD);
mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
mmE = _mm_unpacklo_pi8(mmE, mmG);
mmD = _mm_unpackhi_pi8(mmD, mmG);
- mmC = mmA;
+ mmC = _mm_loadhi_pi8_f(mmA);
mmA = _mm_loadlo_pi8_f(mmA);
- mmC = _mm_loadhi_pi8_f(mmC);
- mmB = mmE;
+ mmB = _mm_loadhi_pi8_f(mmE);
mmE = _mm_loadlo_pi8_f(mmE);
- mmB = _mm_loadhi_pi8_f(mmB);
- mmF = mmD;
+ mmF = _mm_loadhi_pi8_f(mmD);
mmD = _mm_loadlo_pi8_f(mmD);
- mmF = _mm_loadhi_pi8_f(mmF);
#else /* RGB_PIXELSIZE == 4 */
mmC = _mm_load_si64((__m64 *)&inptr[24]);
inptr += RGB_PIXELSIZE * 8;
}
- mmB = mmA;
+ mmB = _mm_unpackhi_pi8(mmA, mmF);
mmA = _mm_unpacklo_pi8(mmA, mmF);
- mmB = _mm_unpackhi_pi8(mmB, mmF);
- mmG = mmD;
+ mmG = _mm_unpackhi_pi8(mmD, mmC);
mmD = _mm_unpacklo_pi8(mmD, mmC);
- mmG = _mm_unpackhi_pi8(mmG, mmC);
- mmE = mmA;
+ mmE = _mm_unpackhi_pi16(mmA, mmD);
mmA = _mm_unpacklo_pi16(mmA, mmD);
- mmE = _mm_unpackhi_pi16(mmE, mmD);
- mmH = mmB;
+ mmH = _mm_unpackhi_pi16(mmB, mmG);
mmB = _mm_unpacklo_pi16(mmB, mmG);
- mmH = _mm_unpackhi_pi16(mmH, mmG);
- mmC = mmA;
+ mmC = _mm_loadhi_pi8_f(mmA);
mmA = _mm_loadlo_pi8_f(mmA);
- mmC = _mm_loadhi_pi8_f(mmC);
- mmD = mmB;
+ mmD = _mm_loadhi_pi8_f(mmB);
mmB = _mm_loadlo_pi8_f(mmB);
- mmD = _mm_loadhi_pi8_f(mmD);
- mmG = mmE;
+ mmG = _mm_loadhi_pi8_f(mmE);
mmE = _mm_loadlo_pi8_f(mmE);
- mmG = _mm_loadhi_pi8_f(mmG);
- mmF = mmH;
- mmF = _mm_unpacklo_pi8(mmF, mmH);
+ mmF = _mm_unpacklo_pi8(mmH, mmH);
mmH = _mm_unpackhi_pi8(mmH, mmH);
mmF = _mm_srli_pi16(mmF, BYTE_BIT);
mmH = _mm_srli_pi16(mmH, BYTE_BIT);
#endif
- wk[0] = mm0;
- wk[1] = mm1;
- wk[2] = mm4;
- wk[3] = mm5;
-
- mm6 = mm1;
- mm1 = _mm_unpacklo_pi16(mm1, mm3);
- mm6 = _mm_unpackhi_pi16(mm6, mm3);
- mm7 = mm1;
- mm4 = mm6;
- mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
- mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
- mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
- mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
- wk[4] = mm1;
- wk[5] = mm6;
-
- mm1 = _mm_loadlo_pi16_f(mm5);
- mm6 = _mm_loadhi_pi16_f(mm5);
- mm1 = _mm_srli_pi32(mm1, 1);
- mm6 = _mm_srli_pi32(mm6, 1);
-
- mm5 = PD_ONEHALFM1_CJ;
- mm7 = _mm_add_pi32(mm7, mm1);
- mm4 = _mm_add_pi32(mm4, mm6);
- mm7 = _mm_add_pi32(mm7, mm5);
- mm4 = _mm_add_pi32(mm4, mm5);
- mm7 = _mm_srli_pi32(mm7, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm7 = _mm_packs_pi32(mm7, mm4);
-
- mm1 = wk[2];
- mm6 = mm0;
- mm0 = _mm_unpacklo_pi16(mm0, mm2);
- mm6 = _mm_unpackhi_pi16(mm6, mm2);
- mm5 = mm0;
- mm4 = mm6;
- mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
- mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
- mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
- mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
- wk[6] = mm0;
- wk[7] = mm6;
- mm0 = _mm_loadlo_pi16_f(mm1);
- mm6 = _mm_loadhi_pi16_f(mm1);
- mm0 = _mm_srli_pi32(mm0, 1);
- mm6 = _mm_srli_pi32(mm6, 1);
-
- mm1 = PD_ONEHALFM1_CJ;
- mm5 = _mm_add_pi32(mm5, mm0);
- mm4 = _mm_add_pi32(mm4, mm6);
- mm5 = _mm_add_pi32(mm5, mm1);
- mm4 = _mm_add_pi32(mm4, mm1);
- mm5 = _mm_srli_pi32(mm5, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm5 = _mm_packs_pi32(mm5, mm4);
-
- mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
- mm5 = _mm_or_si64(mm5, mm7);
- Cb_RG = mm5;
-
- mm0 = wk[3];
- mm6 = wk[2];
- mm1 = wk[1];
-
- mm4 = mm0;
- mm0 = _mm_unpacklo_pi16(mm0, mm3);
- mm4 = _mm_unpackhi_pi16(mm4, mm3);
- mm7 = mm0;
- mm5 = mm4;
- mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
- mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
- mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
- mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
- mm3 = PD_ONEHALF;
- mm0 = _mm_add_pi32(mm0, wk[4]);
- mm4 = _mm_add_pi32(mm4, wk[5]);
- mm0 = _mm_add_pi32(mm0, mm3);
- mm4 = _mm_add_pi32(mm4, mm3);
- mm0 = _mm_srli_pi32(mm0, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm0 = _mm_packs_pi32(mm0, mm4);
-
- mm3 = _mm_loadlo_pi16_f(mm1);
- mm4 = _mm_loadhi_pi16_f(mm1);
- mm3 = _mm_srli_pi32(mm3, 1);
- mm4 = _mm_srli_pi32(mm4, 1);
-
- mm1 = PD_ONEHALFM1_CJ;
- mm7 = _mm_add_pi32(mm7, mm3);
- mm5 = _mm_add_pi32(mm5, mm4);
- mm7 = _mm_add_pi32(mm7, mm1);
- mm5 = _mm_add_pi32(mm5, mm1);
- mm7 = _mm_srli_pi32(mm7, SCALEBITS);
- mm5 = _mm_srli_pi32(mm5, SCALEBITS);
- mm7 = _mm_packs_pi32(mm7, mm5);
-
- mm3 = wk[0];
- mm4 = mm6;
- mm6 = _mm_unpacklo_pi16(mm6, mm2);
- mm4 = _mm_unpackhi_pi16(mm4, mm2);
- mm1 = mm6;
- mm5 = mm4;
- mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
- mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
- mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
- mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
- mm2 = PD_ONEHALF;
- mm6 = _mm_add_pi32(mm6, wk[6]);
- mm4 = _mm_add_pi32(mm4, wk[7]);
- mm6 = _mm_add_pi32(mm6, mm2);
- mm4 = _mm_add_pi32(mm4, mm2);
- mm6 = _mm_srli_pi32(mm6, SCALEBITS);
- mm4 = _mm_srli_pi32(mm4, SCALEBITS);
- mm6 = _mm_packs_pi32(mm6, mm4);
-
- mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
- mm6 = _mm_or_si64(mm6, mm0);
- Y_BG = mm6;
-
- mm2 = _mm_loadlo_pi16_f(mm3);
- mm4 = _mm_loadhi_pi16_f(mm3);
- mm2 = _mm_srli_pi32(mm2, 1);
- mm4 = _mm_srli_pi32(mm4, 1);
-
- mm0 = PD_ONEHALFM1_CJ;
- mm1 = _mm_add_pi32(mm1, mm2);
- mm5 = _mm_add_pi32(mm5, mm4);
- mm1 = _mm_add_pi32(mm1, mm0);
- mm5 = _mm_add_pi32(mm5, mm0);
- mm1 = _mm_srli_pi32(mm1, SCALEBITS);
- mm5 = _mm_srli_pi32(mm5, SCALEBITS);
- mm1 = _mm_packs_pi32(mm1, mm5);
-
- mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
- mm1 = _mm_or_si64(mm1, mm7);
- Cr_BG = mm1;
-
- _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
- _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
- _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
+ /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+ * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+ *
+ * (Original)
+ * Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+ *
+ * (This implementation)
+ * Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+ * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+ * Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
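+ *
+ * (G's 0.58700 coefficient is split into 0.33700 + 0.25000 so that each
+ * _mm_madd_pi16() below consumes one (R, G) or (B, G) word pair; the two
+ * partial Y sums are recombined when yle/ylo etc. are formed.)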
+ */
+
+ rglo = _mm_unpacklo_pi16(ro, go);
+ rgho = _mm_unpackhi_pi16(ro, go);
+ ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+ yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+ cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
+ cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+ blo = _mm_loadlo_pi16_f(bo);
+ bho = _mm_loadhi_pi16_f(bo);
+ halfblo = _mm_srli_pi32(blo, 1);
+ halfbho = _mm_srli_pi32(bho, 1);
+
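+ /* halfblo/halfbho = B * 2^(SCALEBITS - 1), i.e. 0.50000 * B in the same */
+ /* fixed-point scale as the _mm_madd_pi16() products above; adding them */
+ /* and PD_ONEHALFM1_CJ (presumably the rounding constant plus */
+ /* CENTERJSAMPLE in that scale) completes the Cb formula before the */
+ /* shift by SCALEBITS. */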
+ cblo = _mm_add_pi32(cblo, halfblo);
+ cbho = _mm_add_pi32(cbho, halfbho);
+ cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
+ cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+ cblo = _mm_srli_pi32(cblo, SCALEBITS);
+ cbho = _mm_srli_pi32(cbho, SCALEBITS);
+ cbo = _mm_packs_pi32(cblo, cbho);
+
+ rgle = _mm_unpacklo_pi16(re, ge);
+ rghe = _mm_unpackhi_pi16(re, ge);
+ yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+ yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+ cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+ cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+ ble = _mm_loadlo_pi16_f(be);
+ bhe = _mm_loadhi_pi16_f(be);
+ halfble = _mm_srli_pi32(ble, 1);
+ halfbhe = _mm_srli_pi32(bhe, 1);
+
+ cble = _mm_add_pi32(cble, halfble);
+ cbhe = _mm_add_pi32(cbhe, halfbhe);
+ cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+ cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+ cble = _mm_srli_pi32(cble, SCALEBITS);
+ cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+ cbe = _mm_packs_pi32(cble, cbhe);
+
+ cbo = _mm_slli_pi16(cbo, BYTE_BIT);
+ cb = _mm_or_si64(cbe, cbo);
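+ /* cbe holds the even-numbered columns in the low byte of each word and */
+ /* cbo (shifted left by BYTE_BIT) the odd columns, so cb now contains the */
+ /* 8 output Cb bytes in order; y and cr are interleaved the same way below. */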
+
+ bglo = _mm_unpacklo_pi16(bo, go);
+ bgho = _mm_unpackhi_pi16(bo, go);
+ ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+ yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+ crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
+ crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+ ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+ yho = _mm_add_pi32(yho_bg, yho_rg);
+ ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+ yho = _mm_add_pi32(yho, PD_ONEHALF);
+ ylo = _mm_srli_pi32(ylo, SCALEBITS);
+ yho = _mm_srli_pi32(yho, SCALEBITS);
+ yo = _mm_packs_pi32(ylo, yho);
+
+ rlo = _mm_loadlo_pi16_f(ro);
+ rho = _mm_loadhi_pi16_f(ro);
+ halfrlo = _mm_srli_pi32(rlo, 1);
+ halfrho = _mm_srli_pi32(rho, 1);
+
+ crlo = _mm_add_pi32(crlo, halfrlo);
+ crho = _mm_add_pi32(crho, halfrho);
+ crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+ crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+ crlo = _mm_srli_pi32(crlo, SCALEBITS);
+ crho = _mm_srli_pi32(crho, SCALEBITS);
+ cro = _mm_packs_pi32(crlo, crho);
+
+ bgle = _mm_unpacklo_pi16(be, ge);
+ bghe = _mm_unpackhi_pi16(be, ge);
+ yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+ yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+ crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+ crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+ yle = _mm_add_pi32(yle_bg, yle_rg);
+ yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+ yle = _mm_add_pi32(yle, PD_ONEHALF);
+ yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+ yle = _mm_srli_pi32(yle, SCALEBITS);
+ yhe = _mm_srli_pi32(yhe, SCALEBITS);
+ ye = _mm_packs_pi32(yle, yhe);
+
+ yo = _mm_slli_pi16(yo, BYTE_BIT);
+ y = _mm_or_si64(ye, yo);
+
+ rle = _mm_loadlo_pi16_f(re);
+ rhe = _mm_loadhi_pi16_f(re);
+ halfrle = _mm_srli_pi32(rle, 1);
+ halfrhe = _mm_srli_pi32(rhe, 1);
+
+ crle = _mm_add_pi32(crle, halfrle);
+ crhe = _mm_add_pi32(crhe, halfrhe);
+ crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+ crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+ crle = _mm_srli_pi32(crle, SCALEBITS);
+ crhe = _mm_srli_pi32(crhe, SCALEBITS);
+ cre = _mm_packs_pi32(crle, crhe);
+
+ cro = _mm_slli_pi16(cro, BYTE_BIT);
+ cr = _mm_or_si64(cre, cro);
+
+ _mm_store_si64((__m64 *)&outptr0[0], y);
+ _mm_store_si64((__m64 *)&outptr1[0], cb);
+ _mm_store_si64((__m64 *)&outptr2[0], cr);
}
}
}
/*
* Loongson MMI optimizations for libjpeg-turbo
*
- * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
JDIMENSION width_in_blocks,
JSAMPARRAY input_data, JSAMPARRAY output_data)
{
- int inrow, outrow, outcol, bias;
+ int inrow, outrow, outcol;
JDIMENSION output_cols = width_in_blocks * DCTSIZE;
JSAMPROW inptr0, inptr1, outptr;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7;
+ __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+ __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+ __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
expand_right_edge(input_data, max_v_samp_factor, image_width,
output_cols * 2);
- bias = (1 << 17) + 1; /* 0x00020001 (bias pattern) */
- mm7 = _mm_set1_pi32(bias); /* mm7={1, 2, 1, 2} */
- mm6 = _mm_cmpeq_pi16(mm6, mm6);
- mm6 = _mm_srli_pi16(mm6, BYTE_BIT); /* mm6={0xFF 0x00 0xFF 0x00 ..} */
+ bias = _mm_set1_pi32((1 << 17) + 1); /* 0x00020001 (32-bit bias pattern) */
+ /* bias={1, 2, 1, 2} (16-bit) */
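+ /* The alternating 1,2 bias provides unbiased rounding when each 2x2 pixel */
+ /* sum is divided by 4 below (same pattern as the scalar h2v2 downsampler). */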
+ mask = _mm_cmpeq_pi16(mask, mask);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
for (inrow = 0, outrow = 0; outrow < v_samp_factor;
inrow += 2, outrow++) {
for (outcol = output_cols; outcol > 0;
outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
- mm0 = _mm_load_si64((__m64 *)&inptr0[0]);
- mm1 = _mm_load_si64((__m64 *)&inptr1[0]);
- mm2 = _mm_load_si64((__m64 *)&inptr0[8]);
- mm3 = _mm_load_si64((__m64 *)&inptr1[8]);
+ this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+ this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+ next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+ next1 = _mm_load_si64((__m64 *)&inptr1[8]);
- mm4 = mm0;
- mm5 = mm1;
- mm0 = _mm_and_si64(mm0, mm6);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
- mm1 = _mm_and_si64(mm1, mm6);
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
- mm0 = _mm_add_pi16(mm0, mm4);
- mm1 = _mm_add_pi16(mm1, mm5);
+ this0o = _mm_and_si64(this0, mask);
+ this0e = _mm_srli_pi16(this0, BYTE_BIT);
+ this1o = _mm_and_si64(this1, mask);
+ this1e = _mm_srli_pi16(this1, BYTE_BIT);
+ this0sum = _mm_add_pi16(this0o, this0e);
+ this1sum = _mm_add_pi16(this1o, this1e);
- mm4 = mm2;
- mm5 = mm3;
- mm2 = _mm_and_si64(mm2, mm6);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
- mm3 = _mm_and_si64(mm3, mm6);
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
- mm2 = _mm_add_pi16(mm2, mm4);
- mm3 = _mm_add_pi16(mm3, mm5);
+ next0o = _mm_and_si64(next0, mask);
+ next0e = _mm_srli_pi16(next0, BYTE_BIT);
+ next1o = _mm_and_si64(next1, mask);
+ next1e = _mm_srli_pi16(next1, BYTE_BIT);
+ next0sum = _mm_add_pi16(next0o, next0e);
+ next1sum = _mm_add_pi16(next1o, next1e);
- mm0 = _mm_add_pi16(mm0, mm1);
- mm2 = _mm_add_pi16(mm2, mm3);
- mm0 = _mm_add_pi16(mm0, mm7);
- mm2 = _mm_add_pi16(mm2, mm7);
- mm0 = _mm_srli_pi16(mm0, 2);
- mm2 = _mm_srli_pi16(mm2, 2);
+ thisavg = _mm_add_pi16(this0sum, this1sum);
+ nextavg = _mm_add_pi16(next0sum, next1sum);
+ thisavg = _mm_add_pi16(thisavg, bias);
+ nextavg = _mm_add_pi16(nextavg, bias);
+ thisavg = _mm_srli_pi16(thisavg, 2);
+ nextavg = _mm_srli_pi16(nextavg, 2);
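+ /* thisavg/nextavg = (sum of each 2x2 pixel block + bias) / 4 for output */
+ /* columns 0-3 and 4-7 respectively */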
- mm0 = _mm_packs_pu16(mm0, mm2);
+ avg = _mm_packs_pu16(thisavg, nextavg);
- _mm_store_si64((__m64 *)&outptr[0], mm0);
+ _mm_store_si64((__m64 *)&outptr[0], avg);
}
}
}
#if RGB_RED == 0
-#define mmA mm0
-#define mmB mm1
+#define mmA re
+#define mmB ro
#elif RGB_GREEN == 0
-#define mmA mm2
-#define mmB mm3
+#define mmA ge
+#define mmB go
#elif RGB_BLUE == 0
-#define mmA mm4
-#define mmB mm5
+#define mmA be
+#define mmB bo
#else
-#define mmA mm6
-#define mmB mm7
+#define mmA xe
+#define mmB xo
#endif
#if RGB_RED == 1
-#define mmC mm0
-#define mmD mm1
+#define mmC re
+#define mmD ro
#elif RGB_GREEN == 1
-#define mmC mm2
-#define mmD mm3
+#define mmC ge
+#define mmD go
#elif RGB_BLUE == 1
-#define mmC mm4
-#define mmD mm5
+#define mmC be
+#define mmD bo
#else
-#define mmC mm6
-#define mmD mm7
+#define mmC xe
+#define mmD xo
#endif
#if RGB_RED == 2
-#define mmE mm0
-#define mmF mm1
+#define mmE re
+#define mmF ro
#elif RGB_GREEN == 2
-#define mmE mm2
-#define mmF mm3
+#define mmE ge
+#define mmF go
#elif RGB_BLUE == 2
-#define mmE mm4
-#define mmF mm5
+#define mmE be
+#define mmF bo
#else
-#define mmE mm6
-#define mmF mm7
+#define mmE xe
+#define mmF xo
#endif
#if RGB_RED == 3
-#define mmG mm0
-#define mmH mm1
+#define mmG re
+#define mmH ro
#elif RGB_GREEN == 3
-#define mmG mm2
-#define mmH mm3
+#define mmG ge
+#define mmH go
#elif RGB_BLUE == 3
-#define mmG mm4
-#define mmH mm5
+#define mmG be
+#define mmH bo
#else
-#define mmG mm6
-#define mmH mm7
+#define mmG xe
+#define mmH xo
#endif
{
JSAMPROW outptr, inptr0, inptr1, inptr2;
int num_cols, col;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- __m64 mm8, wk[2];
+ __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+ __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+ __m64 decenter, mask;
while (--num_rows >= 0) {
inptr0 = input_buf[0][input_row];
for (num_cols = out_width; num_cols > 0; num_cols -= 8,
inptr0 += 8, inptr1 += 8, inptr2 += 8) {
- mm5 = _mm_load_si64((__m64 *)inptr1);
- mm1 = _mm_load_si64((__m64 *)inptr2);
- mm8 = _mm_load_si64((__m64 *)inptr0);
- mm4 = 0;
- mm7 = 0;
- mm4 = _mm_cmpeq_pi16(mm4, mm4);
- mm7 = _mm_cmpeq_pi16(mm7, mm7);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
- mm7 = _mm_slli_pi16(mm7, 7); /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */
- mm0 = mm4; /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */
-
- mm4 = _mm_and_si64(mm4, mm5); /* mm4=Cb(0246)=CbE */
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Cb(1357)=CbO */
- mm0 = _mm_and_si64(mm0, mm1); /* mm0=Cr(0246)=CrE */
- mm1 = _mm_srli_pi16(mm1, BYTE_BIT); /* mm1=Cr(1357)=CrO */
- mm4 = _mm_add_pi16(mm4, mm7);
- mm5 = _mm_add_pi16(mm5, mm7);
- mm0 = _mm_add_pi16(mm0, mm7);
- mm1 = _mm_add_pi16(mm1, mm7);
+ cb = _mm_load_si64((__m64 *)inptr1);
+ cr = _mm_load_si64((__m64 *)inptr2);
+ y = _mm_load_si64((__m64 *)inptr0);
+
+ mask = decenter = 0.0;
+ mask = _mm_cmpeq_pi16(mask, mask);
+ decenter = _mm_cmpeq_pi16(decenter, decenter);
+ mask = _mm_srli_pi16(mask, BYTE_BIT); /* {0xFF 0x00 0xFF 0x00 ..} */
+ decenter = _mm_slli_pi16(decenter, 7); /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+ cbe = _mm_and_si64(mask, cb); /* Cb(0246) */
+ cbo = _mm_srli_pi16(cb, BYTE_BIT); /* Cb(1357) */
+ cre = _mm_and_si64(mask, cr); /* Cr(0246) */
+ cro = _mm_srli_pi16(cr, BYTE_BIT); /* Cr(1357) */
+ cbe = _mm_add_pi16(cbe, decenter);
+ cbo = _mm_add_pi16(cbo, decenter);
+ cre = _mm_add_pi16(cre, decenter);
+ cro = _mm_add_pi16(cro, decenter);
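+ /* Cb and Cr are now centered around zero: 0xFF80 is -CENTERJSAMPLE (-128) */
+ /* as a signed 16-bit value. */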
/* (Original)
* R = Y + 1.40200 * Cr
* G = Y - 0.34414 * Cb - 0.71414 * Cr
* B = Y + 1.77200 * Cb
*
* (This implementation)
* R = Y + 0.40200 * Cr + Cr
* G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
* B = Y - 0.22800 * Cb + Cb + Cb
*/
- mm2 = mm4; /* mm2 = CbE */
- mm3 = mm5; /* mm3 = CbO */
- mm4 = _mm_add_pi16(mm4, mm4); /* mm4 = 2*CbE */
- mm5 = _mm_add_pi16(mm5, mm5); /* mm5 = 2*CbO */
- mm6 = mm0; /* mm6 = CrE */
- mm7 = mm1; /* mm7 = CrO */
- mm0 = _mm_add_pi16(mm0, mm0); /* mm0 = 2*CrE */
- mm1 = _mm_add_pi16(mm1, mm1); /* mm1 = 2*CrO */
-
- mm4 = _mm_mulhi_pi16(mm4, PW_MF0228); /* mm4=(2*CbE * -FIX(0.22800) */
- mm5 = _mm_mulhi_pi16(mm5, PW_MF0228); /* mm5=(2*CbO * -FIX(0.22800) */
- mm0 = _mm_mulhi_pi16(mm0, PW_F0402); /* mm0=(2*CrE * FIX(0.40200)) */
- mm1 = _mm_mulhi_pi16(mm1, PW_F0402); /* mm1=(2*CrO * FIX(0.40200)) */
-
- mm4 = _mm_add_pi16(mm4, PW_ONE);
- mm5 = _mm_add_pi16(mm5, PW_ONE);
- mm4 = _mm_srai_pi16(mm4, 1); /* mm4=(CbE * -FIX(0.22800)) */
- mm5 = _mm_srai_pi16(mm5, 1); /* mm5=(CbO * -FIX(0.22800)) */
- mm0 = _mm_add_pi16(mm0, PW_ONE);
- mm1 = _mm_add_pi16(mm1, PW_ONE);
- mm0 = _mm_srai_pi16(mm0, 1); /* mm0=(CrE * FIX(0.40200)) */
- mm1 = _mm_srai_pi16(mm1, 1); /* mm1=(CrO * FIX(0.40200)) */
-
- mm4 = _mm_add_pi16(mm4, mm2);
- mm5 = _mm_add_pi16(mm5, mm3);
- mm4 = _mm_add_pi16(mm4, mm2); /* mm4=(CbE * FIX(1.77200))=(B-Y)E */
- mm5 = _mm_add_pi16(mm5, mm3); /* mm5=(CbO * FIX(1.77200))=(B-Y)O */
- mm0 = _mm_add_pi16(mm0, mm6); /* mm0=(CrE * FIX(1.40200))=(R-Y)E */
- mm1 = _mm_add_pi16(mm1, mm7); /* mm1=(CrO * FIX(1.40200))=(R-Y)O */
-
- wk[0] = mm4; /* wk(0)=(B-Y)E */
- wk[1] = mm5; /* wk(1)=(B-Y)O */
-
- mm4 = mm2;
- mm5 = mm3;
- mm2 = _mm_unpacklo_pi16(mm2, mm6);
- mm4 = _mm_unpackhi_pi16(mm4, mm6);
- mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285);
- mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285);
- mm3 = _mm_unpacklo_pi16(mm3, mm7);
- mm5 = _mm_unpackhi_pi16(mm5, mm7);
- mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285);
- mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285);
-
- mm2 = _mm_add_pi32(mm2, PD_ONEHALF);
- mm4 = _mm_add_pi32(mm4, PD_ONEHALF);
- mm2 = _mm_srai_pi32(mm2, SCALEBITS);
- mm4 = _mm_srai_pi32(mm4, SCALEBITS);
- mm3 = _mm_add_pi32(mm3, PD_ONEHALF);
- mm5 = _mm_add_pi32(mm5, PD_ONEHALF);
- mm3 = _mm_srai_pi32(mm3, SCALEBITS);
- mm5 = _mm_srai_pi32(mm5, SCALEBITS);
-
- mm2 = _mm_packs_pi32(mm2, mm4); /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */
- mm3 = _mm_packs_pi32(mm3, mm5); /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */
- mm2 = _mm_sub_pi16(mm2, mm6); /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
- mm3 = _mm_sub_pi16(mm3, mm7); /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
-
- mm5 = mm8; /* mm5=Y(01234567) */
-
- mm4 = _mm_cmpeq_pi16(mm4, mm4);
- mm4 = _mm_srli_pi16(mm4, BYTE_BIT); /* mm4={0xFF 0x00 0xFF 0x00 ..} */
- mm4 = _mm_and_si64(mm4, mm5); /* mm4=Y(0246)=YE */
- mm5 = _mm_srli_pi16(mm5, BYTE_BIT); /* mm5=Y(1357)=YO */
-
- mm0 = _mm_add_pi16(mm0, mm4); /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */
- mm1 = _mm_add_pi16(mm1, mm5); /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */
- mm0 = _mm_packs_pu16(mm0, mm0); /* mm0=(R0 R2 R4 R6 ** ** ** **) */
- mm1 = _mm_packs_pu16(mm1, mm1); /* mm1=(R1 R3 R5 R7 ** ** ** **) */
-
- mm2 = _mm_add_pi16(mm2, mm4); /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */
- mm3 = _mm_add_pi16(mm3, mm5); /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */
- mm2 = _mm_packs_pu16(mm2, mm2); /* mm2=(G0 G2 G4 G6 ** ** ** **) */
- mm3 = _mm_packs_pu16(mm3, mm3); /* mm3=(G1 G3 G5 G7 ** ** ** **) */
-
- mm4 = _mm_add_pi16(mm4, wk[0]); /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */
- mm5 = _mm_add_pi16(mm5, wk[1]); /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */
- mm4 = _mm_packs_pu16(mm4, mm4); /* mm4=(B0 B2 B4 B6 ** ** ** **) */
- mm5 = _mm_packs_pu16(mm5, mm5); /* mm5=(B1 B3 B5 B7 ** ** ** **) */
+ cbe2 = _mm_add_pi16(cbe, cbe); /* 2*CbE */
+ cbo2 = _mm_add_pi16(cbo, cbo); /* 2*CbO */
+ cre2 = _mm_add_pi16(cre, cre); /* 2*CrE */
+ cro2 = _mm_add_pi16(cro, cro); /* 2*CrO */
+
+ be = _mm_mulhi_pi16(cbe2, PW_MF0228); /* (2*CbE * -FIX(0.22800)) */
+ bo = _mm_mulhi_pi16(cbo2, PW_MF0228); /* (2*CbO * -FIX(0.22800)) */
+ re = _mm_mulhi_pi16(cre2, PW_F0402); /* (2*CrE * FIX(0.40200)) */
+ ro = _mm_mulhi_pi16(cro2, PW_F0402); /* (2*CrO * FIX(0.40200)) */
+
+ be = _mm_add_pi16(be, PW_ONE);
+ bo = _mm_add_pi16(bo, PW_ONE);
+ be = _mm_srai_pi16(be, 1); /* (CbE * -FIX(0.22800)) */
+ bo = _mm_srai_pi16(bo, 1); /* (CbO * -FIX(0.22800)) */
+ re = _mm_add_pi16(re, PW_ONE);
+ ro = _mm_add_pi16(ro, PW_ONE);
+ re = _mm_srai_pi16(re, 1); /* (CrE * FIX(0.40200)) */
+ ro = _mm_srai_pi16(ro, 1); /* (CrO * FIX(0.40200)) */
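+ /* Doubling Cb/Cr before _mm_mulhi_pi16() and halving afterwards with */
+ /* (x + 1) >> 1 keeps an extra bit of precision, so the products above are */
+ /* rounded rather than truncated. */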
+
+ be = _mm_add_pi16(be, cbe);
+ bo = _mm_add_pi16(bo, cbo);
+ be = _mm_add_pi16(be, cbe); /* (CbE * FIX(1.77200))=(B-Y)E */
+ bo = _mm_add_pi16(bo, cbo); /* (CbO * FIX(1.77200))=(B-Y)O */
+ re = _mm_add_pi16(re, cre); /* (CrE * FIX(1.40200))=(R-Y)E */
+ ro = _mm_add_pi16(ro, cro); /* (CrO * FIX(1.40200))=(R-Y)O */
+
+ gle = _mm_unpacklo_pi16(cbe, cre);
+ ghe = _mm_unpackhi_pi16(cbe, cre);
+ gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+ ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+ glo = _mm_unpacklo_pi16(cbo, cro);
+ gho = _mm_unpackhi_pi16(cbo, cro);
+ glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+ gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+ gle = _mm_add_pi32(gle, PD_ONEHALF);
+ ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+ gle = _mm_srai_pi32(gle, SCALEBITS);
+ ghe = _mm_srai_pi32(ghe, SCALEBITS);
+ glo = _mm_add_pi32(glo, PD_ONEHALF);
+ gho = _mm_add_pi32(gho, PD_ONEHALF);
+ glo = _mm_srai_pi32(glo, SCALEBITS);
+ gho = _mm_srai_pi32(gho, SCALEBITS);
+
+ ge = _mm_packs_pi32(gle, ghe); /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+ go = _mm_packs_pi32(glo, gho); /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+ ge = _mm_sub_pi16(ge, cre); /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+ go = _mm_sub_pi16(go, cro); /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
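+ /* Subtracting Cr flips FIX(0.285) * Cr to -FIX(0.714) * Cr, so ge/go match */
+ /* G - Y = -0.34414 * Cb - 0.71414 * Cr from the formula above. */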
+
+ ye = _mm_and_si64(mask, y); /* Y(0246) */
+ yo = _mm_srli_pi16(y, BYTE_BIT); /* Y(1357) */
+
+ re = _mm_add_pi16(re, ye); /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+ ro = _mm_add_pi16(ro, yo); /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+ re = _mm_packs_pu16(re, re); /* (R0 R2 R4 R6 ** ** ** **) */
+ ro = _mm_packs_pu16(ro, ro); /* (R1 R3 R5 R7 ** ** ** **) */
+
+ ge = _mm_add_pi16(ge, ye); /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+ go = _mm_add_pi16(go, yo); /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+ ge = _mm_packs_pu16(ge, ge); /* (G0 G2 G4 G6 ** ** ** **) */
+ go = _mm_packs_pu16(go, go); /* (G1 G3 G5 G7 ** ** ** **) */
+
+ be = _mm_add_pi16(be, ye); /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+ bo = _mm_add_pi16(bo, yo); /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+ be = _mm_packs_pu16(be, be); /* (B0 B2 B4 B6 ** ** ** **) */
+ bo = _mm_packs_pu16(bo, bo); /* (B1 B3 B5 B7 ** ** ** **) */
#if RGB_PIXELSIZE == 3
/* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
/* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
- mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */
- mmE = _mm_unpacklo_pi8(mmE, mmB); /* mmE=(20 01 22 03 24 05 26 07) */
- mmD = _mm_unpacklo_pi8(mmD, mmF); /* mmD=(11 21 13 23 15 25 17 27) */
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmB); /* (20 01 22 03 24 05 26 07) */
+ mmD = _mm_unpacklo_pi8(mmD, mmF); /* (11 21 13 23 15 25 17 27) */
- mmG = mmA;
- mmH = mmA;
- mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 01 02 12 22 03) */
- mmG = _mm_unpackhi_pi16(mmG, mmE); /* mmG=(04 14 24 05 06 16 26 07) */
+ mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
- mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT);
- mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+ mmG = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 05 06 16 26 07) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 01 02 12 22 03) */
- mmC = mmD;
- mmB = mmD;
- mmD = _mm_unpacklo_pi16(mmD, mmH); /* mmD=(11 21 02 12 13 23 04 14) */
- mmC = _mm_unpackhi_pi16(mmC, mmH); /* mmC=(15 25 06 16 17 27 -- --) */
+ mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+ mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT); /* (13 23 15 25 17 27 -- --) */
- mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */
+ mmC = _mm_unpackhi_pi16(mmD, mmH); /* (15 25 06 16 17 27 -- --) */
+ mmD = _mm_unpacklo_pi16(mmD, mmH); /* (11 21 02 12 13 23 04 14) */
- mmF = mmE;
- mmE = _mm_unpacklo_pi16(mmE, mmB); /* mmE=(22 03 13 23 24 05 15 25) */
- mmF = _mm_unpackhi_pi16(mmF, mmB); /* mmF=(26 07 17 27 -- -- -- --) */
+ mmF = _mm_unpackhi_pi16(mmE, mmB); /* (26 07 17 27 -- -- -- --) */
+ mmE = _mm_unpacklo_pi16(mmE, mmB); /* (22 03 13 23 24 05 15 25) */
- mmA = _mm_unpacklo_pi32(mmA, mmD); /* mmA=(00 10 20 01 11 21 02 12) */
- mmE = _mm_unpacklo_pi32(mmE, mmG); /* mmE=(22 03 13 23 04 14 24 05) */
- mmC = _mm_unpacklo_pi32(mmC, mmF); /* mmC=(15 25 06 16 26 07 17 27) */
+ mmA = _mm_unpacklo_pi32(mmA, mmD); /* (00 10 20 01 11 21 02 12) */
+ mmE = _mm_unpacklo_pi32(mmE, mmG); /* (22 03 13 23 04 14 24 05) */
+ mmC = _mm_unpacklo_pi32(mmC, mmF); /* (15 25 06 16 26 07 17 27) */
if (num_cols >= 8) {
if (!(((long)outptr) & 7)) {
#else /* RGB_PIXELSIZE == 4 */
#ifdef RGBX_FILLER_0XFF
- mm6 = _mm_cmpeq_pi8(mm6, mm6);
- mm7 = _mm_cmpeq_pi8(mm7, mm7);
+ xe = _mm_cmpeq_pi8(xe, xe);
+ xo = _mm_cmpeq_pi8(xo, xo);
#else
- mm6 = _mm_xor_si64(mm6, mm6);
- mm7 = _mm_xor_si64(mm7, mm7);
+ xe = _mm_xor_si64(xe, xe);
+ xo = _mm_xor_si64(xo, xo);
#endif
/* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
/* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
/* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
/* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
- mmA = _mm_unpacklo_pi8(mmA, mmC); /* mmA=(00 10 02 12 04 14 06 16) */
- mmE = _mm_unpacklo_pi8(mmE, mmG); /* mmE=(20 30 22 32 24 34 26 36) */
- mmB = _mm_unpacklo_pi8(mmB, mmD); /* mmB=(01 11 03 13 05 15 07 17) */
- mmF = _mm_unpacklo_pi8(mmF, mmH); /* mmF=(21 31 23 33 25 35 27 37) */
-
- mmC = mmA;
- mmA = _mm_unpacklo_pi16(mmA, mmE); /* mmA=(00 10 20 30 02 12 22 32) */
- mmC = _mm_unpackhi_pi16(mmC, mmE); /* mmC=(04 14 24 34 06 16 26 36) */
- mmG = mmB;
- mmB = _mm_unpacklo_pi16(mmB, mmF); /* mmB=(01 11 21 31 03 13 23 33) */
- mmG = _mm_unpackhi_pi16(mmG, mmF); /* mmG=(05 15 25 35 07 17 27 37) */
-
- mmD = mmA;
- mmA = _mm_unpacklo_pi32(mmA, mmB); /* mmA=(00 10 20 30 01 11 21 31) */
- mmD = _mm_unpackhi_pi32(mmD, mmB); /* mmD=(02 12 22 32 03 13 23 33) */
- mmH = mmC;
- mmC = _mm_unpacklo_pi32(mmC, mmG); /* mmC=(04 14 24 34 05 15 25 35) */
- mmH = _mm_unpackhi_pi32(mmH, mmG); /* mmH=(06 16 26 36 07 17 27 37) */
+ mmA = _mm_unpacklo_pi8(mmA, mmC); /* (00 10 02 12 04 14 06 16) */
+ mmE = _mm_unpacklo_pi8(mmE, mmG); /* (20 30 22 32 24 34 26 36) */
+ mmB = _mm_unpacklo_pi8(mmB, mmD); /* (01 11 03 13 05 15 07 17) */
+ mmF = _mm_unpacklo_pi8(mmF, mmH); /* (21 31 23 33 25 35 27 37) */
+
+ mmC = _mm_unpackhi_pi16(mmA, mmE); /* (04 14 24 34 06 16 26 36) */
+ mmA = _mm_unpacklo_pi16(mmA, mmE); /* (00 10 20 30 02 12 22 32) */
+ mmG = _mm_unpackhi_pi16(mmB, mmF); /* (05 15 25 35 07 17 27 37) */
+ mmB = _mm_unpacklo_pi16(mmB, mmF); /* (01 11 21 31 03 13 23 33) */
+
+ mmD = _mm_unpackhi_pi32(mmA, mmB); /* (02 12 22 32 03 13 23 33) */
+ mmA = _mm_unpacklo_pi32(mmA, mmB); /* (00 10 20 30 01 11 21 31) */
+ mmH = _mm_unpackhi_pi32(mmC, mmG); /* (06 16 26 36 07 17 27 37) */
+ mmC = _mm_unpacklo_pi32(mmC, mmG); /* (04 14 24 34 05 15 25 35) */
if (num_cols >= 8) {
_mm_store_si64((__m64 *)outptr, mmA);
/*
* Loongson MMI optimizations for libjpeg-turbo
*
- * Copyright (C) 2015, 2018, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2015, 2018-2019, D. R. Commander. All Rights Reserved.
* Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
* All Rights Reserved.
* Authors: ZhuChen <zhuchen@loongson.cn>
#define PROCESS_ROW(r) { \
- mm7 = _mm_load_si64((__m64 *)outptr##r); /* mm7=IntrL=( 0 1 2 3) */ \
- mm3 = _mm_load_si64((__m64 *)outptr##r + 1); /* mm3=IntrH=( 4 5 6 7) */ \
+ __m64 samp0123, samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+ __m64 samp4567, sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+ __m64 outle, outhe, outlo, outho, outl, outh; \
\
- mm0 = mm7; \
- mm4 = mm3; \
- mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT); /* mm0=( 1 2 3 -) */ \
- mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
- mm5 = mm7; \
- mm6 = mm3; \
- mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
- mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT); /* mm6=( - 4 5 6) */ \
+ samp0123 = _mm_load_si64((__m64 *)outptr##r); /* ( 0 1 2 3) */ \
+ samp4567 = _mm_load_si64((__m64 *)outptr##r + 1); /* ( 4 5 6 7) */ \
\
- mm0 = _mm_or_si64(mm0, mm4); /* mm0=( 1 2 3 4) */ \
- mm5 = _mm_or_si64(mm5, mm6); /* mm5=( 3 4 5 6) */ \
+ samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT); /* ( 1 2 3 -) */ \
+ sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 4) */ \
+ samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 3 - - -) */ \
+ sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT); /* ( - 4 5 6) */ \
\
- mm1 = mm7; \
- mm2 = mm3; \
- mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT); /* mm1=( - 0 1 2) */ \
- mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT); /* mm2=( 5 6 7 -) */ \
- mm4 = mm3; \
- mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
+ samp1234 = _mm_or_si64(samp123X, sampXXX4); /* ( 1 2 3 4) */ \
+ samp3456 = _mm_or_si64(samp3XXX, sampX456); /* ( 3 4 5 6) */ \
\
- mm1 = _mm_or_si64(mm1, wk[r]); /* mm1=(-1 0 1 2) */ \
- mm2 = _mm_or_si64(mm2, wk[r + 2]); /* mm2=( 5 6 7 8) */ \
+ sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT); /* ( - 0 1 2) */ \
+ samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT); /* ( 5 6 7 -) */ \
+ samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( 7 - - -) */ \
\
- wk[r] = mm4; \
+ samp_1012 = _mm_or_si64(sampX012, wk[r]); /* (-1 0 1 2) */ \
+ samp5678 = _mm_or_si64(samp567X, wk[r + 2]); /* ( 5 6 7 8) */ \
\
- mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
- mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
- mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
- mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
- mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
- mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
+ wk[r] = samp7XXX; \
\
- mm1 = _mm_add_pi16(mm1, mm7); \
- mm5 = _mm_add_pi16(mm5, mm3); \
- mm1 = _mm_srli_pi16(mm1, 4); /* mm1=OutrLE=( 0 2 4 6) */ \
- mm5 = _mm_srli_pi16(mm5, 4); /* mm5=OutrHE=( 8 10 12 14) */ \
- mm0 = _mm_add_pi16(mm0, mm7); \
- mm2 = _mm_add_pi16(mm2, mm3); \
- mm0 = _mm_srli_pi16(mm0, 4); /* mm0=OutrLO=( 1 3 5 7) */ \
- mm2 = _mm_srli_pi16(mm2, 4); /* mm2=OutrHO=( 9 11 13 15) */ \
+ samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+ samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+ samp_1012 = _mm_add_pi16(samp_1012, PW_EIGHT); \
+ samp3456 = _mm_add_pi16(samp3456, PW_EIGHT); \
+ samp1234 = _mm_add_pi16(samp1234, PW_SEVEN); \
+ samp5678 = _mm_add_pi16(samp5678, PW_SEVEN); \
\
- mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
- mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
- mm1 = _mm_or_si64(mm1, mm0); /* mm1=OutrL=( 0 1 2 3 4 5 6 7) */ \
- mm5 = _mm_or_si64(mm5, mm2); /* mm5=OutrH=( 8 9 10 11 12 13 14 15) */ \
+ outle = _mm_add_pi16(samp_1012, samp0123); \
+ outhe = _mm_add_pi16(samp3456, samp4567); \
+ outle = _mm_srli_pi16(outle, 4); /* ( 0 2 4 6) */ \
+ outhe = _mm_srli_pi16(outhe, 4); /* ( 8 10 12 14) */ \
+ outlo = _mm_add_pi16(samp1234, samp0123); \
+ outho = _mm_add_pi16(samp5678, samp4567); \
+ outlo = _mm_srli_pi16(outlo, 4); /* ( 1 3 5 7) */ \
+ outho = _mm_srli_pi16(outho, 4); /* ( 9 11 13 15) */ \
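+ /* Even outputs = (previous + 3 * this + 8) >> 4 and odd outputs = */ \
+ /* (next + 3 * this + 7) >> 4, i.e. the triangle-filter weights used by */ \
+ /* the scalar fancy upsampler. */ \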
\
- _mm_store_si64((__m64 *)outptr##r, mm1); \
- _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
+ outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+ outho = _mm_slli_pi16(outho, BYTE_BIT); \
+ outl = _mm_or_si64(outle, outlo); /* ( 0 1 2 3 4 5 6 7) */ \
+ outh = _mm_or_si64(outhe, outho); /* ( 8 9 10 11 12 13 14 15) */ \
+ \
+ _mm_store_si64((__m64 *)outptr##r, outl); \
+ _mm_store_si64((__m64 *)outptr##r + 1, outh); \
}
void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
JSAMPARRAY output_data = *output_data_ptr;
JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
int inrow, outrow, incol, tmp, tmp1;
- __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
- __m64 wk[4], mm_tmp;
+ __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+ __m64 this0l, this0h, this0;
+ __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+ __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+ __m64 next0l, next0h, next0;
+ __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+ __m64 zero = 0.0, mask0 = 0.0, masklast, wk[4];
+
+ mask0 = _mm_cmpeq_pi8(mask0, mask0);
+ masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+ mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
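+ /* mask0 keeps only the first 16-bit sample of a block (used to replicate */
+ /* the left edge into wk[0]/wk[1]); masklast keeps only the last sample */
+ /* (used to replicate the right edge in the final column block). */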
for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
if (downsampled_width & 7) {
tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
- tmp1 = downsampled_width * sizeof(JSAMPLE);
+ tmp1 = downsampled_width * sizeof(JSAMPLE);
asm("daddu $8, %3, %6\r\n"
"lb $9, ($8)\r\n"
"daddu $8, %3, %7\r\n"
}
/* process the first column block */
- mm0 = _mm_load_si64((__m64 *)inptr0); /* mm0 = row[ 0][0] */
- mm1 = _mm_load_si64((__m64 *)inptr_1); /* mm1 = row[-1][0] */
- mm2 = _mm_load_si64((__m64 *)inptr1); /* mm2 = row[ 1][0] */
-
- mm3 = _mm_xor_si64(mm3, mm3); /* mm3 = (all 0's) */
- mm4 = mm0;
- mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][0]( 0 1 2 3) */
- mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][0]( 4 5 6 7) */
- mm5 = mm1;
- mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][0]( 0 1 2 3) */
- mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][0]( 4 5 6 7) */
- mm6 = mm2;
- mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][0]( 0 1 2 3) */
- mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][0]( 4 5 6 7) */
-
- mm0 = _mm_mullo_pi16(mm0, PW_THREE);
- mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
- mm7 = _mm_cmpeq_pi8(mm7, mm7);
- mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-
- mm1 = _mm_add_pi16(mm1, mm0); /* mm1=Int0L=( 0 1 2 3) */
- mm5 = _mm_add_pi16(mm5, mm4); /* mm5=Int0H=( 4 5 6 7) */
- mm2 = _mm_add_pi16(mm2, mm0); /* mm2=Int1L=( 0 1 2 3) */
- mm6 = _mm_add_pi16(mm6, mm4); /* mm6=Int1H=( 4 5 6 7) */
-
- _mm_store_si64((__m64 *)outptr0, mm1); /* temporarily save */
- _mm_store_si64((__m64 *)outptr0 + 1, mm5); /* the intermediate data */
- _mm_store_si64((__m64 *)outptr1, mm2);
- _mm_store_si64((__m64 *)outptr1 + 1, mm6);
-
- mm1 = _mm_and_si64(mm1, mm7); /* mm1=( 0 - - -) */
- mm2 = _mm_and_si64(mm2, mm7); /* mm2=( 0 - - -) */
-
- wk[0] = mm1;
- wk[1] = mm2;
+ this0 = _mm_load_si64((__m64 *)inptr0); /* row[ 0][0] */
+ this_1 = _mm_load_si64((__m64 *)inptr_1); /* row[-1][0] */
+ this1 = _mm_load_si64((__m64 *)inptr1); /* row[ 1][0] */
+
+ this0l = _mm_unpacklo_pi8(this0, zero); /* row[ 0][0]( 0 1 2 3) */
+ this0h = _mm_unpackhi_pi8(this0, zero); /* row[ 0][0]( 4 5 6 7) */
+ this_1l = _mm_unpacklo_pi8(this_1, zero); /* row[-1][0]( 0 1 2 3) */
+ this_1h = _mm_unpackhi_pi8(this_1, zero); /* row[-1][0]( 4 5 6 7) */
+ this1l = _mm_unpacklo_pi8(this1, zero); /* row[+1][0]( 0 1 2 3) */
+ this1h = _mm_unpackhi_pi8(this1, zero); /* row[+1][0]( 4 5 6 7) */
+
+ this0l = _mm_mullo_pi16(this0l, PW_THREE);
+ this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+ thiscolsum_1l = _mm_add_pi16(this_1l, this0l); /* ( 0 1 2 3) */
+ thiscolsum_1h = _mm_add_pi16(this_1h, this0h); /* ( 4 5 6 7) */
+ thiscolsum1l = _mm_add_pi16(this0l, this1l); /* ( 0 1 2 3) */
+ thiscolsum1h = _mm_add_pi16(this0h, this1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+ _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+ wk[0] = _mm_and_si64(thiscolsum_1l, mask0); /* ( 0 - - -) */
+ wk[1] = _mm_and_si64(thiscolsum1l, mask0); /* ( 0 - - -) */
for (incol = downsampled_width; incol > 0;
incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
if (incol > 8) {
/* process the next column block */
- mm0 = _mm_load_si64((__m64 *)inptr0 + 1); /* mm0 = row[ 0][1] */
- mm1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* mm1 = row[-1][1] */
- mm2 = _mm_load_si64((__m64 *)inptr1 + 1); /* mm2 = row[+1][1] */
-
- mm3 = _mm_setzero_si64(); /* mm3 = (all 0's) */
- mm4 = mm0;
- mm0 = _mm_unpacklo_pi8(mm0, mm3); /* mm0 = row[ 0][1]( 0 1 2 3) */
- mm4 = _mm_unpackhi_pi8(mm4, mm3); /* mm4 = row[ 0][1]( 4 5 6 7) */
- mm5 = mm1;
- mm1 = _mm_unpacklo_pi8(mm1, mm3); /* mm1 = row[-1][1]( 0 1 2 3) */
- mm5 = _mm_unpackhi_pi8(mm5, mm3); /* mm5 = row[-1][1]( 4 5 6 7) */
- mm6 = mm2;
- mm2 = _mm_unpacklo_pi8(mm2, mm3); /* mm2 = row[+1][1]( 0 1 2 3) */
- mm6 = _mm_unpackhi_pi8(mm6, mm3); /* mm6 = row[+1][1]( 4 5 6 7) */
-
- mm0 = _mm_mullo_pi16(mm0, PW_THREE);
- mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
- mm1 = _mm_add_pi16(mm1, mm0); /* mm1 = Int0L = ( 0 1 2 3) */
- mm5 = _mm_add_pi16(mm5, mm4); /* mm5 = Int0H = ( 4 5 6 7) */
- mm2 = _mm_add_pi16(mm2, mm0); /* mm2 = Int1L = ( 0 1 2 3) */
- mm6 = _mm_add_pi16(mm6, mm4); /* mm6 = Int1H = ( 4 5 6 7) */
-
- _mm_store_si64((__m64 *)outptr0 + 2, mm1); /* temporarily save */
- _mm_store_si64((__m64 *)outptr0 + 3, mm5); /* the intermediate data */
- _mm_store_si64((__m64 *)outptr1 + 2, mm2);
- _mm_store_si64((__m64 *)outptr1 + 3, mm6);
-
- mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */
- mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */
-
- wk[2] = mm1;
- wk[3] = mm2;
+ next0 = _mm_load_si64((__m64 *)inptr0 + 1); /* row[ 0][1] */
+ next_1 = _mm_load_si64((__m64 *)inptr_1 + 1); /* row[-1][1] */
+ next1 = _mm_load_si64((__m64 *)inptr1 + 1); /* row[+1][1] */
+
+ next0l = _mm_unpacklo_pi8(next0, zero); /* row[ 0][1]( 0 1 2 3) */
+ next0h = _mm_unpackhi_pi8(next0, zero); /* row[ 0][1]( 4 5 6 7) */
+ next_1l = _mm_unpacklo_pi8(next_1, zero); /* row[-1][1]( 0 1 2 3) */
+ next_1h = _mm_unpackhi_pi8(next_1, zero); /* row[-1][1]( 4 5 6 7) */
+ next1l = _mm_unpacklo_pi8(next1, zero); /* row[+1][1]( 0 1 2 3) */
+ next1h = _mm_unpackhi_pi8(next1, zero); /* row[+1][1]( 4 5 6 7) */
+
+ next0l = _mm_mullo_pi16(next0l, PW_THREE);
+ next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+ nextcolsum_1l = _mm_add_pi16(next_1l, next0l); /* ( 0 1 2 3) */
+ nextcolsum_1h = _mm_add_pi16(next_1h, next0h); /* ( 4 5 6 7) */
+ nextcolsum1l = _mm_add_pi16(next0l, next1l); /* ( 0 1 2 3) */
+ nextcolsum1h = _mm_add_pi16(next0h, next1h); /* ( 4 5 6 7) */
+
+ /* temporarily save the intermediate data */
+ _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+ _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+ _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+ _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+ wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
+ wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* ( - - - 0) */
} else {
- /* process the last column block */
- mm1 = _mm_cmpeq_pi8(mm1, mm1);
- mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
- mm2 = mm1;
+ __m64 tmp;
- mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
- mm1 = _mm_and_si64(mm1, mm_tmp); /* mm1=( - - - 7) */
- mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
- mm2 = _mm_and_si64(mm2, mm_tmp); /* mm2=( - - - 7) */
-
- wk[2] = mm1;
- wk[3] = mm2;
+ /* process the last column block */
+ tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+ wk[2] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */
+ tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+ wk[3] = _mm_and_si64(masklast, tmp); /* ( - - - 7) */
}
/* process the upper row */
* Authors: ZhuChen <zhuchen@loongson.cn>
* CaiWanwei <caiwanwei@loongson.cn>
* SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- * Copyright (C) 2018, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2018-2019, D. R. Commander. All Rights Reserved.
*
* Based on the x86 SIMD extension for IJG JPEG library
* Copyright (C) 1999-2006, MIYASAKA Masaru.
#define DO_QUANT() { \
- mm2 = _mm_load_si64((__m64 *)&workspace[0]); \
- mm3 = _mm_load_si64((__m64 *)&workspace[4]); \
+ __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+ __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
\
- mm0 = mm2; \
- mm1 = mm3; \
+ rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+ rowh = _mm_load_si64((__m64 *)&workspace[4]); \
\
- mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1)); /* -1 if value < 0, */ \
- /* 0 otherwise */ \
- mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \
+ /* Branch-less absolute value */ \
+ rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1)); /* -1 if value < 0, */ \
+ /* 0 otherwise */ \
+ rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
\
- mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \
- mm1 = _mm_xor_si64(mm1, mm3); \
- mm0 = _mm_sub_pi16(mm0, mm2); \
- mm1 = _mm_sub_pi16(mm1, mm3); \
+ rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
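+ /* rowl/rowh now hold the absolute values; rowls/rowhs remember the signs */ \
+ /* so they can be re-applied after quantization. */ \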
\
- corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \
- corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+ corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]); /* correction */ \
+ corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
\
- mm0 = _mm_add_pi16(mm0, corr0); /* correction + roundfactor */ \
- mm1 = _mm_add_pi16(mm1, corr1); \
+ rowlsave = rowl = _mm_add_pi16(rowl, corrl); /* correction + roundfactor */ \
+ rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
\
- mm4 = mm0; \
- mm5 = mm1; \
+ recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \
+ reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
\
- recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]); /* reciprocal */ \
- recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+ rowl = _mm_mulhi_pi16(rowl, recipl); \
+ rowh = _mm_mulhi_pi16(rowh, reciph); \
\
- mm0 = _mm_mulhi_pi16(mm0, recip0); \
- mm1 = _mm_mulhi_pi16(mm1, recip1); \
+ /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+ /* initial value (input value is never negative as we inverted it at the */ \
+ /* start of this routine) */ \
+ rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
\
- mm0 = _mm_add_pi16(mm0, mm4); /* reciprocal is always negative */ \
- mm1 = _mm_add_pi16(mm1, mm5); /* (MSB=1), so we always need to add the */ \
- /* initial value (input value is never */ \
- /* negative as we inverted it at the */ \
- /* start of this routine) */ \
+ scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \
+ scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
\
- scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]); /* scale */ \
- scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+ rowl = _mm_mulhi_pi16(rowl, scalel); \
+ rowh = _mm_mulhi_pi16(rowh, scaleh); \
\
- mm6 = scale0; \
- mm7 = scale1; \
- mm4 = mm0; \
- mm5 = mm1; \
+ /* determine if scale is negative */ \
+ scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+ scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
\
- mm0 = _mm_mulhi_pi16(mm0, mm6); \
- mm1 = _mm_mulhi_pi16(mm1, mm7); \
+ /* and add input if it is */ \
+ scalel = _mm_and_si64(scalel, rowlsave); \
+ scaleh = _mm_and_si64(scaleh, rowhsave); \
+ rowl = _mm_add_pi16(rowl, scalel); \
+ rowh = _mm_add_pi16(rowh, scaleh); \
\
- mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1)); /* determine if scale... */ \
- /* is negative */ \
- mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \
+ /* then check if negative input */ \
+ rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+ rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
\
- mm6 = _mm_and_si64(mm6, mm4); /* and add input if it is */ \
- mm7 = _mm_and_si64(mm7, mm5); \
- mm0 = _mm_add_pi16(mm0, mm6); \
- mm1 = _mm_add_pi16(mm1, mm7); \
+ /* and add scale if it is */ \
+ rowlsave = _mm_and_si64(rowlsave, scalel); \
+ rowhsave = _mm_and_si64(rowhsave, scaleh); \
+ rowl = _mm_add_pi16(rowl, rowlsave); \
+ rowh = _mm_add_pi16(rowh, rowhsave); \
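+ /* The two conditional adds above turn the signed _mm_mulhi_pi16() into an */ \
+ /* unsigned 16x16 -> high-16 multiply: whenever an operand's sign bit is */ \
+ /* set, the other operand is added to the high half of the product. */ \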
\
- mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1)); /* then check if... */ \
- mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1)); /* negative input */ \
+ rowl = _mm_xor_si64(rowl, rowls); /* val = -val */ \
+ rowh = _mm_xor_si64(rowh, rowhs); \
+ rowl = _mm_sub_pi16(rowl, rowls); \
+ rowh = _mm_sub_pi16(rowh, rowhs); \
\
- mm4 = _mm_and_si64(mm4, scale0); /* and add scale if it is */ \
- mm5 = _mm_and_si64(mm5, scale1); \
- mm0 = _mm_add_pi16(mm0, mm4); \
- mm1 = _mm_add_pi16(mm1, mm5); \
- \
- mm0 = _mm_xor_si64(mm0, mm2); /* val = -val */ \
- mm1 = _mm_xor_si64(mm1, mm3); \
- mm0 = _mm_sub_pi16(mm0, mm2); \
- mm1 = _mm_sub_pi16(mm1, mm3); \
- \
- _mm_store_si64((__m64 *)&output_ptr[0], mm0); \
- _mm_store_si64((__m64 *)&output_ptr[4], mm1); \
+ _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+ _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
\
workspace += DCTSIZE; \
divisors += DCTSIZE; \
DCTELEM *workspace)
{
JCOEFPTR output_ptr = coef_block;
- __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
- __m64 corr0, corr1, recip0, recip1, scale0, scale1;
DO_QUANT()
DO_QUANT()