granicus.if.org Git - libjpeg-turbo/commitdiff
Add colorspace extensions to merged upsampling routines
author DRC <dcommander@users.sourceforge.net>
Sun, 5 Apr 2009 21:51:25 +0000 (21:51 +0000)
committer DRC <dcommander@users.sourceforge.net>
Sun, 5 Apr 2009 21:51:25 +0000 (21:51 +0000)
git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@42 632fc199-4ca6-4c93-a231-07263d6284db

jdmaster.c
jsimd.c
simd/jdmermmx.asm
simd/jdmerss2.asm
simd/jdmrgmmx.asm [new file with mode: 0644]
simd/jdmrgss2.asm [new file with mode: 0644]
simd/jsimd.h

diff --git a/jdmaster.c b/jdmaster.c
index 2802c5b7b29757e27b561ccccedba169deb9b42c..38db792fb48c0901383e1b72d1f21e7343878e2c 100644 (file)
@@ -49,8 +49,14 @@ use_merged_upsample (j_decompress_ptr cinfo)
     return FALSE;
   /* jdmerge.c only supports YCC=>RGB color conversion */
   if (cinfo->jpeg_color_space != JCS_YCbCr || cinfo->num_components != 3 ||
-      cinfo->out_color_space != JCS_RGB ||
-      cinfo->out_color_components != RGB_PIXELSIZE)
+      (cinfo->out_color_space != JCS_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGB &&
+      cinfo->out_color_space != JCS_EXT_RGBX &&
+      cinfo->out_color_space != JCS_EXT_BGR &&
+      cinfo->out_color_space != JCS_EXT_BGRX &&
+      cinfo->out_color_space != JCS_EXT_XBGR &&
+      cinfo->out_color_space != JCS_EXT_XRGB) ||
+      cinfo->out_color_components != rgb_pixelsize[cinfo->out_color_space])
     return FALSE;
   /* and it only handles 2h1v or 2h2v sampling ratios */
   if (cinfo->comp_info[0].h_samp_factor != 2 ||
@@ -175,10 +181,14 @@ jpeg_calc_output_dimensions (j_decompress_ptr cinfo)
     cinfo->out_color_components = 1;
     break;
   case JCS_RGB:
-#if RGB_PIXELSIZE != 3
-    cinfo->out_color_components = RGB_PIXELSIZE;
+  case JCS_EXT_RGB:
+  case JCS_EXT_RGBX:
+  case JCS_EXT_BGR:
+  case JCS_EXT_BGRX:
+  case JCS_EXT_XBGR:
+  case JCS_EXT_XRGB:
+    cinfo->out_color_components = rgb_pixelsize[cinfo->out_color_space];
     break;
-#endif /* else share code with YCbCr */
   case JCS_YCbCr:
     cinfo->out_color_components = 3;
     break;
diff --git a/jsimd.c b/jsimd.c
index 2c49a6bde1b863f79ed0f4a8ed4c5888163a3e36..861309ab71d5f8bfd8d4e5ae4d7da1ce99139dac 100644 (file)
--- a/jsimd.c
+++ b/jsimd.c
@@ -461,12 +461,45 @@ jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
                             JSAMPARRAY output_buf)
 {
 #ifdef WITH_SIMD
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v2_extrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v2_extrgbx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v2_extbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v2_extbgrx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v2_extxbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v2_extxrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_extxrgb_merged_upsample_mmx;
+      break;
+    default:
+      sse2fct=jsimd_h2v2_merged_upsample_sse2;
+      mmxfct=jsimd_h2v2_merged_upsample_mmx;
+      break;
+  }
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    jsimd_h2v2_merged_upsample_sse2(cinfo->output_width, input_buf,
+    sse2fct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
   else if (simd_support & JSIMD_MMX)
-    jsimd_h2v2_merged_upsample_mmx(cinfo->output_width, input_buf,
+    mmxfct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
 #endif
 }
@@ -478,12 +511,45 @@ jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
                             JSAMPARRAY output_buf)
 {
 #ifdef WITH_SIMD
+  void (*sse2fct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  void (*mmxfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+  switch(cinfo->out_color_space)
+  {
+    case JCS_EXT_RGB:
+      sse2fct=jsimd_h2v1_extrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extrgb_merged_upsample_mmx;
+      break;
+    case JCS_EXT_RGBX:
+      sse2fct=jsimd_h2v1_extrgbx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extrgbx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGR:
+      sse2fct=jsimd_h2v1_extbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_BGRX:
+      sse2fct=jsimd_h2v1_extbgrx_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extbgrx_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XBGR:
+      sse2fct=jsimd_h2v1_extxbgr_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extxbgr_merged_upsample_mmx;
+      break;
+    case JCS_EXT_XRGB:
+      sse2fct=jsimd_h2v1_extxrgb_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_extxrgb_merged_upsample_mmx;
+      break;
+    default:
+      sse2fct=jsimd_h2v1_merged_upsample_sse2;
+      mmxfct=jsimd_h2v1_merged_upsample_mmx;
+      break;
+  }
   if ((simd_support & JSIMD_SSE2) &&
       IS_ALIGNED_SSE(jconst_merged_upsample_sse2))
-    jsimd_h2v1_merged_upsample_sse2(cinfo->output_width, input_buf,
+    sse2fct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
   else if (simd_support & JSIMD_MMX)
-    jsimd_h2v1_merged_upsample_mmx(cinfo->output_width, input_buf,
+    mmxfct(cinfo->output_width, input_buf,
         in_row_group_ctr, output_buf);
 #endif
 }
diff --git a/simd/jdmermmx.asm b/simd/jdmermmx.asm
index 8ebe243dd2068f81ecbc332c5fa0b4fc6ffb110e..fd587fbc1247d8504b07313d7486e4d2e04e6286 100644 (file)
@@ -2,6 +2,7 @@
 ; jdmermmx.asm - merged upsampling/color conversion (MMX)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -17,7 +18,6 @@
 ; [TAB8]
 
 %include "jsimdext.inc"
-%include "jcolsamp.inc"
 
 ; --------------------------------------------------------------------------
 
@@ -48,445 +48,76 @@ PD_ONEHALF times 2 dd  1 << (SCALEBITS-1)
        alignz  16
 
 ; --------------------------------------------------------------------------
-       SECTION SEG_TEXT
-       BITS    32
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b)        (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
-
-%define original_ebp   ebp+0
-%define wk(i)          ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
-%define WK_NUM         3
-%define gotptr         wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-       align   16
-       global  EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-EXTN(jsimd_h2v1_merged_upsample_mmx):
-       push    ebp
-       mov     eax,esp                         ; eax = original ebp
-       sub     esp, byte 4
-       and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
-       mov     [esp],eax
-       mov     ebp,esp                         ; ebp = aligned ebp
-       lea     esp, [wk(0)]
-       pushpic eax             ; make a room for GOT address
-       push    ebx
-;      push    ecx             ; need not be preserved
-;      push    edx             ; need not be preserved
-       push    esi
-       push    edi
-
-       get_GOT ebx                     ; get GOT address
-       movpic  POINTER [gotptr], ebx   ; save GOT address
-
-       mov     ecx, JDIMENSION [output_width(eax)]     ; col
-       test    ecx,ecx
-       jz      near .return
-
-       push    ecx
-
-       mov     edi, JSAMPIMAGE [input_buf(eax)]
-       mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
-       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-       mov     edi, JSAMPARRAY [output_buf(eax)]
-       mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
-       mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
-       mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
-       mov     edi, JSAMPROW [edi]                             ; outptr
-
-       pop     ecx                     ; col
-
-       alignx  16,7
-.columnloop:
-       movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-       movq      mm6, MMWORD [ebx]     ; mm6=Cb(01234567)
-       movq      mm7, MMWORD [edx]     ; mm7=Cr(01234567)
-
-       pxor      mm1,mm1               ; mm1=(all 0's)
-       pcmpeqw   mm3,mm3
-       psllw     mm3,7                 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
-
-       movq      mm4,mm6
-       punpckhbw mm6,mm1               ; mm6=Cb(4567)=CbH
-       punpcklbw mm4,mm1               ; mm4=Cb(0123)=CbL
-       movq      mm0,mm7
-       punpckhbw mm7,mm1               ; mm7=Cr(4567)=CrH
-       punpcklbw mm0,mm1               ; mm0=Cr(0123)=CrL
-
-       paddw     mm6,mm3
-       paddw     mm4,mm3
-       paddw     mm7,mm3
-       paddw     mm0,mm3
-
-       ; (Original)
-       ; R = Y                + 1.40200 * Cr
-       ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-       ; B = Y + 1.77200 * Cb
-       ;
-       ; (This implementation)
-       ; R = Y                + 0.40200 * Cr + Cr
-       ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-       ; B = Y - 0.22800 * Cb + Cb + Cb
-
-       movq    mm5,mm6                 ; mm5=CbH
-       movq    mm2,mm4                 ; mm2=CbL
-       paddw   mm6,mm6                 ; mm6=2*CbH
-       paddw   mm4,mm4                 ; mm4=2*CbL
-       movq    mm1,mm7                 ; mm1=CrH
-       movq    mm3,mm0                 ; mm3=CrL
-       paddw   mm7,mm7                 ; mm7=2*CrH
-       paddw   mm0,mm0                 ; mm0=2*CrL
-
-       pmulhw  mm6,[GOTOFF(eax,PW_MF0228)]     ; mm6=(2*CbH * -FIX(0.22800))
-       pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbL * -FIX(0.22800))
-       pmulhw  mm7,[GOTOFF(eax,PW_F0402)]      ; mm7=(2*CrH * FIX(0.40200))
-       pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrL * FIX(0.40200))
-
-       paddw   mm6,[GOTOFF(eax,PW_ONE)]
-       paddw   mm4,[GOTOFF(eax,PW_ONE)]
-       psraw   mm6,1                   ; mm6=(CbH * -FIX(0.22800))
-       psraw   mm4,1                   ; mm4=(CbL * -FIX(0.22800))
-       paddw   mm7,[GOTOFF(eax,PW_ONE)]
-       paddw   mm0,[GOTOFF(eax,PW_ONE)]
-       psraw   mm7,1                   ; mm7=(CrH * FIX(0.40200))
-       psraw   mm0,1                   ; mm0=(CrL * FIX(0.40200))
-
-       paddw   mm6,mm5
-       paddw   mm4,mm2
-       paddw   mm6,mm5                 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
-       paddw   mm4,mm2                 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
-       paddw   mm7,mm1                 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
-       paddw   mm0,mm3                 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
-
-       movq    MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
-       movq    MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
-
-       movq      mm6,mm5
-       movq      mm7,mm2
-       punpcklwd mm5,mm1
-       punpckhwd mm6,mm1
-       pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
-       pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
-       punpcklwd mm2,mm3
-       punpckhwd mm7,mm3
-       pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
-       pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-       paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
-       paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
-       psrad     mm5,SCALEBITS
-       psrad     mm6,SCALEBITS
-       paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
-       paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
-       psrad     mm2,SCALEBITS
-       psrad     mm7,SCALEBITS
-
-       packssdw  mm5,mm6       ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-       packssdw  mm2,mm7       ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-       psubw     mm5,mm1       ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-       psubw     mm2,mm3       ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-       movq    MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
-
-       mov     al,2                    ; Yctr
-       jmp     short .Yloop_1st
-       alignx  16,7
-
-.Yloop_2nd:
-       movq    mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
-       movq    mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
-       movq    mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
-       alignx  16,7
-
-.Yloop_1st:
-       movq    mm7, MMWORD [esi]       ; mm7=Y(01234567)
-
-       pcmpeqw mm6,mm6
-       psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}
-       pand    mm6,mm7                 ; mm6=Y(0246)=YE
-       psrlw   mm7,BYTE_BIT            ; mm7=Y(1357)=YO
-
-       movq    mm1,mm0                 ; mm1=mm0=(R-Y)(L/H)
-       movq    mm3,mm2                 ; mm3=mm2=(G-Y)(L/H)
-       movq    mm5,mm4                 ; mm5=mm4=(B-Y)(L/H)
-
-       paddw     mm0,mm6               ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
-       paddw     mm1,mm7               ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
-       packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
-       packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
-
-       paddw     mm2,mm6               ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
-       paddw     mm3,mm7               ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
-       packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
-       packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
-
-       paddw     mm4,mm6               ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
-       paddw     mm5,mm7               ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
-       packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
-       packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-       ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-       ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-       ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-       ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
-
-       punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-       punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
-       punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
-
-       movq      mmG,mmA
-       movq      mmH,mmA
-       punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
-       punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
-
-       psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
-       psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
-
-       movq      mmC,mmD
-       movq      mmB,mmD
-       punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
-       punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
-
-       psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
-
-       movq      mmF,mmE
-       punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
-       punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
-
-       punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
-       punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
-       punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
-
-       cmp     ecx, byte SIZEOF_MMWORD
-       jb      short .column_st16
-
-       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-       movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-
-       sub     ecx, byte SIZEOF_MMWORD
-       jz      short .endcolumn
-
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-       add     esi, byte SIZEOF_MMWORD                 ; inptr0
-       dec     al                      ; Yctr
-       jnz     near .Yloop_2nd
-
-       add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-       add     edx, byte SIZEOF_MMWORD                 ; inptr2
-       jmp     near .columnloop
-       alignx  16,7
-
-.column_st16:
-       lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
-       cmp     ecx, byte 2*SIZEOF_MMWORD
-       jb      short .column_st8
-       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
-       movq    mmA,mmC
-       sub     ecx, byte 2*SIZEOF_MMWORD
-       add     edi, byte 2*SIZEOF_MMWORD
-       jmp     short .column_st4
-.column_st8:
-       cmp     ecx, byte SIZEOF_MMWORD
-       jb      short .column_st4
-       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-       movq    mmA,mmE
-       sub     ecx, byte SIZEOF_MMWORD
-       add     edi, byte SIZEOF_MMWORD
-.column_st4:
-       movd    eax,mmA
-       cmp     ecx, byte SIZEOF_DWORD
-       jb      short .column_st2
-       mov     DWORD [edi+0*SIZEOF_DWORD], eax
-       psrlq   mmA,DWORD_BIT
-       movd    eax,mmA
-       sub     ecx, byte SIZEOF_DWORD
-       add     edi, byte SIZEOF_DWORD
-.column_st2:
-       cmp     ecx, byte SIZEOF_WORD
-       jb      short .column_st1
-       mov     WORD [edi+0*SIZEOF_WORD], ax
-       shr     eax,WORD_BIT
-       sub     ecx, byte SIZEOF_WORD
-       add     edi, byte SIZEOF_WORD
-.column_st1:
-       cmp     ecx, byte SIZEOF_BYTE
-       jb      short .endcolumn
-       mov     BYTE [edi+0*SIZEOF_BYTE], al
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-       pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-       pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%else
-       pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
-       pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
-%endif
-       ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
-       ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
-       ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
-       ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
-
-       punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
-       punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
-       punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
-       punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
-
-       movq      mmC,mmA
-       punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
-       punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
-       movq      mmG,mmB
-       punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
-       punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
-
-       movq      mmD,mmA
-       punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
-       punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
-       movq      mmH,mmC
-       punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
-       punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
-
-       cmp     ecx, byte SIZEOF_MMWORD
-       jb      short .column_st16
-
-       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-       movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
-       movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
-
-       sub     ecx, byte SIZEOF_MMWORD
-       jz      short .endcolumn
-
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
-       add     esi, byte SIZEOF_MMWORD                 ; inptr0
-       dec     al                      ; Yctr
-       jnz     near .Yloop_2nd
-
-       add     ebx, byte SIZEOF_MMWORD                 ; inptr1
-       add     edx, byte SIZEOF_MMWORD                 ; inptr2
-       jmp     near .columnloop
-       alignx  16,7
-
-.column_st16:
-       cmp     ecx, byte SIZEOF_MMWORD/2
-       jb      short .column_st8
-       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
-       movq    mmA,mmC
-       movq    mmD,mmH
-       sub     ecx, byte SIZEOF_MMWORD/2
-       add     edi, byte 2*SIZEOF_MMWORD
-.column_st8:
-       cmp     ecx, byte SIZEOF_MMWORD/4
-       jb      short .column_st4
-       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
-       movq    mmA,mmD
-       sub     ecx, byte SIZEOF_MMWORD/4
-       add     edi, byte 1*SIZEOF_MMWORD
-.column_st4:
-       cmp     ecx, byte SIZEOF_MMWORD/8
-       jb      short .endcolumn
-       movd    DWORD [edi+0*SIZEOF_DWORD], mmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-       emms            ; empty MMX state
-
-.return:
-       pop     edi
-       pop     esi
-;      pop     edx             ; need not be preserved
-;      pop     ecx             ; need not be preserved
-       pop     ebx
-       mov     esp,ebp         ; esp <- aligned ebp
-       pop     esp             ; esp <- original ebp
-       pop     ebp
-       ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
-;                                 JSAMPIMAGE input_buf,
-;                                 JDIMENSION in_row_group_ctr,
-;                                 JSAMPARRAY output_buf);
-;
-
-%define output_width(b)        (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
-
-       align   16
-       global  EXTN(jsimd_h2v2_merged_upsample_mmx)
-
-EXTN(jsimd_h2v2_merged_upsample_mmx):
-       push    ebp
-       mov     ebp,esp
-       push    ebx
-;      push    ecx             ; need not be preserved
-;      push    edx             ; need not be preserved
-       push    esi
-       push    edi
-
-       mov     eax, JDIMENSION [output_width(ebp)]
-
-       mov     edi, JSAMPIMAGE [input_buf(ebp)]
-       mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
-       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-       mov     edi, JSAMPARRAY [output_buf(ebp)]
-       lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-       push    edx                     ; inptr2
-       push    ebx                     ; inptr1
-       push    esi                     ; inptr00
-       mov     ebx,esp
-
-       push    edi                     ; output_buf (outptr0)
-       push    ecx                     ; in_row_group_ctr
-       push    ebx                     ; input_buf
-       push    eax                     ; output_width
-
-       call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-       add     esi, byte SIZEOF_JSAMPROW       ; inptr01
-       add     edi, byte SIZEOF_JSAMPROW       ; outptr1
-       mov     POINTER [ebx+0*SIZEOF_POINTER], esi
-       mov     POINTER [ebx-1*SIZEOF_POINTER], edi
-
-       call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
-
-       add     esp, byte 7*SIZEOF_DWORD
-
-       pop     edi
-       pop     esi
-;      pop     edx             ; need not be preserved
-;      pop     ecx             ; need not be preserved
-       pop     ebx
-       pop     ebp
-       ret
-
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extrgbx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extrgbx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extbgrx_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extbgrx_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxbgr_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxbgr_merged_upsample_mmx
+%include "jdmrgmmx.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4
+%define jsimd_h2v1_merged_upsample_mmx jsimd_h2v1_extxrgb_merged_upsample_mmx
+%define jsimd_h2v2_merged_upsample_mmx jsimd_h2v2_extxrgb_merged_upsample_mmx
+%include "jdmrgmmx.asm"
diff --git a/simd/jdmerss2.asm b/simd/jdmerss2.asm
index 8a7c68f02b35b0547daa533057e3fefc7f69fa23..2294e0d3ef3150cfbc8f323cfeaa0c4d9e96361d 100644 (file)
@@ -2,6 +2,7 @@
 ; jdmerss2.asm - merged upsampling/color conversion (SSE2)
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009 D. R. Commander
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -17,7 +18,6 @@
 ; [TAB8]
 
 %include "jsimdext.inc"
-%include "jcolsamp.inc"
 
 ; --------------------------------------------------------------------------
 
@@ -48,543 +48,76 @@ PD_ONEHALF times 4 dd  1 << (SCALEBITS-1)
        alignz  16
 
 ; --------------------------------------------------------------------------
-       SECTION SEG_TEXT
-       BITS    32
-;
-; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b)        (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
-
-%define original_ebp   ebp+0
-%define wk(i)          ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
-%define WK_NUM         3
-%define gotptr         wk(0)-SIZEOF_POINTER    ; void * gotptr
-
-       align   16
-       global  EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-EXTN(jsimd_h2v1_merged_upsample_sse2):
-       push    ebp
-       mov     eax,esp                         ; eax = original ebp
-       sub     esp, byte 4
-       and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
-       mov     [esp],eax
-       mov     ebp,esp                         ; ebp = aligned ebp
-       lea     esp, [wk(0)]
-       pushpic eax             ; make a room for GOT address
-       push    ebx
-;      push    ecx             ; need not be preserved
-;      push    edx             ; need not be preserved
-       push    esi
-       push    edi
-
-       get_GOT ebx                     ; get GOT address
-       movpic  POINTER [gotptr], ebx   ; save GOT address
-
-       mov     ecx, JDIMENSION [output_width(eax)]     ; col
-       test    ecx,ecx
-       jz      near .return
-
-       push    ecx
-
-       mov     edi, JSAMPIMAGE [input_buf(eax)]
-       mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
-       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-       mov     edi, JSAMPARRAY [output_buf(eax)]
-       mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
-       mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
-       mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
-       mov     edi, JSAMPROW [edi]                             ; outptr
-
-       pop     ecx                     ; col
-
-       alignx  16,7
-.columnloop:
-       movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
-
-       movdqa    xmm6, XMMWORD [ebx]   ; xmm6=Cb(0123456789ABCDEF)
-       movdqa    xmm7, XMMWORD [edx]   ; xmm7=Cr(0123456789ABCDEF)
-
-       pxor      xmm1,xmm1             ; xmm1=(all 0's)
-       pcmpeqw   xmm3,xmm3
-       psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
-
-       movdqa    xmm4,xmm6
-       punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
-       punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
-       movdqa    xmm0,xmm7
-       punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
-       punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
-
-       paddw     xmm6,xmm3
-       paddw     xmm4,xmm3
-       paddw     xmm7,xmm3
-       paddw     xmm0,xmm3
-
-       ; (Original)
-       ; R = Y                + 1.40200 * Cr
-       ; G = Y - 0.34414 * Cb - 0.71414 * Cr
-       ; B = Y + 1.77200 * Cb
-       ;
-       ; (This implementation)
-       ; R = Y                + 0.40200 * Cr + Cr
-       ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-       ; B = Y - 0.22800 * Cb + Cb + Cb
-
-       movdqa  xmm5,xmm6               ; xmm5=CbH
-       movdqa  xmm2,xmm4               ; xmm2=CbL
-       paddw   xmm6,xmm6               ; xmm6=2*CbH
-       paddw   xmm4,xmm4               ; xmm4=2*CbL
-       movdqa  xmm1,xmm7               ; xmm1=CrH
-       movdqa  xmm3,xmm0               ; xmm3=CrL
-       paddw   xmm7,xmm7               ; xmm7=2*CrH
-       paddw   xmm0,xmm0               ; xmm0=2*CrL
-
-       pmulhw  xmm6,[GOTOFF(eax,PW_MF0228)]    ; xmm6=(2*CbH * -FIX(0.22800))
-       pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbL * -FIX(0.22800))
-       pmulhw  xmm7,[GOTOFF(eax,PW_F0402)]     ; xmm7=(2*CrH * FIX(0.40200))
-       pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrL * FIX(0.40200))
-
-       paddw   xmm6,[GOTOFF(eax,PW_ONE)]
-       paddw   xmm4,[GOTOFF(eax,PW_ONE)]
-       psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
-       psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
-       paddw   xmm7,[GOTOFF(eax,PW_ONE)]
-       paddw   xmm0,[GOTOFF(eax,PW_ONE)]
-       psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
-       psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
-
-       paddw   xmm6,xmm5
-       paddw   xmm4,xmm2
-       paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
-       paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
-       paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
-       paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
-
-       movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
-       movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
-
-       movdqa    xmm6,xmm5
-       movdqa    xmm7,xmm2
-       punpcklwd xmm5,xmm1
-       punpckhwd xmm6,xmm1
-       pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
-       pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
-       punpcklwd xmm2,xmm3
-       punpckhwd xmm7,xmm3
-       pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
-       pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
-
-       paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
-       paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
-       psrad     xmm5,SCALEBITS
-       psrad     xmm6,SCALEBITS
-       paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
-       paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
-       psrad     xmm2,SCALEBITS
-       psrad     xmm7,SCALEBITS
-
-       packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
-       packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
-       psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
-       psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
-
-       movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
-
-       mov     al,2                    ; Yctr
-       jmp     short .Yloop_1st
-       alignx  16,7
-
-.Yloop_2nd:
-       movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
-       movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
-       movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
-       alignx  16,7
-
-.Yloop_1st:
-       movdqa  xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
-
-       pcmpeqw xmm6,xmm6
-       psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
-       pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
-       psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
-
-       movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
-       movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
-       movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
-
-       paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
-       paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
-       packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
-       packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
-
-       paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
-       paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
-       packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
-       packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
-
-       paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
-       paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
-       packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
-       packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
-
-%if RGB_PIXELSIZE == 3 ; ---------------
-
-       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-       ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
-
-       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-       punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
-       punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
-
-       movdqa    xmmG,xmmA
-       movdqa    xmmH,xmmA
-       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
-       punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
-
-       psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
-       psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
-
-       movdqa    xmmC,xmmD
-       movdqa    xmmB,xmmD
-       punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
-       punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
-
-       psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
-
-       movdqa    xmmF,xmmE
-       punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
-       punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
-
-       pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
-       movdqa    xmmB,xmmE
-       punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
-       punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
-       punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
-
-       pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
-       movdqa    xmmB,xmmF
-       punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
-       punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
-       punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
-
-       punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
-       punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
-       punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
-
-       cmp     ecx, byte SIZEOF_XMMWORD
-       jb      short .column_st32
-
-       test    edi, SIZEOF_XMMWORD-1
-       jnz     short .out1
-       ; --(aligned)-------------------
-       movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-       movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-       movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-       jmp     short .out0
-.out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [edi], xmmF
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-.out0:
-       sub     ecx, byte SIZEOF_XMMWORD
-       jz      near .endcolumn
-
-       add     esi, byte SIZEOF_XMMWORD        ; inptr0
-       dec     al                      ; Yctr
-       jnz     near .Yloop_2nd
-
-       add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-       add     edx, byte SIZEOF_XMMWORD        ; inptr2
-       jmp     near .columnloop
-       alignx  16,7
-
-.column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
-       lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
-       cmp     ecx, byte 2*SIZEOF_XMMWORD
-       jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       movdqa  xmmA,xmmF
-       sub     ecx, byte 2*SIZEOF_XMMWORD
-       jmp     short .column_st15
-.column_st16:
-       cmp     ecx, byte SIZEOF_XMMWORD
-       jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       movdqa  xmmA,xmmD
-       sub     ecx, byte SIZEOF_XMMWORD
-.column_st15:
-       mov     eax,ecx
-       xor     ecx, byte 0x0F
-       shl     ecx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     eax,ecx
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-
-%else ; RGB_PIXELSIZE == 4 ; -----------
-
-%ifdef RGBX_FILLER_0XFF
-       pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-       pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%else
-       pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
-       pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
-%endif
-       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
-       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
-       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
-       ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
-
-       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
-       punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
-       punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
-       punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
-
-       movdqa    xmmC,xmmA
-       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
-       punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
-       movdqa    xmmG,xmmB
-       punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
-       punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
-
-       movdqa    xmmD,xmmA
-       punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
-       punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
-       movdqa    xmmH,xmmC
-       punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
-       punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
-
-       cmp     ecx, byte SIZEOF_XMMWORD
-       jb      short .column_st32
-
-       test    edi, SIZEOF_XMMWORD-1
-       jnz     short .out1
-       ; --(aligned)-------------------
-       movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
-       movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
-       movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
-       movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
-       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
-       jmp     short .out0
-.out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [edi], xmmC
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [edi], xmmH
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-.out0:
-       sub     ecx, byte SIZEOF_XMMWORD
-       jz      near .endcolumn
-
-       add     esi, byte SIZEOF_XMMWORD        ; inptr0
-       dec     al                      ; Yctr
-       jnz     near .Yloop_2nd
-
-       add     ebx, byte SIZEOF_XMMWORD        ; inptr1
-       add     edx, byte SIZEOF_XMMWORD        ; inptr2
-       jmp     near .columnloop
-       alignx  16,7
-
-.column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
-       cmp     ecx, byte SIZEOF_XMMWORD/2
-       jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       movdqa  xmmA,xmmC
-       movdqa  xmmD,xmmH
-       sub     ecx, byte SIZEOF_XMMWORD/2
-.column_st16:
-       cmp     ecx, byte SIZEOF_XMMWORD/4
-       jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-       add     edi, byte SIZEOF_XMMWORD        ; outptr
-       movdqa  xmmA,xmmD
-       sub     ecx, byte SIZEOF_XMMWORD/4
-.column_st15:
-       cmp     ecx, byte SIZEOF_XMMWORD/16
-       jb      short .endcolumn
-       mov     eax,ecx
-       xor     ecx, byte 0x03
-       inc     ecx
-       shl     ecx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     ecx,edi
-       and     ecx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     eax, [ecx+eax*4]        ; RGB_PIXELSIZE
-       cmp     eax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     ecx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-
-%endif ; RGB_PIXELSIZE ; ---------------
-
-.endcolumn:
-       sfence          ; flush the write buffer
-
-.return:
-       pop     edi
-       pop     esi
-;      pop     edx             ; need not be preserved
-;      pop     ecx             ; need not be preserved
-       pop     ebx
-       mov     esp,ebp         ; esp <- aligned ebp
-       pop     esp             ; esp <- original ebp
-       pop     ebp
-       ret
-
-; --------------------------------------------------------------------------
-;
-; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
-;
-; GLOBAL(void)
-; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
-;                                  JSAMPIMAGE input_buf,
-;                                  JDIMENSION in_row_group_ctr,
-;                                  JSAMPARRAY output_buf);
-;
-
-%define output_width(b)        (b)+8                   ; JDIMENSION output_width
-%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
-%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
-%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
-
-       align   16
-       global  EXTN(jsimd_h2v2_merged_upsample_sse2)
-
-EXTN(jsimd_h2v2_merged_upsample_sse2):
-       push    ebp
-       mov     ebp,esp
-       push    ebx
-;      push    ecx             ; need not be preserved
-;      push    edx             ; need not be preserved
-       push    esi
-       push    edi
-
-       mov     eax, POINTER [output_width(ebp)]
-
-       mov     edi, JSAMPIMAGE [input_buf(ebp)]
-       mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
-       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
-       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
-       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
-       mov     edi, JSAMPARRAY [output_buf(ebp)]
-       lea     esi, [esi+ecx*SIZEOF_JSAMPROW]
-
-       push    edx                     ; inptr2
-       push    ebx                     ; inptr1
-       push    esi                     ; inptr00
-       mov     ebx,esp
-
-       push    edi                     ; output_buf (outptr0)
-       push    ecx                     ; in_row_group_ctr
-       push    ebx                     ; input_buf
-       push    eax                     ; output_width
-
-       call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-       add     esi, byte SIZEOF_JSAMPROW       ; inptr01
-       add     edi, byte SIZEOF_JSAMPROW       ; outptr1
-       mov     POINTER [ebx+0*SIZEOF_POINTER], esi
-       mov     POINTER [ebx-1*SIZEOF_POINTER], edi
-
-       call    near EXTN(jsimd_h2v1_merged_upsample_sse2)
-
-       add     esp, byte 7*SIZEOF_DWORD
-
-       pop     edi
-       pop     esi
-;      pop     edx             ; need not be preserved
-;      pop     ecx             ; need not be preserved
-       pop     ebx
-       pop     ebp
-       ret
-
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0      ; "extrgb" instance; RGB_* are presumably byte offsets within a pixel (R,G,B here) - confirm in jcolsamp.inc
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 3        ; bytes per output pixel; presumably selects the RGB_PIXELSIZE == 3 store path in jdmrgss2.asm
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgb_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 0      ; "extrgbx" instance; RGB_* are presumably byte offsets within a pixel (R,G,B,filler) - confirm in jcolsamp.inc
+%define RGB_GREEN 1
+%define RGB_BLUE 2
+%define RGB_PIXELSIZE 4        ; bytes per output pixel; presumably selects the 4-byte (non-3) store path in jdmrgss2.asm
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extrgbx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extrgbx_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2      ; "extbgr" instance; RGB_* are presumably byte offsets within a pixel (B,G,R here) - confirm in jcolsamp.inc
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 3        ; bytes per output pixel; presumably selects the RGB_PIXELSIZE == 3 store path in jdmrgss2.asm
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgr_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 2      ; "extbgrx" instance; RGB_* are presumably byte offsets within a pixel (B,G,R,filler) - confirm in jcolsamp.inc
+%define RGB_GREEN 1
+%define RGB_BLUE 0
+%define RGB_PIXELSIZE 4        ; bytes per output pixel; presumably selects the 4-byte (non-3) store path in jdmrgss2.asm
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extbgrx_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extbgrx_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 3      ; "extxbgr" instance; RGB_* are presumably byte offsets within a pixel (filler,B,G,R) - confirm in jcolsamp.inc
+%define RGB_GREEN 2
+%define RGB_BLUE 1
+%define RGB_PIXELSIZE 4        ; bytes per output pixel; presumably selects the 4-byte (non-3) store path in jdmrgss2.asm
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxbgr_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxbgr_merged_upsample_sse2
+%include "jdmrgss2.asm"
+
+%undef RGB_RED
+%undef RGB_GREEN
+%undef RGB_BLUE
+%undef RGB_PIXELSIZE
+%define RGB_RED 1      ; "extxrgb" instance; RGB_* are presumably byte offsets within a pixel (filler,R,G,B) - confirm in jcolsamp.inc
+%define RGB_GREEN 2
+%define RGB_BLUE 3
+%define RGB_PIXELSIZE 4        ; bytes per output pixel; presumably selects the 4-byte (non-3) store path in jdmrgss2.asm
+%define jsimd_h2v1_merged_upsample_sse2 jsimd_h2v1_extxrgb_merged_upsample_sse2
+%define jsimd_h2v2_merged_upsample_sse2 jsimd_h2v2_extxrgb_merged_upsample_sse2
+%include "jdmrgss2.asm"
diff --git a/simd/jdmrgmmx.asm b/simd/jdmrgmmx.asm
new file mode 100644 (file)
index 0000000..08b0985
--- /dev/null
@@ -0,0 +1,463 @@
+;
+; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+
+%define output_width(b)        (b)+8                   ; JDIMENSION output_width
+%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
+
+%define original_ebp   ebp+0
+%define wk(i)          ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
+%define WK_NUM         3
+%define gotptr         wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+       align   16
+       global  EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+EXTN(jsimd_h2v1_merged_upsample_mmx):
+       push    ebp
+       mov     eax,esp                         ; eax = original ebp
+       sub     esp, byte 4
+       and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
+       mov     [esp],eax
+       mov     ebp,esp                         ; ebp = aligned ebp
+       lea     esp, [wk(0)]
+       pushpic eax             ; make a room for GOT address
+       push    ebx
+;      push    ecx             ; need not be preserved
+;      push    edx             ; need not be preserved
+       push    esi
+       push    edi
+
+       get_GOT ebx                     ; get GOT address
+       movpic  POINTER [gotptr], ebx   ; save GOT address
+
+       mov     ecx, JDIMENSION [output_width(eax)]     ; col
+       test    ecx,ecx
+       jz      near .return
+
+       push    ecx
+
+       mov     edi, JSAMPIMAGE [input_buf(eax)]
+       mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
+       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+       mov     edi, JSAMPARRAY [output_buf(eax)]
+       mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
+       mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
+       mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
+       mov     edi, JSAMPROW [edi]                             ; outptr
+
+       pop     ecx                     ; col
+
+       alignx  16,7
+.columnloop:
+       movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+       movq      mm6, MMWORD [ebx]     ; mm6=Cb(01234567)
+       movq      mm7, MMWORD [edx]     ; mm7=Cr(01234567)
+
+       pxor      mm1,mm1               ; mm1=(all 0's)
+       pcmpeqw   mm3,mm3
+       psllw     mm3,7                 ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
+
+       movq      mm4,mm6
+       punpckhbw mm6,mm1               ; mm6=Cb(4567)=CbH
+       punpcklbw mm4,mm1               ; mm4=Cb(0123)=CbL
+       movq      mm0,mm7
+       punpckhbw mm7,mm1               ; mm7=Cr(4567)=CrH
+       punpcklbw mm0,mm1               ; mm0=Cr(0123)=CrL
+
+       paddw     mm6,mm3
+       paddw     mm4,mm3
+       paddw     mm7,mm3
+       paddw     mm0,mm3
+
+       ; (Original)
+       ; R = Y                + 1.40200 * Cr
+       ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+       ; B = Y + 1.77200 * Cb
+       ;
+       ; (This implementation)
+       ; R = Y                + 0.40200 * Cr + Cr
+       ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       ; B = Y - 0.22800 * Cb + Cb + Cb
+
+       movq    mm5,mm6                 ; mm5=CbH
+       movq    mm2,mm4                 ; mm2=CbL
+       paddw   mm6,mm6                 ; mm6=2*CbH
+       paddw   mm4,mm4                 ; mm4=2*CbL
+       movq    mm1,mm7                 ; mm1=CrH
+       movq    mm3,mm0                 ; mm3=CrL
+       paddw   mm7,mm7                 ; mm7=2*CrH
+       paddw   mm0,mm0                 ; mm0=2*CrL
+
+       pmulhw  mm6,[GOTOFF(eax,PW_MF0228)]     ; mm6=(2*CbH * -FIX(0.22800))
+       pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbL * -FIX(0.22800))
+       pmulhw  mm7,[GOTOFF(eax,PW_F0402)]      ; mm7=(2*CrH * FIX(0.40200))
+       pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrL * FIX(0.40200))
+
+       paddw   mm6,[GOTOFF(eax,PW_ONE)]
+       paddw   mm4,[GOTOFF(eax,PW_ONE)]
+       psraw   mm6,1                   ; mm6=(CbH * -FIX(0.22800))
+       psraw   mm4,1                   ; mm4=(CbL * -FIX(0.22800))
+       paddw   mm7,[GOTOFF(eax,PW_ONE)]
+       paddw   mm0,[GOTOFF(eax,PW_ONE)]
+       psraw   mm7,1                   ; mm7=(CrH * FIX(0.40200))
+       psraw   mm0,1                   ; mm0=(CrL * FIX(0.40200))
+
+       paddw   mm6,mm5
+       paddw   mm4,mm2
+       paddw   mm6,mm5                 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
+       paddw   mm4,mm2                 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
+       paddw   mm7,mm1                 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
+       paddw   mm0,mm3                 ; mm0=(CrL * FIX(1.40200))=(R-Y)L
+
+       movq    MMWORD [wk(0)], mm6     ; wk(0)=(B-Y)H
+       movq    MMWORD [wk(1)], mm7     ; wk(1)=(R-Y)H
+
+       movq      mm6,mm5
+       movq      mm7,mm2
+       punpcklwd mm5,mm1
+       punpckhwd mm6,mm1
+       pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
+       pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
+       punpcklwd mm2,mm3
+       punpckhwd mm7,mm3
+       pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
+       pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+       paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
+       paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
+       psrad     mm5,SCALEBITS
+       psrad     mm6,SCALEBITS
+       paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
+       paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
+       psrad     mm2,SCALEBITS
+       psrad     mm7,SCALEBITS
+
+       packssdw  mm5,mm6       ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+       packssdw  mm2,mm7       ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+       psubw     mm5,mm1       ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+       psubw     mm2,mm3       ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+       movq    MMWORD [wk(2)], mm5     ; wk(2)=(G-Y)H
+
+       mov     al,2                    ; Yctr
+       jmp     short .Yloop_1st
+       alignx  16,7
+
+.Yloop_2nd:
+       movq    mm0, MMWORD [wk(1)]     ; mm0=(R-Y)H
+       movq    mm2, MMWORD [wk(2)]     ; mm2=(G-Y)H
+       movq    mm4, MMWORD [wk(0)]     ; mm4=(B-Y)H
+       alignx  16,7
+
+.Yloop_1st:
+       movq    mm7, MMWORD [esi]       ; mm7=Y(01234567)
+
+       pcmpeqw mm6,mm6
+       psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}
+       pand    mm6,mm7                 ; mm6=Y(0246)=YE
+       psrlw   mm7,BYTE_BIT            ; mm7=Y(1357)=YO
+
+       movq    mm1,mm0                 ; mm1=mm0=(R-Y)(L/H)
+       movq    mm3,mm2                 ; mm3=mm2=(G-Y)(L/H)
+       movq    mm5,mm4                 ; mm5=mm4=(B-Y)(L/H)
+
+       paddw     mm0,mm6               ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
+       paddw     mm1,mm7               ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
+       packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
+       packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)
+
+       paddw     mm2,mm6               ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
+       paddw     mm3,mm7               ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
+       packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
+       packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)
+
+       paddw     mm4,mm6               ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
+       paddw     mm5,mm7               ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
+       packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
+       packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+       ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+       ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+       ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+       ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
+
+       punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
+       punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
+       punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)
+
+       movq      mmG,mmA
+       movq      mmH,mmA
+       punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
+       punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)
+
+       psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
+       psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)
+
+       movq      mmC,mmD
+       movq      mmB,mmD
+       punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
+       punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)
+
+       psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)
+
+       movq      mmF,mmE
+       punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
+       punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)
+
+       punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
+       punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
+       punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)
+
+       cmp     ecx, byte SIZEOF_MMWORD
+       jb      short .column_st16
+
+       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
+       movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
+
+       sub     ecx, byte SIZEOF_MMWORD
+       jz      short .endcolumn
+
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
+       add     esi, byte SIZEOF_MMWORD                 ; inptr0
+       dec     al                      ; Yctr
+       jnz     near .Yloop_2nd
+
+       add     ebx, byte SIZEOF_MMWORD                 ; inptr1
+       add     edx, byte SIZEOF_MMWORD                 ; inptr2
+       jmp     near .columnloop
+       alignx  16,7
+
+.column_st16:
+       lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
+       cmp     ecx, byte 2*SIZEOF_MMWORD
+       jb      short .column_st8
+       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
+       movq    mmA,mmC
+       sub     ecx, byte 2*SIZEOF_MMWORD
+       add     edi, byte 2*SIZEOF_MMWORD
+       jmp     short .column_st4
+.column_st8:
+       cmp     ecx, byte SIZEOF_MMWORD
+       jb      short .column_st4
+       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+       movq    mmA,mmE
+       sub     ecx, byte SIZEOF_MMWORD
+       add     edi, byte SIZEOF_MMWORD
+.column_st4:
+       movd    eax,mmA
+       cmp     ecx, byte SIZEOF_DWORD
+       jb      short .column_st2
+       mov     DWORD [edi+0*SIZEOF_DWORD], eax
+       psrlq   mmA,DWORD_BIT
+       movd    eax,mmA
+       sub     ecx, byte SIZEOF_DWORD
+       add     edi, byte SIZEOF_DWORD
+.column_st2:
+       cmp     ecx, byte SIZEOF_WORD
+       jb      short .column_st1
+       mov     WORD [edi+0*SIZEOF_WORD], ax
+       shr     eax,WORD_BIT
+       sub     ecx, byte SIZEOF_WORD
+       add     edi, byte SIZEOF_WORD
+.column_st1:
+       cmp     ecx, byte SIZEOF_BYTE
+       jb      short .endcolumn
+       mov     BYTE [edi+0*SIZEOF_BYTE], al
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+%ifdef RGBX_FILLER_0XFF
+       pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
+       pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%else
+       pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
+       pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
+%endif
+       ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
+       ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
+       ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
+       ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
+
+       punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
+       punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
+       punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
+       punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)
+
+       movq      mmC,mmA
+       punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
+       punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
+       movq      mmG,mmB
+       punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
+       punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)
+
+       movq      mmD,mmA
+       punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
+       punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
+       movq      mmH,mmC
+       punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
+       punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)
+
+       cmp     ecx, byte SIZEOF_MMWORD
+       jb      short .column_st16
+
+       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
+       movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
+       movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH
+
+       sub     ecx, byte SIZEOF_MMWORD
+       jz      short .endcolumn
+
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
+       add     esi, byte SIZEOF_MMWORD                 ; inptr0
+       dec     al                      ; Yctr
+       jnz     near .Yloop_2nd
+
+       add     ebx, byte SIZEOF_MMWORD                 ; inptr1
+       add     edx, byte SIZEOF_MMWORD                 ; inptr2
+       jmp     near .columnloop
+       alignx  16,7
+
+.column_st16:
+       cmp     ecx, byte SIZEOF_MMWORD/2
+       jb      short .column_st8
+       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+       movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
+       movq    mmA,mmC
+       movq    mmD,mmH
+       sub     ecx, byte SIZEOF_MMWORD/2
+       add     edi, byte 2*SIZEOF_MMWORD
+.column_st8:
+       cmp     ecx, byte SIZEOF_MMWORD/4
+       jb      short .column_st4
+       movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
+       movq    mmA,mmD
+       sub     ecx, byte SIZEOF_MMWORD/4
+       add     edi, byte 1*SIZEOF_MMWORD
+.column_st4:
+       cmp     ecx, byte SIZEOF_MMWORD/8
+       jb      short .endcolumn
+       movd    DWORD [edi+0*SIZEOF_DWORD], mmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+       emms            ; empty MMX state
+
+.return:
+       pop     edi
+       pop     esi
+;      pop     edx             ; need not be preserved
+;      pop     ecx             ; need not be preserved
+       pop     ebx
+       mov     esp,ebp         ; esp <- aligned ebp
+       pop     esp             ; esp <- original ebp
+       pop     ebp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
+;                                 JSAMPIMAGE input_buf,
+;                                 JDIMENSION in_row_group_ctr,
+;                                 JSAMPARRAY output_buf);
+;
+; Thin wrapper: the work is done by two calls to
+; jsimd_h2v1_merged_upsample_mmx, one per output row.  A temporary
+; three-entry input_buf and the four call arguments are built on the stack
+; once; between the calls only the plane-0 entry and the output_buf argument
+; are advanced by one row pointer, so both calls share the same Cb/Cr entries.
+;
+; NOTE(review): plane 0 is offset by in_row_group_ctr here (lea) and the same
+; in_row_group_ctr is passed on to the h2v1 routine, which presumably indexes
+; plane 0 by it again, giving luma rows 2*ctr and 2*ctr+1 -- confirm against
+; the h2v1 prologue.
+;
+
+%define output_width(b)        (b)+8                   ; JDIMENSION output_width
+%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
+
+       align   16
+       global  EXTN(jsimd_h2v2_merged_upsample_mmx)
+
+EXTN(jsimd_h2v2_merged_upsample_mmx):
+       push    ebp
+       mov     ebp,esp
+       push    ebx
+;      push    ecx             ; need not be preserved
+;      push    edx             ; need not be preserved
+       push    esi
+       push    edi
+
+       mov     eax, JDIMENSION [output_width(ebp)]
+
+       mov     edi, JSAMPIMAGE [input_buf(ebp)]
+       mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
+       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+       mov     edi, JSAMPARRAY [output_buf(ebp)]
+       lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][in_row_group_ctr]
+
+       ; Build the temporary input_buf[3] on the stack (entry 0 at lowest address).
+       push    edx                     ; inptr2
+       push    ebx                     ; inptr1
+       push    esi                     ; inptr00
+       mov     ebx,esp                 ; ebx = address of the temporary input_buf
+
+       ; Push the four h2v1 arguments, last argument first.
+       push    edi                     ; output_buf (outptr0)
+       push    ecx                     ; in_row_group_ctr
+       push    ebx                     ; input_buf
+       push    eax                     ; output_width
+
+       call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+       ; The code below reuses ebx, esi and edi after the call, i.e. it relies
+       ; on the callee preserving them (its epilogue pops all three).
+       add     esi, byte SIZEOF_JSAMPROW       ; inptr01
+       add     edi, byte SIZEOF_JSAMPROW       ; outptr1
+       mov     POINTER [ebx+0*SIZEOF_POINTER], esi     ; patch temporary input_buf[0]
+       mov     POINTER [ebx-1*SIZEOF_POINTER], edi     ; patch the pushed output_buf argument
+
+       call    near EXTN(jsimd_h2v1_merged_upsample_mmx)
+
+       add     esp, byte 7*SIZEOF_DWORD        ; drop 4 arguments + 3 temporary input_buf entries
+
+       pop     edi
+       pop     esi
+;      pop     edx             ; need not be preserved
+;      pop     ecx             ; need not be preserved
+       pop     ebx
+       pop     ebp
+       ret
+
diff --git a/simd/jdmrgss2.asm b/simd/jdmrgss2.asm
new file mode 100644 (file)
index 0000000..5018a23
--- /dev/null
@@ -0,0 +1,561 @@
+;
+; jdmrgss2.asm - merged upsampling/color conversion (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jcolsamp.inc"
+                               
+; --------------------------------------------------------------------------
+       SECTION SEG_TEXT
+       BITS    32
+;
+; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+; Reads row in_row_group_ctr of each of the three planes in input_buf and
+; writes one interleaved output row to output_buf[0].  Every 16-sample block
+; of Cb/Cr is converted once and then combined with two consecutive
+; 16-sample blocks of Y (low chroma half first, then high half).
+;
+; The three input rows are loaded with movdqa and therefore have to be
+; 16-byte aligned; the output row may be unaligned.  Output is written with
+; movntdq/maskmovdqu and followed by sfence.  Returns immediately when
+; output_width is 0.
+;
+; NOTE(review): RGB_PIXELSIZE, RGBX_FILLER_0XFF, the xmmA..xmmH register
+; aliases and the PW_*/PD_* constants are not defined in this file;
+; presumably they come from jcolsamp.inc and a companion file -- confirm.
+;
+
+%define output_width(b)        (b)+8                   ; JDIMENSION output_width
+%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
+
+%define original_ebp   ebp+0                   ; slot holding the pre-alignment frame base
+%define wk(i)          ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
+%define WK_NUM         3                       ; wk(0)=(B-Y)H, wk(1)=(R-Y)H, wk(2)=(G-Y)H
+%define gotptr         wk(0)-SIZEOF_POINTER    ; void * gotptr
+
+       align   16
+       global  EXTN(jsimd_h2v1_merged_upsample_sse2)
+
+EXTN(jsimd_h2v1_merged_upsample_sse2):
+       push    ebp
+       mov     eax,esp                         ; eax = address of saved ebp (argument base)
+       sub     esp, byte 4
+       and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
+       mov     [esp],eax                       ; remember the unaligned frame base
+       mov     ebp,esp                         ; ebp = aligned ebp
+       lea     esp, [wk(0)]                    ; reserve the wk[] scratch area
+       pushpic eax             ; make room for GOT address
+       push    ebx
+;      push    ecx             ; need not be preserved
+;      push    edx             ; need not be preserved
+       push    esi
+       push    edi
+
+       get_GOT ebx                     ; get GOT address
+       movpic  POINTER [gotptr], ebx   ; save GOT address
+
+       mov     ecx, JDIMENSION [output_width(eax)]     ; col
+       test    ecx,ecx
+       jz      near .return            ; nothing to do for a zero-width row
+
+       push    ecx                     ; ecx is reused as the row index below
+
+       mov     edi, JSAMPIMAGE [input_buf(eax)]
+       mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
+       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
+       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
+       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
+       mov     edi, JSAMPARRAY [output_buf(eax)]
+       mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
+       mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
+       mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
+       mov     edi, JSAMPROW [edi]                             ; outptr
+
+       pop     ecx                     ; col
+
+       alignx  16,7
+.columnloop:
+       ; eax is reloaded on every iteration because al doubles as Yctr below.
+       movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
+
+       movdqa    xmm6, XMMWORD [ebx]   ; xmm6=Cb(0123456789ABCDEF)
+       movdqa    xmm7, XMMWORD [edx]   ; xmm7=Cr(0123456789ABCDEF)
+
+       pxor      xmm1,xmm1             ; xmm1=(all 0's)
+       pcmpeqw   xmm3,xmm3
+       psllw     xmm3,7                ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+       movdqa    xmm4,xmm6
+       punpckhbw xmm6,xmm1             ; xmm6=Cb(89ABCDEF)=CbH
+       punpcklbw xmm4,xmm1             ; xmm4=Cb(01234567)=CbL
+       movdqa    xmm0,xmm7
+       punpckhbw xmm7,xmm1             ; xmm7=Cr(89ABCDEF)=CrH
+       punpcklbw xmm0,xmm1             ; xmm0=Cr(01234567)=CrL
+
+       ; 0xFF80 is -128 as a signed word: re-center Cb/Cr around zero.
+       paddw     xmm6,xmm3
+       paddw     xmm4,xmm3
+       paddw     xmm7,xmm3
+       paddw     xmm0,xmm3
+
+       ; (Original)
+       ; R = Y                + 1.40200 * Cr
+       ; G = Y - 0.34414 * Cb - 0.71414 * Cr
+       ; B = Y + 1.77200 * Cb
+       ;
+       ; (This implementation)
+       ; R = Y                + 0.40200 * Cr + Cr
+       ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       ; B = Y - 0.22800 * Cb + Cb + Cb
+
+       movdqa  xmm5,xmm6               ; xmm5=CbH
+       movdqa  xmm2,xmm4               ; xmm2=CbL
+       paddw   xmm6,xmm6               ; xmm6=2*CbH
+       paddw   xmm4,xmm4               ; xmm4=2*CbL
+       movdqa  xmm1,xmm7               ; xmm1=CrH
+       movdqa  xmm3,xmm0               ; xmm3=CrL
+       paddw   xmm7,xmm7               ; xmm7=2*CrH
+       paddw   xmm0,xmm0               ; xmm0=2*CrL
+
+       pmulhw  xmm6,[GOTOFF(eax,PW_MF0228)]    ; xmm6=(2*CbH * -FIX(0.22800))
+       pmulhw  xmm4,[GOTOFF(eax,PW_MF0228)]    ; xmm4=(2*CbL * -FIX(0.22800))
+       pmulhw  xmm7,[GOTOFF(eax,PW_F0402)]     ; xmm7=(2*CrH * FIX(0.40200))
+       pmulhw  xmm0,[GOTOFF(eax,PW_F0402)]     ; xmm0=(2*CrL * FIX(0.40200))
+
+       ; Add PW_ONE, then shift right by one: halves the doubled products.
+       paddw   xmm6,[GOTOFF(eax,PW_ONE)]
+       paddw   xmm4,[GOTOFF(eax,PW_ONE)]
+       psraw   xmm6,1                  ; xmm6=(CbH * -FIX(0.22800))
+       psraw   xmm4,1                  ; xmm4=(CbL * -FIX(0.22800))
+       paddw   xmm7,[GOTOFF(eax,PW_ONE)]
+       paddw   xmm0,[GOTOFF(eax,PW_ONE)]
+       psraw   xmm7,1                  ; xmm7=(CrH * FIX(0.40200))
+       psraw   xmm0,1                  ; xmm0=(CrL * FIX(0.40200))
+
+       paddw   xmm6,xmm5
+       paddw   xmm4,xmm2
+       paddw   xmm6,xmm5               ; xmm6=(CbH * FIX(1.77200))=(B-Y)H
+       paddw   xmm4,xmm2               ; xmm4=(CbL * FIX(1.77200))=(B-Y)L
+       paddw   xmm7,xmm1               ; xmm7=(CrH * FIX(1.40200))=(R-Y)H
+       paddw   xmm0,xmm3               ; xmm0=(CrL * FIX(1.40200))=(R-Y)L
+
+       ; The high halves are parked in wk[] for the second Y pass.
+       movdqa  XMMWORD [wk(0)], xmm6   ; wk(0)=(B-Y)H
+       movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=(R-Y)H
+
+       ; Interleave Cb with Cr so one pmaddwd yields the Cb/Cr sum per pixel.
+       movdqa    xmm6,xmm5
+       movdqa    xmm7,xmm2
+       punpcklwd xmm5,xmm1
+       punpckhwd xmm6,xmm1
+       pmaddwd   xmm5,[GOTOFF(eax,PW_MF0344_F0285)]
+       pmaddwd   xmm6,[GOTOFF(eax,PW_MF0344_F0285)]
+       punpcklwd xmm2,xmm3
+       punpckhwd xmm7,xmm3
+       pmaddwd   xmm2,[GOTOFF(eax,PW_MF0344_F0285)]
+       pmaddwd   xmm7,[GOTOFF(eax,PW_MF0344_F0285)]
+
+       ; Add PD_ONEHALF and shift right by SCALEBITS (descale the dword sums).
+       paddd     xmm5,[GOTOFF(eax,PD_ONEHALF)]
+       paddd     xmm6,[GOTOFF(eax,PD_ONEHALF)]
+       psrad     xmm5,SCALEBITS
+       psrad     xmm6,SCALEBITS
+       paddd     xmm2,[GOTOFF(eax,PD_ONEHALF)]
+       paddd     xmm7,[GOTOFF(eax,PD_ONEHALF)]
+       psrad     xmm2,SCALEBITS
+       psrad     xmm7,SCALEBITS
+
+       packssdw  xmm5,xmm6     ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
+       packssdw  xmm2,xmm7     ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
+       psubw     xmm5,xmm1     ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
+       psubw     xmm2,xmm3     ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
+
+       movdqa  XMMWORD [wk(2)], xmm5   ; wk(2)=(G-Y)H
+
+       ; First pass uses the low halves still live in xmm0/xmm2/xmm4.
+       ; Writing al clobbers the low byte of the GOT address held in eax.
+       mov     al,2                    ; Yctr
+       jmp     short .Yloop_1st
+       alignx  16,7
+
+.Yloop_2nd:
+       movdqa  xmm0, XMMWORD [wk(1)]   ; xmm0=(R-Y)H
+       movdqa  xmm2, XMMWORD [wk(2)]   ; xmm2=(G-Y)H
+       movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(B-Y)H
+       alignx  16,7
+
+.Yloop_1st:
+       movdqa  xmm7, XMMWORD [esi]     ; xmm7=Y(0123456789ABCDEF)
+
+       pcmpeqw xmm6,xmm6
+       psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
+       pand    xmm6,xmm7               ; xmm6=Y(02468ACE)=YE
+       psrlw   xmm7,BYTE_BIT           ; xmm7=Y(13579BDF)=YO
+
+       ; Each chroma difference is shared by one even and one odd Y sample.
+       movdqa  xmm1,xmm0               ; xmm1=xmm0=(R-Y)(L/H)
+       movdqa  xmm3,xmm2               ; xmm3=xmm2=(G-Y)(L/H)
+       movdqa  xmm5,xmm4               ; xmm5=xmm4=(B-Y)(L/H)
+
+       paddw     xmm0,xmm6             ; xmm0=((R-Y)+YE)=RE=R(02468ACE)
+       paddw     xmm1,xmm7             ; xmm1=((R-Y)+YO)=RO=R(13579BDF)
+       packuswb  xmm0,xmm0             ; xmm0=R(02468ACE********)
+       packuswb  xmm1,xmm1             ; xmm1=R(13579BDF********)
+
+       paddw     xmm2,xmm6             ; xmm2=((G-Y)+YE)=GE=G(02468ACE)
+       paddw     xmm3,xmm7             ; xmm3=((G-Y)+YO)=GO=G(13579BDF)
+       packuswb  xmm2,xmm2             ; xmm2=G(02468ACE********)
+       packuswb  xmm3,xmm3             ; xmm3=G(13579BDF********)
+
+       paddw     xmm4,xmm6             ; xmm4=((B-Y)+YE)=BE=B(02468ACE)
+       paddw     xmm5,xmm7             ; xmm5=((B-Y)+YO)=BO=B(13579BDF)
+       packuswb  xmm4,xmm4             ; xmm4=B(02468ACE********)
+       packuswb  xmm5,xmm5             ; xmm5=B(13579BDF********)
+
+%if RGB_PIXELSIZE == 3 ; ---------------
+
+       ; In the layouts below "cp" is component c of pixel p (hex), so the
+       ; goal is the byte order 00 10 20 01 11 21 02 12 22 ...
+       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+       ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **)
+
+       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+       punpcklbw xmmE,xmmB     ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F)
+       punpcklbw xmmD,xmmF     ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F)
+
+       movdqa    xmmG,xmmA
+       movdqa    xmmH,xmmA
+       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07)
+       punpckhwd xmmG,xmmE     ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F)
+
+       psrldq    xmmH,2        ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --)
+       psrldq    xmmE,2        ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --)
+
+       movdqa    xmmC,xmmD
+       movdqa    xmmB,xmmD
+       punpcklwd xmmD,xmmH     ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18)
+       punpckhwd xmmC,xmmH     ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --)
+
+       psrldq    xmmB,2        ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --)
+
+       movdqa    xmmF,xmmE
+       punpcklwd xmmE,xmmB     ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29)
+       punpckhwd xmmF,xmmB     ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --)
+
+       pshufd    xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03)
+       movdqa    xmmB,xmmE
+       punpckldq xmmA,xmmD     ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14)
+       punpckldq xmmE,xmmH     ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07)
+       punpckhdq xmmD,xmmB     ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29)
+
+       pshufd    xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B)
+       movdqa    xmmB,xmmF
+       punpckldq xmmG,xmmC     ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C)
+       punpckldq xmmF,xmmH     ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F)
+       punpckhdq xmmC,xmmB     ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --)
+
+       punpcklqdq xmmA,xmmE    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
+       punpcklqdq xmmD,xmmG    ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
+       punpcklqdq xmmF,xmmC    ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
+
+       ; xmmA, xmmD, xmmF now hold the 48 output bytes of 16 pixels, in order.
+       cmp     ecx, byte SIZEOF_XMMWORD
+       jb      short .column_st32      ; fewer than 16 pixels left: partial store
+
+       test    edi, SIZEOF_XMMWORD-1
+       jnz     short .out1
+       ; --(aligned)-------------------
+       movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+       jmp     short .out0
+.out1: ; --(unaligned)-----------------
+       ; maskmovdqu stores to [edi] implicitly; an all-ones mask writes all 16 bytes.
+       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [edi], xmmF
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+.out0:
+       sub     ecx, byte SIZEOF_XMMWORD
+       jz      near .endcolumn
+
+       add     esi, byte SIZEOF_XMMWORD        ; inptr0
+       dec     al                      ; Yctr
+       jnz     near .Yloop_2nd         ; second Y block reuses the saved high halves
+
+       add     ebx, byte SIZEOF_XMMWORD        ; inptr1
+       add     edx, byte SIZEOF_XMMWORD        ; inptr2
+       jmp     near .columnloop
+       alignx  16,7
+
+       ; ---- final partial block: ecx = pixels left (less than SIZEOF_XMMWORD) ----
+.column_st32:
+       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
+       lea     ecx, [ecx+ecx*2]                ; imul ecx, RGB_PIXELSIZE
+       cmp     ecx, byte 2*SIZEOF_XMMWORD
+       jb      short .column_st16
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [edi], xmmD
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmF                       ; remaining bytes come from the third vector
+       sub     ecx, byte 2*SIZEOF_XMMWORD
+       jmp     short .column_st15
+.column_st16:
+       cmp     ecx, byte SIZEOF_XMMWORD
+       jb      short .column_st15
+       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmD                       ; remaining bytes come from the second vector
+       sub     ecx, byte SIZEOF_XMMWORD
+.column_st15:
+       ; Build in xmmE a byte mask whose first ecx bytes have their top bit
+       ; set (maskmovdqu tests only the top bit of each mask byte).  xmmE and
+       ; xmmH supply the even and odd mask bytes before being interleaved.
+       mov     eax,ecx                 ; eax = number of bytes still to store
+       xor     ecx, byte 0x0F
+       shl     ecx, 2
+       movd    xmmB,ecx
+       psrlq   xmmH,4
+       pcmpeqb xmmE,xmmE
+       psrlq   xmmH,xmmB
+       psrlq   xmmE,xmmB
+       punpcklbw xmmE,xmmH
+       ; ----------------
+       ; If outptr is unaligned and the bytes to store do not cross the next
+       ; 16-byte boundary, round edi down and shift data and mask left by the
+       ; misalignment so the masked store targets one aligned block.
+       mov     ecx,edi
+       and     ecx, byte SIZEOF_XMMWORD-1
+       jz      short .adj0
+       add     eax,ecx
+       cmp     eax, byte SIZEOF_XMMWORD
+       ja      short .adj0
+       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
+       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
+       movdqa  xmmG,xmmA
+       movdqa  xmmC,xmmE
+       pslldq  xmmA, SIZEOF_XMMWORD/2
+       pslldq  xmmE, SIZEOF_XMMWORD/2
+       movd    xmmD,ecx
+       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+       jb      short .adj1
+       ; Shift is at least half a vector: the pslldq above plus a per-qword shift.
+       movd    xmmF,ecx
+       psllq   xmmA,xmmF
+       psllq   xmmE,xmmF
+       jmp     short .adj0
+       ; Shift is under half a vector: OR the carried bits into the shifted original.
+.adj1: neg     ecx
+       movd    xmmF,ecx
+       psrlq   xmmA,xmmF
+       psrlq   xmmE,xmmF
+       psllq   xmmG,xmmD
+       psllq   xmmC,xmmD
+       por     xmmA,xmmG
+       por     xmmE,xmmC
+.adj0: ; ----------------
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+
+%else ; RGB_PIXELSIZE == 4 ; -----------
+
+       ; Y (xmm6/xmm7) is no longer needed, so those registers become the
+       ; filler component: 0xFF if RGBX_FILLER_0XFF is defined, otherwise 0.
+%ifdef RGBX_FILLER_0XFF
+       pcmpeqb   xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+       pcmpeqb   xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%else
+       pxor      xmm6,xmm6             ; xmm6=XE=X(02468ACE********)
+       pxor      xmm7,xmm7             ; xmm7=XO=X(13579BDF********)
+%endif
+       ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **)
+       ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **)
+       ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **)
+       ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **)
+
+       punpcklbw xmmA,xmmC     ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E)
+       punpcklbw xmmE,xmmG     ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E)
+       punpcklbw xmmB,xmmD     ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F)
+       punpcklbw xmmF,xmmH     ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F)
+
+       movdqa    xmmC,xmmA
+       punpcklwd xmmA,xmmE     ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36)
+       punpckhwd xmmC,xmmE     ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E)
+       movdqa    xmmG,xmmB
+       punpcklwd xmmB,xmmF     ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37)
+       punpckhwd xmmG,xmmF     ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F)
+
+       movdqa    xmmD,xmmA
+       punpckldq xmmA,xmmB     ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
+       punpckhdq xmmD,xmmB     ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
+       movdqa    xmmH,xmmC
+       punpckldq xmmC,xmmG     ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
+       punpckhdq xmmH,xmmG     ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
+
+       ; xmmA, xmmD, xmmC, xmmH now hold the 64 output bytes of 16 pixels, in order.
+       cmp     ecx, byte SIZEOF_XMMWORD
+       jb      short .column_st32      ; fewer than 16 pixels left: partial store
+
+       test    edi, SIZEOF_XMMWORD-1
+       jnz     short .out1
+       ; --(aligned)-------------------
+       movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA
+       movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD
+       movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC
+       movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH
+       add     edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
+       jmp     short .out0
+.out1: ; --(unaligned)-----------------
+       ; maskmovdqu stores to [edi] implicitly; an all-ones mask writes all 16 bytes.
+       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [edi], xmmC
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [edi], xmmH
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+.out0:
+       sub     ecx, byte SIZEOF_XMMWORD
+       jz      near .endcolumn
+
+       add     esi, byte SIZEOF_XMMWORD        ; inptr0
+       dec     al                      ; Yctr
+       jnz     near .Yloop_2nd         ; second Y block reuses the saved high halves
+
+       add     ebx, byte SIZEOF_XMMWORD        ; inptr1
+       add     edx, byte SIZEOF_XMMWORD        ; inptr2
+       jmp     near .columnloop
+       alignx  16,7
+
+       ; ---- final partial block: ecx = pixels left (less than SIZEOF_XMMWORD) ----
+       ; Here ecx stays in pixels; every output vector carries SIZEOF_XMMWORD/4 pixels.
+.column_st32:
+       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
+       cmp     ecx, byte SIZEOF_XMMWORD/2
+       jb      short .column_st16
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [edi], xmmD
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmC                       ; continue with the third and
+       movdqa  xmmD,xmmH                       ; fourth vectors
+       sub     ecx, byte SIZEOF_XMMWORD/2
+.column_st16:
+       cmp     ecx, byte SIZEOF_XMMWORD/4
+       jb      short .column_st15
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       add     edi, byte SIZEOF_XMMWORD        ; outptr
+       movdqa  xmmA,xmmD
+       sub     ecx, byte SIZEOF_XMMWORD/4
+.column_st15:
+       cmp     ecx, byte SIZEOF_XMMWORD/16
+       jb      short .endcolumn        ; no pixels left
+       ; Build in xmmE a byte mask covering the first ecx pixels (4 bytes each).
+       mov     eax,ecx                 ; eax = number of pixels still to store
+       xor     ecx, byte 0x03
+       inc     ecx
+       shl     ecx, 4
+       movd    xmmF,ecx
+       psrlq   xmmE,xmmF
+       punpcklbw xmmE,xmmE
+       ; ----------------
+       ; If outptr is unaligned and the bytes to store do not cross the next
+       ; 16-byte boundary, round edi down and shift data and mask left by the
+       ; misalignment so the masked store targets one aligned block.
+       mov     ecx,edi
+       and     ecx, byte SIZEOF_XMMWORD-1
+       jz      short .adj0
+       lea     eax, [ecx+eax*4]        ; RGB_PIXELSIZE
+       cmp     eax, byte SIZEOF_XMMWORD
+       ja      short .adj0
+       and     edi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
+       shl     ecx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
+       movdqa  xmmB,xmmA
+       movdqa  xmmG,xmmE
+       pslldq  xmmA, SIZEOF_XMMWORD/2
+       pslldq  xmmE, SIZEOF_XMMWORD/2
+       movd    xmmC,ecx
+       sub     ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
+       jb      short .adj1
+       ; Shift is at least half a vector: the pslldq above plus a per-qword shift.
+       movd    xmmH,ecx
+       psllq   xmmA,xmmH
+       psllq   xmmE,xmmH
+       jmp     short .adj0
+       ; Shift is under half a vector: OR the carried bits into the shifted original.
+.adj1: neg     ecx
+       movd    xmmH,ecx
+       psrlq   xmmA,xmmH
+       psrlq   xmmE,xmmH
+       psllq   xmmB,xmmC
+       psllq   xmmG,xmmC
+       por     xmmA,xmmB
+       por     xmmE,xmmG
+.adj0: ; ----------------
+       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+
+%endif ; RGB_PIXELSIZE ; ---------------
+
+.endcolumn:
+       sfence          ; flush the write buffer
+
+.return:
+       pop     edi
+       pop     esi
+;      pop     edx             ; need not be preserved
+;      pop     ecx             ; need not be preserved
+       pop     ebx
+       mov     esp,ebp         ; esp <- aligned ebp
+       pop     esp             ; esp <- frame base saved in the prologue (address of saved ebp)
+       pop     ebp
+       ret
+
+; --------------------------------------------------------------------------
+;
+; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+;
+; GLOBAL(void)
+; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width,
+;                                  JSAMPIMAGE input_buf,
+;                                  JDIMENSION in_row_group_ctr,
+;                                  JSAMPARRAY output_buf);
+;
+; Implemented as two calls to jsimd_h2v1_merged_upsample_sse2 (outptr0, then outptr1).

+%define output_width(b)        (b)+8                   ; JDIMENSION output_width  (b = ebp after the prologue)
+%define input_buf(b)           (b)+12          ; JSAMPIMAGE input_buf
+%define in_row_group_ctr(b)    (b)+16          ; JDIMENSION in_row_group_ctr
+%define output_buf(b)          (b)+20          ; JSAMPARRAY output_buf
+
+       align   16
+       global  EXTN(jsimd_h2v2_merged_upsample_sse2)
+
+EXTN(jsimd_h2v2_merged_upsample_sse2):
+       push    ebp
+       mov     ebp,esp         ; ebp = base used by the argument macros above
+       push    ebx
+;      push    ecx             ; need not be preserved
+;      push    edx             ; need not be preserved
+       push    esi
+       push    edi
+
+       mov     eax, POINTER [output_width(ebp)]        ; eax = output_width; NOTE(review): a JDIMENSION read with the POINTER size macro - confirm both are the same width
+
+       mov     edi, JSAMPIMAGE [input_buf(ebp)]        ; edi = input_buf
+       mov     ecx, JDIMENSION [in_row_group_ctr(ebp)] ; ecx = in_row_group_ctr
+       mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]       ; esi = input_buf[0]
+       mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]       ; ebx = input_buf[1]
+       mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]       ; edx = input_buf[2]
+       mov     edi, JSAMPARRAY [output_buf(ebp)]       ; edi = output_buf (input_buf pointer is no longer needed)
+       lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; advance input_buf[0] by in_row_group_ctr rows; presumably the callee indexes by in_row_group_ctr again - confirm against h2v1
+
+       push    edx                     ; inptr2   (entry [2] of a 3-pointer array built on the stack)
+       push    ebx                     ; inptr1   (entry [1])
+       push    esi                     ; inptr00  (entry [0])
+       mov     ebx,esp                 ; ebx -> that array; it is passed below as the callee's input_buf
+
+       push    edi                     ; output_buf (outptr0)
+       push    ecx                     ; in_row_group_ctr
+       push    ebx                     ; input_buf
+       push    eax                     ; output_width
+
+       call    near EXTN(jsimd_h2v1_merged_upsample_sse2)      ; first call: inptr00 / outptr0
+
+       add     esi, byte SIZEOF_JSAMPROW       ; inptr01
+       add     edi, byte SIZEOF_JSAMPROW       ; outptr1
+       mov     POINTER [ebx+0*SIZEOF_POINTER], esi     ; patch array entry [0]: inptr00 -> inptr01
+       mov     POINTER [ebx-1*SIZEOF_POINTER], edi     ; patch the stacked output_buf argument (pushed right below the array)
+
+       call    near EXTN(jsimd_h2v1_merged_upsample_sse2)      ; second call reuses the same stacked arguments; assumes the callee left them and esi/edi/ebx intact - confirm
+
+       add     esp, byte 7*SIZEOF_DWORD        ; discard the 4 call arguments and the 3 array entries
+
+       pop     edi
+       pop     esi
+;      pop     edx             ; need not be preserved
+;      pop     ecx             ; need not be preserved
+       pop     ebx
+       pop     ebp
+       ret
+
index a5b432ffad4fbbc18747e1308d107653570b7822..c21cf29b975c98d4fd68470e03b965f8cae78b9c 100644 (file)
 #define jsimd_h2v2_fancy_upsample_mmx         jSFUpH2V2M
 #define jsimd_h2v1_fancy_upsample_mmx         jSFUpH2V1M
 #define jsimd_h2v2_merged_upsample_mmx        jSMUpH2V2M
+#define jsimd_h2v2_extrgb_merged_upsample_mmx jSMUpH2V2EXTRGBM /* ext* aliases: base alias with the upper-cased layout tag inserted before the M suffix */
+#define jsimd_h2v2_extrgbx_merged_upsample_mmx jSMUpH2V2EXTRGBXM
+#define jsimd_h2v2_extbgr_merged_upsample_mmx jSMUpH2V2EXTBGRM
+#define jsimd_h2v2_extbgrx_merged_upsample_mmx jSMUpH2V2EXTBGRXM
+#define jsimd_h2v2_extxbgr_merged_upsample_mmx jSMUpH2V2EXTXBGRM
+#define jsimd_h2v2_extxrgb_merged_upsample_mmx jSMUpH2V2EXTXRGBM
 #define jsimd_h2v1_merged_upsample_mmx        jSMUpH2V1M
+#define jsimd_h2v1_extrgb_merged_upsample_mmx jSMUpH2V1EXTRGBM
+#define jsimd_h2v1_extrgbx_merged_upsample_mmx jSMUpH2V1EXTRGBXM
+#define jsimd_h2v1_extbgr_merged_upsample_mmx jSMUpH2V1EXTBGRM
+#define jsimd_h2v1_extbgrx_merged_upsample_mmx jSMUpH2V1EXTBGRXM
+#define jsimd_h2v1_extxbgr_merged_upsample_mmx jSMUpH2V1EXTXBGRM
+#define jsimd_h2v1_extxrgb_merged_upsample_mmx jSMUpH2V1EXTXRGBM
 #define jsimd_h2v2_upsample_sse2              jSUpH2V2S2
 #define jsimd_h2v1_upsample_sse2              jSUpH2V1S2
 #define jconst_fancy_upsample_sse2            jSCFUpS2
 #define jsimd_h2v1_fancy_upsample_sse2        jSFUpH2V1S2
 #define jconst_merged_upsample_sse2           jSCMUpS2
 #define jsimd_h2v2_merged_upsample_sse2       jSMUpH2V2S2
+#define jsimd_h2v2_extrgb_merged_upsample_sse2 jSMUpH2V2EXTRGBS2 /* ext* aliases: base alias with the upper-cased layout tag inserted before the S2 suffix */
+#define jsimd_h2v2_extrgbx_merged_upsample_sse2 jSMUpH2V2EXTRGBXS2
+#define jsimd_h2v2_extbgr_merged_upsample_sse2 jSMUpH2V2EXTBGRS2
+#define jsimd_h2v2_extbgrx_merged_upsample_sse2 jSMUpH2V2EXTBGRXS2
+#define jsimd_h2v2_extxbgr_merged_upsample_sse2 jSMUpH2V2EXTXBGRS2
+#define jsimd_h2v2_extxrgb_merged_upsample_sse2 jSMUpH2V2EXTXRGBS2
 #define jsimd_h2v1_merged_upsample_sse2       jSMUpH2V1S2
+#define jsimd_h2v1_extrgb_merged_upsample_sse2 jSMUpH2V1EXTRGBS2
+#define jsimd_h2v1_extrgbx_merged_upsample_sse2 jSMUpH2V1EXTRGBXS2
+#define jsimd_h2v1_extbgr_merged_upsample_sse2 jSMUpH2V1EXTBGRS2
+#define jsimd_h2v1_extbgrx_merged_upsample_sse2 jSMUpH2V1EXTBGRXS2
+#define jsimd_h2v1_extxbgr_merged_upsample_sse2 jSMUpH2V1EXTXBGRS2
+#define jsimd_h2v1_extxrgb_merged_upsample_sse2 jSMUpH2V1EXTXRGBS2
 #define jsimd_convsamp_mmx                    jSConvM
 #define jsimd_convsamp_sse2                   jSConvS2
 #define jsimd_convsamp_float_3dnow            jSConvF3D
@@ -264,9 +288,45 @@ EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
 EXTERN(void) jsimd_h2v2_merged_upsample_mmx
         JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
              JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmx /* h2v2 ext* variants: same arguments as the base routine; the tag presumably names the JCS_EXT_* output layout - confirm in jsimd.c */
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 EXTERN(void) jsimd_h2v1_merged_upsample_mmx
         JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
              JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmx /* h2v1 ext* variants: same arguments as the base routine; the tag presumably names the JCS_EXT_* output layout - confirm in jsimd.c */
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmx
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 
 EXTERN(void) jsimd_h2v2_upsample_sse2
         JPP((int max_v_samp_factor, JDIMENSION output_width,
@@ -287,9 +347,45 @@ extern const int jconst_merged_upsample_sse2[];
 EXTERN(void) jsimd_h2v2_merged_upsample_sse2
         JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
              JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_sse2 /* h2v2 ext* variants: same arguments as the base routine; the tag presumably names the JCS_EXT_* output layout - confirm in jsimd.c */
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 EXTERN(void) jsimd_h2v1_merged_upsample_sse2
         JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
              JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_sse2 /* h2v1 ext* variants: same arguments as the base routine; the tag presumably names the JCS_EXT_* output layout - confirm in jsimd.c */
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_sse2
+        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
+             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 
 /* SIMD Sample Conversion */
 EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,