]> granicus.if.org Git - libx264/commitdiff
Add AltiVec implementation of
authorGuillaume Poirier <gpoirier@mplayerhq.hu>
Mon, 10 Dec 2007 22:09:13 +0000 (22:09 +0000)
committerGuillaume Poirier <gpoirier@mplayerhq.hu>
Mon, 10 Dec 2007 22:09:13 +0000 (22:09 +0000)
- x264_zigzag_scan_4x4_frame_altivec()
- x264_zigzag_scan_4x4ac_frame_altivec()
- x264_zigzag_scan_4x4_field_altivec()
- x264_zigzag_scan_4x4ac_field_altivec()
each around 1.3 to 1.8x faster than the C version
Patch by Noboru Asai % noboru P asai A gmail P com%

git-svn-id: svn://svn.videolan.org/x264/trunk@711 df754926-b1dd-0310-bc7b-ec298dee348c

common/dct.c
common/ppc/dct.c
common/ppc/dct.h

index 0e724e42a2a3fdeefcb103d0e036e6687c827f57..c3983501bbc0a4286cbcee973c44411a8fcdacc4 100644 (file)
@@ -601,6 +601,14 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         if( cpu&X264_CPU_SSE2 )
             pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
 #endif
+#endif
+
+#ifdef ARCH_PPC
+        if( cpu&X264_CPU_ALTIVEC )
+        {
+            pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
+            pf->scan_4x4ac = x264_zigzag_scan_4x4ac_field_altivec;
+        }
 #endif
     }
     else
@@ -610,5 +618,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->scan_4x4ac = zigzag_scan_4x4ac_frame;
         pf->sub_4x4    = zigzag_sub_4x4_frame;
         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
+
+#ifdef ARCH_PPC
+        if( cpu&X264_CPU_ALTIVEC )
+        {
+            pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
+            pf->scan_4x4ac = x264_zigzag_scan_4x4ac_frame_altivec;
+        }
+#endif
     }
 }
index 87685305658b33eb13a173d50699ed0ca7e69a0a..2be989373436d7f00d9c3649d48781b62d86f548 100644 (file)
@@ -455,3 +455,106 @@ void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][8][8] )
     x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+0], dct[2] );
     x264_add8x8_idct8_altivec( &dst[8*FDEC_STRIDE+8], dct[3] );
 }
+
+/* Frame-order zigzag scan of a 4x4 block of 16-bit coefficients into
+ * sixteen 32-bit levels.
+ *
+ * Treating dct as 16 consecutive int16_t values, level[0..15] receive
+ * elements 0,4,1,2,5,8,12,9,6,3,7,10,13,14,11,15 (sign-extended).
+ *
+ * vec_ld/vec_st ignore the low four address bits, so both dct and level
+ * are expected to be 16-byte aligned. */
+void x264_zigzag_scan_4x4_frame_altivec( int level[16], int16_t dct[4][4] )
+{
+    /* Byte selectors for vec_perm: 0-15 index the first source vector,
+     * 16-31 the second; each pair of bytes picks one coefficient. */
+    const vec_u8_t perm_first = (vec_u8_t) CV(0,1,8,9,2,3,4,5,10,11,16,17,24,25,18,19);
+    const vec_u8_t perm_last  = (vec_u8_t) CV(12,13,6,7,14,15,20,21,26,27,28,29,22,23,30,31);
+
+    /* The whole block fits in two vectors of eight coefficients each. */
+    vec_s16_t coef_lo = vec_ld( 0x00, (int16_t*)dct );
+    vec_s16_t coef_hi = vec_ld( 0x10, (int16_t*)dct );
+
+    /* Reorder into scan order: levels 0-7, then levels 8-15. */
+    vec_s16_t scan_first = vec_perm( coef_lo, coef_hi, perm_first );
+    vec_s16_t scan_last  = vec_perm( coef_lo, coef_hi, perm_last );
+
+    /* Widen 16-bit -> 32-bit, four coefficients per output vector. */
+    vec_s32_t out0 = vec_unpackh( scan_first );
+    vec_s32_t out1 = vec_unpackl( scan_first );
+    vec_s32_t out2 = vec_unpackh( scan_last );
+    vec_s32_t out3 = vec_unpackl( scan_last );
+
+    vec_st( out0, 0x00, level );
+    vec_st( out1, 0x10, level );
+    vec_st( out2, 0x20, level );
+    vec_st( out3, 0x30, level );
+}
+
+/* Field-order zigzag scan of a 4x4 block of 16-bit coefficients into
+ * sixteen 32-bit levels.
+ *
+ * Treating dct as 16 consecutive int16_t values, level[0..7] receive
+ * elements 0,1,4,2,3,5,6,7 and level[8..15] receive elements 8..15
+ * unchanged (all sign-extended).
+ *
+ * vec_ld/vec_st ignore the low four address bits, so both dct and level
+ * are expected to be 16-byte aligned. */
+void x264_zigzag_scan_4x4_field_altivec( int level[16], int16_t dct[4][4] )
+{
+    /* Only the first eight coefficients are reordered; every selector
+     * byte is below 16, so it reads from the first source vector only. */
+    const vec_u8_t perm_first = (vec_u8_t) CV(0,1,2,3,8,9,4,5,6,7,10,11,12,13,14,15);
+
+    vec_s16_t coef_lo = vec_ld( 0x00, (int16_t*)dct );
+    vec_s16_t coef_hi = vec_ld( 0x10, (int16_t*)dct );
+
+    vec_s16_t scan_first = vec_perm( coef_lo, coef_hi, perm_first );
+
+    /* Widen 16-bit -> 32-bit; the second half is already in scan order. */
+    vec_s32_t out0 = vec_unpackh( scan_first );
+    vec_s32_t out1 = vec_unpackl( scan_first );
+    vec_s32_t out2 = vec_unpackh( coef_hi );
+    vec_s32_t out3 = vec_unpackl( coef_hi );
+
+    vec_st( out0, 0x00, level );
+    vec_st( out1, 0x10, level );
+    vec_st( out2, 0x20, level );
+    vec_st( out3, 0x30, level );
+}
+
+/* Frame-order zigzag scan of the 15 AC coefficients of a 4x4 block into
+ * 32-bit levels; the DC coefficient (element 0) is skipped.
+ *
+ * Treating dct as 16 consecutive int16_t values, level[0..14] receive
+ * elements 4,1,2,5,8,12,9,6,3,7,10,13,14,11,15 (sign-extended).
+ *
+ * vec_ld/vec_st ignore the low four address bits, so both dct and level
+ * are expected to be 16-byte aligned. Exactly 15 ints are written. */
+void x264_zigzag_scan_4x4ac_frame_altivec( int level[15], int16_t dct[4][4] )
+{
+    vec_s16_t dct0v, dct1v;
+    vec_s16_t tmp0v, tmp1v;
+    vec_s32_t level0v, level1v, level2v, level3v;
+
+    dct0v = vec_ld(0x00, (int16_t*)dct);
+    dct1v = vec_ld(0x10, (int16_t*)dct);
+
+    /* Byte selectors for vec_perm: 0-15 index dct0v, 16-31 index dct1v.
+     * The last lane of sel1 is padding (it re-reads the DC coefficient)
+     * and is never stored. */
+    const vec_u8_t sel0 = (vec_u8_t) CV(8,9,2,3,4,5,10,11,16,17,24,25,18,19,12,13);
+    const vec_u8_t sel1 = (vec_u8_t) CV(6,7,14,15,20,21,26,27,28,29,22,23,30,31,0,1);
+
+    tmp0v = vec_perm( dct0v, dct1v, sel0 );
+    tmp1v = vec_perm( dct0v, dct1v, sel1 );
+
+    /* Widen 16-bit -> 32-bit, four coefficients per output vector. */
+    level0v = vec_unpackh( tmp0v );
+    level1v = vec_unpackl( tmp0v );
+    level2v = vec_unpackh( tmp1v );
+    level3v = vec_unpackl( tmp1v );
+
+    vec_st( level0v, 0x00, level );
+    vec_st( level1v, 0x10, level );
+    vec_st( level2v, 0x20, level );
+
+    /* Only level[12..14] exist. A full vec_st here would also write
+     * level[15], one int past the end of the array, so store the three
+     * valid lanes individually. vec_ste picks the lane from the address,
+     * so with level 16-byte aligned these write lanes 0, 1 and 2. */
+    vec_ste( level3v, 0x30, level );
+    vec_ste( level3v, 0x34, level );
+    vec_ste( level3v, 0x38, level );
+}
+
+/* Field-order zigzag scan of the 15 AC coefficients of a 4x4 block into
+ * 32-bit levels; the DC coefficient (element 0) is skipped.
+ *
+ * Treating dct as 16 consecutive int16_t values, level[0..14] receive
+ * elements 1,4,2,3,5,6,7,8,9,10,11,12,13,14,15 (sign-extended).
+ *
+ * vec_ld/vec_st ignore the low four address bits, so both dct and level
+ * are expected to be 16-byte aligned. Exactly 15 ints are written. */
+void x264_zigzag_scan_4x4ac_field_altivec( int level[15], int16_t dct[4][4] )
+{
+    vec_s16_t dct0v, dct1v;
+    vec_s16_t tmp0v, tmp1v;
+    vec_s32_t level0v, level1v, level2v, level3v;
+
+    dct0v = vec_ld(0x00, (int16_t*)dct);
+    dct1v = vec_ld(0x10, (int16_t*)dct);
+
+    /* Byte selectors for vec_perm: 0-15 index dct0v, 16-31 index dct1v.
+     * The last lane of sel1 is padding (it re-reads the DC coefficient)
+     * and is never stored. */
+    const vec_u8_t sel0 = (vec_u8_t) CV(2,3,8,9,4,5,6,7,10,11,12,13,14,15,16,17);
+    const vec_u8_t sel1 = (vec_u8_t) CV(18,19,20,21,22,23,24,25,26,27,28,29,30,31,0,1);
+
+    tmp0v = vec_perm( dct0v, dct1v, sel0 );
+    tmp1v = vec_perm( dct0v, dct1v, sel1 );
+
+    /* Widen 16-bit -> 32-bit, four coefficients per output vector. */
+    level0v = vec_unpackh( tmp0v );
+    level1v = vec_unpackl( tmp0v );
+    level2v = vec_unpackh( tmp1v );
+    level3v = vec_unpackl( tmp1v );
+
+    vec_st( level0v, 0x00, level );
+    vec_st( level1v, 0x10, level );
+    vec_st( level2v, 0x20, level );
+
+    /* Only level[12..14] exist. A full vec_st here would also write
+     * level[15], one int past the end of the array, so store the three
+     * valid lanes individually. vec_ste picks the lane from the address,
+     * so with level 16-byte aligned these write lanes 0, 1 and 2. */
+    vec_ste( level3v, 0x30, level );
+    vec_ste( level3v, 0x34, level );
+    vec_ste( level3v, 0x38, level );
+}
index 9aa60456ec06a1ebd3a0a111cd514216cc2924de..7bcde437d05d3ac0bee7264338e4879984d70f44 100644 (file)
@@ -44,4 +44,10 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8],
 void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[8][8] );
 void x264_add16x16_idct8_altivec( uint8_t *dst, int16_t dct[4][8][8] );
 
+void x264_zigzag_scan_4x4_frame_altivec( int level[16], int16_t dct[4][4] ); /* frame scan order, all 16 coefficients widened to int */
+void x264_zigzag_scan_4x4ac_frame_altivec( int level[15], int16_t dct[4][4] ); /* frame scan order, AC only: DC (dct[0][0]) is skipped */
+
+void x264_zigzag_scan_4x4_field_altivec( int level[16], int16_t dct[4][4] ); /* field scan order, all 16 coefficients widened to int */
+void x264_zigzag_scan_4x4ac_field_altivec( int level[15], int16_t dct[4][4] ); /* field scan order, AC only: DC (dct[0][0]) is skipped */
+
 #endif