granicus.if.org Git - libvpx/blob - vp9/encoder/x86/vp9_error_sse2.asm

   1 ;
   2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3 ;
   4 ;  Use of this source code is governed by a BSD-style license
   5 ;  that can be found in the LICENSE file in the root of the source
   6 ;  tree. An additional intellectual property rights grant can be found
   7 ;  in the file PATENTS.  All contributing project authors may
   8 ;  be found in the AUTHORS file in the root of the source tree.
   9 ;
  10
  11 %define private_prefix vp9
  12
  13 %include "third_party/x86inc/x86inc.asm"
  14 %include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
  15
  16 SECTION .text
  17
  18 ; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
  19 ;                         int64_t *ssz)
  20
  21 INIT_XMM sse2
  22 cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  23   pxor      m4, m4                 ; sse accumulator
  24   pxor      m6, m6                 ; ssz accumulator
  25   pxor      m5, m5                 ; dedicated zero register
  26 .loop:
  27   LOAD_TRAN_LOW 2, uqcq, 0
  28   LOAD_TRAN_LOW 0, dqcq, 0
  29   LOAD_TRAN_LOW 3, uqcq, 8
  30   LOAD_TRAN_LOW 1, dqcq, 8
  31   INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  32   INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  33   sub    sizeq, 16
  34   psubw     m0, m2
  35   psubw     m1, m3
  36   ; individual errors are max. 15bit+sign, so squares are 30bit, and
  37   ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  38   pmaddwd   m0, m0
  39   pmaddwd   m1, m1
  40   pmaddwd   m2, m2
  41   pmaddwd   m3, m3
  42   ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  43   paddd     m0, m1
  44   paddd     m2, m3
  45   ; accumulate in 64bit
  46   punpckldq m7, m0, m5
  47   punpckhdq m0, m5
  48   paddq     m4, m7
  49   punpckldq m7, m2, m5
  50   paddq     m4, m0
  51   punpckhdq m2, m5
  52   paddq     m6, m7
  53   paddq     m6, m2
  54   jg .loop
  55
  56   ; accumulate horizontally and store in return value
  57   movhlps   m5, m4
  58   movhlps   m7, m6
  59   paddq     m4, m5
  60   paddq     m6, m7
  61 %if VPX_ARCH_X86_64
  62   movq    rax, m4
  63   movq [sszq], m6
  64 %else
  65   mov     eax, sszm
  66   pshufd   m5, m4, 0x1
  67   movq  [eax], m6
  68   movd    eax, m4
  69   movd    edx, m5
  70 %endif
  71   RET
  72
  73 ; Compute the sum of squared difference between two tran_low_t vectors.
  74 ; Vectors are converted (if necessary) to int16_t for calculations.
  75 ; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
  76 ;                            intptr_t block_size)
  77
  78 INIT_XMM sse2
  79 cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
  80   pxor      m4, m4                 ; sse accumulator
  81   pxor      m5, m5                 ; dedicated zero register
  82 .loop:
  83   LOAD_TRAN_LOW 2, uqcq, 0
  84   LOAD_TRAN_LOW 0, dqcq, 0
  85   LOAD_TRAN_LOW 3, uqcq, 8
  86   LOAD_TRAN_LOW 1, dqcq, 8
  87   INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  88   INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  89   sub    sizeq, 16
  90   psubw     m0, m2
  91   psubw     m1, m3
  92   ; individual errors are max. 15bit+sign, so squares are 30bit, and
  93   ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  94   pmaddwd   m0, m0
  95   pmaddwd   m1, m1
  96   ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  97   paddd     m0, m1
  98   ; accumulate in 64bit
  99   punpckldq m3, m0, m5
 100   punpckhdq m0, m5
 101   paddq     m4, m3
 102   paddq     m4, m0
 103   jnz .loop
 104
 105   ; accumulate horizontally and store in return value
 106   movhlps   m5, m4
 107   paddq     m4, m5
 108 %if VPX_ARCH_X86_64
 109   movq    rax, m4
 110 %else
 111   pshufd   m5, m4, 0x1
 112   movd    eax, m4
 113   movd    edx, m5
 114 %endif
 115   RET