From d7422b2b1eb9f0011a8c379c2be680d6892b16bc Mon Sep 17 00:00:00 2001 From: Peter de Rivaz Date: Thu, 16 Oct 2014 14:00:54 +0100 Subject: [PATCH] Added sse2 acceleration for highbitdepth variance Change-Id: I446bdf3a405e4e9d2aa633d6281d66ea0cdfd79f --- vp9/common/vp9_rtcd_defs.pl | 216 ++-- .../x86/vp9_highbd_subpel_variance.asm | 1043 +++++++++++++++++ .../x86/vp9_highbd_variance_impl_sse2.asm | 313 +++++ vp9/encoder/x86/vp9_highbd_variance_sse2.c | 613 ++++++++++ vp9/vp9cx.mk | 3 + 5 files changed, 2080 insertions(+), 108 deletions(-) create mode 100644 vp9/encoder/x86/vp9_highbd_subpel_variance.asm create mode 100644 vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm create mode 100644 vp9/encoder/x86/vp9_highbd_variance_sse2.c diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 5e523385b..2c8fd94f5 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1283,34 +1283,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # variance add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x16/; + specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x32/; + specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x32/; + specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x64/; + specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance32x32/; + specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance64x64/; + specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x16/; + specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance16x8/; + specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x16/; + specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_variance8x8/; + specialize 
qw/vp9_highbd_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_variance8x4/; @@ -1322,40 +1322,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_variance4x4/; add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get8x8var/; + specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc"; add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_get16x16var/; + specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x16/; + specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x32/; + specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x32/; + specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x64/; + specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance32x32/; + specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance64x64/; + specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x16/; + specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance16x8/; + specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x16/; + specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_variance8x8/; + specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int 
vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_10_variance8x4/; @@ -1367,40 +1367,40 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_10_variance4x4/; add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get8x8var/; + specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc"; add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_10_get16x16var/; + specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x16/; + specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x32/; + specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x32/; + specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x64/; + specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance32x32/; + specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance64x64/; + specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x16/; + specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance16x8/; + specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x16/; + specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_variance8x8/; + specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_12_variance8x4/; @@ -1412,76 +1412,76 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_12_variance4x4/; add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get8x8var/; + specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc"; add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vp9_highbd_12_get16x16var/; + specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance64x64/; + specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/; + specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x64/; + specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/; + specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance64x32/; + specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/; + specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x16/; + specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/; + specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x32/; + specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/; + specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance32x32/; + specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/; + specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x16/; + specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/; + specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x16/; + specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/; + specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance16x8/; + specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/; + specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x8/; + specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc"; 
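These specialize lines attach the new SSE2 implementations (flagged with "$sse2_x86inc" so they are only picked up when the x86inc-style assembly can be built) to the corresponding prototypes; the rtcd script then turns each prototype into a dispatch symbol that falls back to the C version when SSE2 is unavailable. A rough sketch of the kind of dispatch this produces, with hypothetical names rather than the actual generated header:

/* Illustrative RTCD-style dispatch sketch (names are hypothetical; the real
 * header is generated from vp9_rtcd_defs.pl at build time). */
#include <stdint.h>

unsigned int vp9_highbd_variance8x8_c(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse);
unsigned int vp9_highbd_variance8x8_sse2(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride,
                                         unsigned int *sse);

/* Function pointer selected once, based on runtime CPU feature detection. */
static unsigned int (*vp9_highbd_variance8x8)(const uint8_t *, int,
                                              const uint8_t *, int,
                                              unsigned int *);

static void setup_dispatch(int have_sse2) {
  vp9_highbd_variance8x8 = vp9_highbd_variance8x8_c;      /* default */
  if (have_sse2) vp9_highbd_variance8x8 = vp9_highbd_variance8x8_sse2;
}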
add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/; + specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_sub_pixel_variance8x4/; + specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/; + specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_sub_pixel_variance4x8/; @@ -1496,70 +1496,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance64x64/; + specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x64/; + specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance64x32/; + specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, 
int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x16/; + specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x32/; + specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance32x32/; + specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance16x16/; + specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x16/; + specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize 
qw/vp9_highbd_10_sub_pixel_variance16x8/; + specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x8/; + specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_sub_pixel_variance8x4/; + specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/; + specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_10_sub_pixel_variance4x8/; @@ -1574,70 +1574,70 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance64x64/; + specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x64/; + specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/; + specialize 
qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance64x32/; + specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x16/; + specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x32/; + specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance32x32/; + specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x16/; + specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc"; add_proto qw/unsigned int 
vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x16/; + specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance16x8/; + specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x8/; + specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_sub_pixel_variance8x4/; + specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; - specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/; + specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_highbd_12_sub_pixel_variance4x8/; @@ -1817,7 +1817,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_sad4x4x4d sse2/; add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse16x16/; + specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_mse8x16/; @@ -1826,10 +1826,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") 
eq "yes") { specialize qw/vp9_highbd_mse16x8/; add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_mse8x8/; + specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse16x16/; + specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_10_mse8x16/; @@ -1838,10 +1838,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_10_mse16x8/; add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_10_mse8x8/; + specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse16x16/; + specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vp9_highbd_12_mse8x16/; @@ -1850,7 +1850,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_12_mse16x8/; add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vp9_highbd_12_mse8x8/; + specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc"; # ENCODEMB INVOKE diff --git a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm new file mode 100644 index 000000000..aebe63b74 --- /dev/null +++ b/vp9/encoder/x86/vp9_highbd_subpel_variance.asm @@ -0,0 +1,1043 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 15 + times 8 dw 1 + times 8 dw 14 + times 8 dw 2 + times 8 dw 13 + times 8 dw 3 + times 8 dw 12 + times 8 dw 4 + times 8 dw 11 + times 8 dw 5 + times 8 dw 10 + times 8 dw 6 + times 8 dw 9 + times 8 dw 7 + times 16 dw 8 + times 8 dw 7 + times 8 dw 9 + times 8 dw 6 + times 8 dw 10 + times 8 dw 5 + times 8 dw 11 + times 8 dw 4 + times 8 dw 12 + times 8 dw 3 + times 8 dw 13 + times 8 dw 2 + times 8 dw 14 + times 8 dw 1 + times 8 dw 15 + +SECTION .text + +; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. 
+ +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + mova %4, %3 ; make copies to manipulate to calc sum + mova %2, %1 ; use originals for calc sse + pmaddwd %3, %3 + paddw %4, %2 + pmaddwd %1, %1 + movhlps %2, %4 + paddd %6, %3 + paddw %4, %2 + pxor %2, %2 + pcmpgtw %2, %4 ; mask for 0 > %4 (sum) + punpcklwd %4, %2 ; sign-extend word to dword + paddd %6, %1 + paddd %5, %4 + +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + movhlps m3, m7 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + pshufd m4, m6, 0x1 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + movd [r1], m7 ; store sse + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + lea srcq, [srcq + src_stridemp*2] +%else + lea srcq, [srcq + src_strideq*2] +%endif +%endmacro + +%macro INC_SRC_BY_SRC_2STRIDE 0 +%if ARCH_X86=1 && CONFIG_PIC=1 + lea srcq, [srcq + src_stridemp*4] +%else + lea srcq, [srcq + src_strideq*4] +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 + + +%ifdef PIC ; 64bit PIC + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ + y_offset, dst, dst_stride, height, sse + %endif + %define h heightd + %define bilin_filter sseq +%else + %if ARCH_X86=1 && CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse, g_bilin_filter, g_pw_8 + %define h dword heightm + %define sec_str sec_stridemp + + ; Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, \ + sse, g_bilin_filter, g_pw_8 + %define h heightd + + ; Store bilin_filter and pw_8 location in stack + GET_GOT eax + add esp, 4 ; restore esp + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %endif + %else + %if %2 == 1 ; avg + cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse + %if ARCH_X86_64 + %define h heightd + %define sec_str sec_strideq + %else + %define h dword heightm + %define sec_str sec_stridemp + %endif + %else + cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, height, sse + %define h heightd + %endif + + %define bilin_filter bilin_filter_m + %endif +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + +%if %1 < 16 + sar h, 1 +%endif + + ; FIXME(rbultje) replace by 
jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq + 16] + mova m1, [dstq] + mova m3, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+16] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq + src_strideq*2] + mova m1, [dstq] + mova m3, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq + sec_str*2] +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pavgw m0, m1 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+16] + mova m2, [dstq] + mova m3, [dstq+16] + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. 
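The single-multiply form alluded to in the comment above follows from (num-x)*in1 + x*in2 = num*in1 + x*(in2-in1), so the output can also be computed as in1 + ((x*(in2-in1) + rnd) >> log2(num)), trading the second pmullw for a subtract. A small check of that identity at the 4-bit filter precision used here (a sketch; it assumes arithmetic right shift of negative values, as on typical x86 compilers):

/* Verifies the one-multiply bilinear form against the two-multiply form. */
#include <assert.h>

#define NUM 16          /* filter precision: psrlw by 4 in the code above */
#define RND (NUM / 2)

static int bilin_two_mul(int in1, int in2, int x) {
  return ((NUM - x) * in1 + x * in2 + RND) >> 4;
}

static int bilin_one_mul(int in1, int in2, int x) {
  /* (NUM-x)*in1 + x*in2 == NUM*in1 + x*(in2-in1), and NUM*in1 is an exact
   * multiple of 16, so the shift distributes: */
  return in1 + ((x * (in2 - in1) + RND) >> 4);
}

int main(void) {
  for (int x = 0; x < NUM; ++x)
    for (int in1 = 0; in1 < 1024; in1 += 7)    /* sample the 10-bit range */
      for (int in2 = 0; in2 < 1024; in2 += 7)
        assert(bilin_two_mul(in1, in2, x) == bilin_one_mul(in1, in2, x));
  return 0;
}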
+ pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*4] + mova m4, m1 + mova m2, [dstq] + mova m3, [dstq+dst_strideq*2] + pmullw m1, filter_y_a + pmullw m5, filter_y_b + paddw m1, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m1, m5 + paddw m0, m4 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + mova m2, [dstq] + mova m3, [dstq + 16] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + mova m2, [dstq] + mova m3, [dstq + dst_strideq*2] + pavgw m0, m4 + pavgw m1, m5 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + 16] + movu m4, [srcq + 2] + movu m5, [srcq + 18] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m1, m3 + mova m4, [dstq] + mova m5, [dstq + 16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_half_loop: + movu m2, [srcq] + movu m3, [srcq + src_strideq*2] + movu m4, [srcq + 2] + movu m5, [srcq + src_strideq*2 + 2] + pavgw m2, m4 + pavgw m3, m5 + pavgw m0, m2 + pavgw m2, m3 + mova m4, [dstq] + mova m5, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+sec_str*2] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq 
+ sec_str*4] +%endif +%endif + dec h + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] + mova m9, [bilin_filter+y_offsetq+16] + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86_32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 + pavgw m1, m3 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m1, filter_rnd + paddw m1, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m1, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + lea srcq, [srcq + src_strideq*2] + lea dstq, [dstq + dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + lea srcq, [srcq + src_strideq*2] + pavgw m0, m2 +.x_half_y_other_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pavgw m2, m4 + pavgw m3, m5 + mova m4, m2 + mova m5, m3 + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m4, filter_rnd + paddw m4, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + psrlw m4, 4 + paddw m0, m2 + mova m2, [dstq] + psrlw m0, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m4, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + lea srcq, [srcq + src_strideq*4] + lea dstq, [dstq + dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0. We can reuse y_offset reg. 
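The x_offset and y_offset phases index the bilin_filter_m_sse2 table declared at the top of this file: each of the 16 phases stores eight copies of (16 - phase) followed by eight copies of phase, 32 bytes per entry, which is why the offsets are shifted left by filter_idx_shift (5) before being added to the table base. A small sketch of the equivalent lookup, with hypothetical C names:

/* Sketch of the bilin_filter_m_sse2 layout and indexing (illustrative). */
#include <stdint.h>

typedef struct {
  int16_t a[8];  /* replicated (16 - phase): weight of the left/top tap     */
  int16_t b[8];  /* replicated phase:        weight of the right/bottom tap */
} bilin_entry;   /* 32 bytes, matching each "times 8 dw" pair in the table  */

static const bilin_entry *select_filter(const bilin_entry *table, int phase) {
  /* phase is the 1/16th-pel offset in [0, 15]; the asm computes
   * table + (phase << 5), which is the same element access as: */
  return &table[phase];
}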
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + mova m4, [dstq] + mova m5, [dstq+16] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m1, [srcq+src_strideq*2] + movu m2, [srcq+2] + movu m3, [srcq+src_strideq*2+2] + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m1, m3 + paddw m0, m2 + psrlw m1, 4 + psrlw m0, 4 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+sec_str*2] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
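For the half-pel offsets the code avoids the multiply entirely and uses pavgw, which averages unsigned 16-bit lanes with rounding; a scalar equivalent of that vertical half-pel step, applied to the horizontally filtered rows before they are fed to SUM_SSE (illustrative):

/* pavgw computes (a + b + 1) >> 1 per unsigned 16-bit lane. */
#include <stdint.h>

static uint16_t avg_round(uint16_t a, uint16_t b) {
  return (uint16_t)(((uint32_t)a + b + 1) >> 1);
}

/* Average the current filtered row with the previous one (y_offset == 0.5). */
static void half_pel_rows(const uint16_t *prev, const uint16_t *cur,
                          uint16_t *out, int w) {
  for (int i = 0; i < w; ++i) out[i] = avg_round(prev[i], cur[i]);
}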
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+16] + movu m2, [srcq+2] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+16] + movu m4, [srcq+2] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+16] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m1, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m4, m1, m5, m6, m7 + mova m0, m2 + mova m1, m3 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + lea srcq, [srcq+src_strideq*2] +.x_other_y_half_loop: + movu m2, [srcq] + movu m3, [srcq+src_strideq*2] + movu m4, [srcq+2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + mova m4, [dstq] + mova m5, [dstq+dst_strideq*2] + psrlw m2, 4 + psrlw m3, 4 + pavgw m0, m2 + pavgw m2, m3 +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m2, [secq+sec_str*2] +%endif + SUM_SSE m0, m4, m2, m5, m6, m7 + mova m0, m3 + + lea srcq, [srcq+src_strideq*4] + lea dstq, [dstq+dst_strideq*4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +; loading filter - this is same as in 8-bit depth +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] + mova m9, [bilin_filter+x_offsetq+16] + mova m10, [bilin_filter+y_offsetq] + mova m11, [bilin_filter+y_offsetq+16] + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
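The general case that follows is a separable two-pass filter: each source row is first filtered horizontally with the x-phase weights, and the vertical filter is then applied between consecutive horizontally filtered rows, with SUM_SSE consuming the result against the reference. A scalar sketch of that structure (illustrative; w is at most 16 here, and the same 4-bit precision and rounding as above are assumed):

/* Two-pass bilinear reference: horizontal first pass, vertical second pass. */
#include <stdint.h>

#define BIL_SHIFT 4
#define BIL_RND   8

static void subpel_2d_ref(const uint16_t *src, int src_stride, int xo, int yo,
                          uint16_t *out, int out_stride, int w, int h) {
  uint16_t prev[16], cur[16];  /* one extra filtered row for the vertical pass */
  for (int c = 0; c < w; ++c)
    prev[c] = (uint16_t)(((16 - xo) * src[c] + xo * src[c + 1] + BIL_RND)
                         >> BIL_SHIFT);
  for (int r = 0; r < h; ++r) {
    const uint16_t *line = src + (r + 1) * src_stride;
    for (int c = 0; c < w; ++c)          /* horizontal pass on the next row */
      cur[c] = (uint16_t)(((16 - xo) * line[c] + xo * line[c + 1] + BIL_RND)
                          >> BIL_SHIFT);
    for (int c = 0; c < w; ++c)          /* vertical pass between the rows  */
      out[r * out_stride + c] =
          (uint16_t)(((16 - yo) * prev[c] + yo * cur[c] + BIL_RND) >> BIL_SHIFT);
    for (int c = 0; c < w; ++c) prev[c] = cur[c];
  }
}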
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%endif +; end of load filter + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m2, [srcq+2] + movu m1, [srcq+16] + movu m3, [srcq+18] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + pmullw m1, filter_x_a + pmullw m3, filter_x_b + paddw m1, filter_rnd + paddw m0, m2 + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+16] + movu m5, [srcq+18] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m1, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m1, filter_rnd + mova m2, [dstq] + paddw m1, m3 + psrlw m0, 4 + psrlw m1, 4 + mova m3, [dstq+16] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m1, [secq+16] +%endif + SUM_SSE m0, m2, m1, m3, m6, m7 + mova m0, m4 + mova m1, m5 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq + dst_strideq * 2] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*2] +%endif +%else ; %1 < 16 + movu m0, [srcq] + movu m2, [srcq+2] + pmullw m0, filter_x_a + pmullw m2, filter_x_b + paddw m0, filter_rnd + paddw m0, m2 + psrlw m0, 4 + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movu m2, [srcq] + movu m4, [srcq+2] + movu m3, [srcq+src_strideq*2] + movu m5, [srcq+src_strideq*2+2] + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m2, filter_rnd + pmullw m3, filter_x_a + pmullw m5, filter_x_b + paddw m3, filter_rnd + paddw m2, m4 + paddw m3, m5 + psrlw m2, 4 + psrlw m3, 4 + mova m4, m2 + mova m5, m3 + pmullw m0, filter_y_a + pmullw m2, filter_y_b + paddw m0, filter_rnd + pmullw m4, filter_y_a + pmullw m3, filter_y_b + paddw m0, m2 + paddw m4, filter_rnd + mova m2, [dstq] + paddw m4, m3 + psrlw m0, 4 + psrlw m4, 4 + mova m3, [dstq+dst_strideq*2] +%if %2 == 1 ; avg + pavgw m0, [secq] + pavgw m4, [secq+sec_str*2] +%endif + SUM_SSE m0, m2, m4, m3, m6, m7 + mova m0, m5 + + INC_SRC_BY_SRC_2STRIDE + lea dstq, [dstq + dst_strideq * 4] +%if %2 == 1 ; avg + lea secq, [secq + sec_str*4] +%endif +%endif + dec h + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm new file mode 100644 index 000000000..821dd0660 --- /dev/null +++ b/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm @@ -0,0 +1,313 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;unsigned int vp9_highbd_calc16x16var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vp9_highbd_calc16x16var_sse2) PRIVATE +sym(vp9_highbd_calc16x16var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+16] + prefetcht0 [rsi+rax] + prefetcht0 [rsi+rax+16] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + + prefetcht0 [rdi] + prefetcht0 [rdi+16] + prefetcht0 [rdi+rdx] + prefetcht0 [rdi+rdx+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 16 + +.var16loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rax] + prefetcht0 [rbx+rax+16] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+16] + prefetcht0 [rbx+rdx] + prefetcht0 [rbx+rdx+16] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax+16] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx+16] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 2 + jnz .var16loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp9_highbd_calc8x8var_sse2 +;( +; unsigned char * src_ptr, +; int source_stride, +; unsigned char * ref_ptr, +; int recon_stride, +; unsigned int * SSE, +; int * Sum +;) +global sym(vp9_highbd_calc8x8var_sse2) PRIVATE +sym(vp9_highbd_calc8x8var_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rbx + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) 
;[src_ptr] + mov rdi, arg(2) ;[ref_ptr] + + movsxd rax, DWORD PTR arg(1) ;[source_stride] + movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + add rax, rax ; source stride in bytes + add rdx, rdx ; recon stride in bytes + + ; Prefetch data + prefetcht0 [rsi] + prefetcht0 [rsi+rax] + lea rbx, [rsi+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + + prefetcht0 [rdi] + prefetcht0 [rdi+rdx] + lea rbx, [rdi+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm0, xmm0 ; clear xmm0 for unpack + pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs + + pxor xmm6, xmm6 ; clear xmm6 for accumulating sse + mov rcx, 8 + +.var8loop: + movdqu xmm1, XMMWORD PTR [rsi] + movdqu xmm2, XMMWORD PTR [rdi] + + lea rbx, [rsi+rax*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rbx+rax*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rax] + lea rbx, [rdi+rdx*4] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + lea rbx, [rbx+rdx*2] + prefetcht0 [rbx] + prefetcht0 [rbx+rdx] + + pxor xmm5, xmm5 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + + psubw xmm3, xmm2 + movdqu xmm1, XMMWORD PTR [rsi] + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + movdqu xmm2, XMMWORD PTR [rdi] + paddd xmm6, xmm3 + + psubw xmm1, xmm2 + movdqu xmm3, XMMWORD PTR [rsi+rax] + paddw xmm5, xmm1 + pmaddwd xmm1, xmm1 + movdqu xmm2, XMMWORD PTR [rdi+rdx] + paddd xmm6, xmm1 + + psubw xmm3, xmm2 + paddw xmm5, xmm3 + pmaddwd xmm3, xmm3 + paddd xmm6, xmm3 + + movdqa xmm1, xmm5 + movdqa xmm2, xmm5 + pcmpgtw xmm1, xmm0 + pcmpeqw xmm2, xmm0 + por xmm1, xmm2 + pcmpeqw xmm1, xmm0 + movdqa xmm2, xmm5 + punpcklwd xmm5, xmm1 + punpckhwd xmm2, xmm1 + paddd xmm7, xmm5 + paddd xmm7, xmm2 + + lea rsi, [rsi + 2*rax] + lea rdi, [rdi + 2*rdx] + sub rcx, 4 + jnz .var8loop + + movdqa xmm4, xmm6 + punpckldq xmm6, xmm0 + + punpckhdq xmm4, xmm0 + movdqa xmm5, xmm7 + + paddd xmm6, xmm4 + punpckldq xmm7, xmm0 + + punpckhdq xmm5, xmm0 + paddd xmm7, xmm5 + + movdqa xmm4, xmm6 + movdqa xmm5, xmm7 + + psrldq xmm4, 8 + psrldq xmm5, 8 + + paddd xmm6, xmm4 + paddd xmm7, xmm5 + + mov rdi, arg(4) ; [SSE] + mov rax, arg(5) ; [Sum] + + movd DWORD PTR [rdi], xmm6 + movd DWORD PTR [rax], xmm7 + + ; begin epilog + pop rdi + pop rsi + pop rbx + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c new file mode 100644 index 000000000..58c40860a --- /dev/null +++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" + +#include "vp9/encoder/vp9_variance.h" +#include "vpx_ports/mem.h" + +typedef unsigned int (*high_variance_fn_t) (const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +unsigned int vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + unsigned int *sse, int *sum); + +static void highbd_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + + *sse = 0; + *sum = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + *sse += sse0; + *sum += sum0; + } + } +} + +static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 2); + *sse = ROUND_POWER_OF_TWO(sse_long, 4); +} + +static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int w, int h, unsigned int *sse, int *sum, + high_variance_fn_t var_fn, int block_size) { + int i, j; + uint64_t sse_long = 0; + int64_t sum_long = 0; + + for (i = 0; i < h; i += block_size) { + for (j = 0; j < w; j += block_size) { + unsigned int sse0; + int sum0; + var_fn(src + src_stride * i + j, src_stride, + ref + ref_stride * i + j, ref_stride, &sse0, &sum0); + sse_long += sse0; + sum_long += sum0; + } + } + *sum = ROUND_POWER_OF_TWO(sum_long, 4); + *sse = ROUND_POWER_OF_TWO(sse_long, 8); +} + + +#define HIGH_GET_VAR(S) \ +void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + unsigned int *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ +} \ +\ +void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + unsigned int *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 2); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ +} \ +\ +void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + unsigned int *sse, int *sum) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \ + sse, sum); \ + *sum = ROUND_POWER_OF_TWO(*sum, 4); \ + *sse = 
ROUND_POWER_OF_TWO(*sse, 8); \ +} + +HIGH_GET_VAR(16); +HIGH_GET_VAR(8); + +#undef HIGH_GET_VAR + +#define VAR_FN(w, h, block_size, shift) \ +unsigned int vp9_highbd_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vp9_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +unsigned int vp9_highbd_10_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vp9_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} \ +unsigned int vp9_highbd_12_variance##w##x##h##_sse2( \ + const uint8_t *src8, int src_stride, \ + const uint8_t *ref8, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + vp9_highbd_calc##block_size##x##block_size##var_sse2, \ + block_size); \ + return *sse - (((int64_t)sum * sum) >> shift); \ +} + +VAR_FN(64, 64, 16, 12); +VAR_FN(64, 32, 16, 11); +VAR_FN(32, 64, 16, 11); +VAR_FN(32, 32, 16, 10); +VAR_FN(32, 16, 16, 9); +VAR_FN(16, 32, 16, 9); +VAR_FN(16, 16, 16, 8); +VAR_FN(16, 8, 8, 7); +VAR_FN(8, 16, 8, 7); +VAR_FN(8, 8, 8, 6); + +#undef VAR_FN + +unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vp9_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vp9_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, + sse, &sum, vp9_highbd_calc16x16var_sse2, 16); + return *sse; +} + +unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vp9_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = 
CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vp9_highbd_calc8x8var_sse2, 8); + return *sse; +} + +unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, + sse, &sum, vp9_highbd_calc8x8var_sse2, 8); + return *sse; +} + +#define DECL(w, opt) \ +int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint16_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse); +#define DECLS(opt1, opt2) \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +// DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int \ + vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst8, \ + int dst_stride, \ + unsigned int *sse_ptr) { \ + unsigned int sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, h, \ + &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ + src_stride, \ + x_offset, y_offset, \ + dst + 16, \ + dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +unsigned int vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst8, \ + int dst_stride, \ + unsigned int *sse_ptr) { \ + unsigned int sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \ + src_stride, \ + x_offset, y_offset, \ + dst + 16, \ + dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +unsigned int vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ + const uint8_t *src8, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst8, \ + int dst_stride, \ + unsigned int *sse_ptr) 
{ \ + int start_row; \ + unsigned int sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + for (start_row = 0; start_row < h; start_row +=16) { \ + unsigned int sse2; \ + int height = h - start_row < 16 ? h - start_row : 16; \ + int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + 16 + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + 32 + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, dst + 48 + (start_row * dst_stride), \ + dst_stride, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + }\ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} + +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ +FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ +FN(8, 16, 8, 3, 4, opt1, (int64_t)); \ +FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ +FN(8, 4, 8, 3, 2, opt1, (int64_t)); + + +FNS(sse2, sse); + +#undef FNS +#undef FN + + +#define DECL(w, opt) \ +int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint16_t *dst, \ + ptrdiff_t dst_stride, \ + const uint16_t *sec, \ + ptrdiff_t sec_stride, \ + int height, \ + unsigned int *sse); +#define DECLS(opt1) \ +DECL(16, opt1) \ +DECL(8, opt1) + +DECLS(sse2); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst8, \ + int dst_stride, \ + unsigned int *sse_ptr, \ + const uint8_t *sec8) { \ + unsigned int sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, \ 
+ x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +unsigned int vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst8, \ + int dst_stride, \ + unsigned int *sse_ptr, \ + const uint8_t *sec8) { \ + unsigned int sse; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 2); \ + sse = ROUND_POWER_OF_TWO(sse, 4); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} \ +unsigned int vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src8, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst8, \ + int dst_stride, \ + unsigned int *sse_ptr, \ + const uint8_t *sec8) { \ + int start_row; \ + unsigned int sse; \ + int se = 0; \ + uint64_t long_sse = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ + for (start_row = 0; start_row < h; start_row +=16) { \ + unsigned int sse2; \ + int height = h - start_row < 16 ? 
h - start_row : 16; \ + int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + (start_row * src_stride), src_stride, x_offset, \ + y_offset, dst + (start_row * dst_stride), dst_stride, \ + sec + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf) { \ + se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 16 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, \ + dst + 16 + (start_row * dst_stride), dst_stride, \ + sec + 16 + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 32 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, \ + dst + 32 + (start_row * dst_stride), dst_stride, \ + sec + 32 + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ + src + 48 + (start_row * src_stride), src_stride, \ + x_offset, y_offset, \ + dst + 48 + (start_row * dst_stride), dst_stride, \ + sec + 48 + (start_row * w), w, height, &sse2); \ + se += se2; \ + long_sse += sse2; \ + } \ + } \ + } \ + se = ROUND_POWER_OF_TWO(se, 4); \ + sse = ROUND_POWER_OF_TWO(long_sse, 8); \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ +} + + +#define FNS(opt1) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (int64_t)); \ +FN(16, 8, 16, 4, 3, opt1, (int64_t)); \ +FN(8, 16, 8, 4, 3, opt1, (int64_t)); \ +FN(8, 8, 8, 3, 3, opt1, (int64_t)); \ +FN(8, 4, 8, 3, 2, opt1, (int64_t)); + +FNS(sse2); + +#undef FNS +#undef FN diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 362a7db9a..00b0edf64 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -104,6 +104,7 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad4d_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm endif ifeq ($(CONFIG_USE_X86INC),yes) @@ -115,6 +116,8 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_sad_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm endif endif -- 2.40.0
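
A few reference sketches of what the new kernels compute follow; all helper names below are illustrative and are not symbols from the patch.

For the first-pass filtering in vp9_highbd_subpel_variance.asm: when x_offset is neither 0 nor half-pel, the loops multiply each pixel and its right neighbour by a pair of bilinear taps that sum to 16, add the pw_8 rounding constant and shift right by 4. A scalar sketch of that step:

#include <stdint.h>

/* One row of the first-pass horizontal bilinear filter: taps fa + fb == 16,
 * so adding 8 and shifting right by 4 rounds to nearest.  The result still
 * fits in 16 bits for samples of up to 12-bit depth. */
static void highbd_bilinear_row(const uint16_t *src, uint16_t *out, int w,
                                int fa, int fb) {
  int j;
  for (j = 0; j < w; ++j)
    out[j] = (uint16_t)((src[j] * fa + src[j + 1] * fb + 8) >> 4);
}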
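
In the x_nonhalf_y_nonhalf case the same rounding is applied twice: a horizontal pass over two consecutive source rows, then a vertical pass between the two filtered rows. The assembly keeps the previously filtered row in registers so each source row is filtered horizontally only once. Per output sample this is roughly:

#include <stdint.h>

/* Two-pass bilinear interpolation of one sample, mirroring the
 * pmullw/paddw/psrlw sequence in the .x_other_y_other loops. */
static uint16_t highbd_bilin2d(const uint16_t *src, int stride,
                               int fxa, int fxb, int fya, int fyb) {
  const int top = (src[0] * fxa + src[1] * fxb + 8) >> 4;
  const int bot = (src[stride] * fxa + src[stride + 1] * fxb + 8) >> 4;
  return (uint16_t)((top * fya + bot * fyb + 8) >> 4);
}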
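
vp9_highbd_calc16x16var_sse2 and vp9_highbd_calc8x8var_sse2 accumulate the sum of differences and the sum of squared differences over one block of 16-bit samples; the pcmpgtw/pcmpeqw/por/punpcklwd sequence is just sign-extending the packed 16-bit row sums to 32 bits before adding them to xmm7. A scalar model of what they store through the SSE and Sum pointers:

#include <stdint.h>

/* Scalar equivalent of the calc{8x8,16x16}var kernels: *sum may be
 * negative, *sse is non-negative and fits in 32 bits for blocks up to
 * 16x16 at 12-bit depth. */
static void highbd_calc_var(const uint16_t *src, int src_stride,
                            const uint16_t *ref, int ref_stride, int size,
                            unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < size; ++i) {
    for (j = 0; j < size; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += d;
      *sse += (unsigned int)(d * d);
    }
  }
}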
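
The 10- and 12-bit wrappers rescale the raw statistics back to the 8-bit range before returning them: a (bd-8)-bit increase in sample precision grows the sum by up to 2^(bd-8) and the squared sum by up to 2^(2*(bd-8)), which is why the 10-bit paths shift sum by 2 and sse by 4 while the 12-bit paths shift by 4 and 8. Assuming the usual libvpx definition of ROUND_POWER_OF_TWO, the scaling amounts to:

#include <stdint.h>

/* Rounding right shift: adds half of 2^n before shifting (n > 0). */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* Bring a high-bit-depth sum/SSE pair back to the 8-bit scale.
 * bd is 10 or 12 here; the 8-bit path returns the raw values unscaled. */
static void scale_to_8bit(int bd, int64_t sum_long, uint64_t sse_long,
                          int *sum, unsigned int *sse) {
  const int sse_shift = 2 * (bd - 8);  /* 4 for 10-bit, 8 for 12-bit */
  *sum = (int)ROUND_POWER_OF_TWO(sum_long, sse_shift / 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, sse_shift);
}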
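
All of the variance wrappers then return SSE minus the squared sum divided by the number of pixels, with the divide done as a shift by log2(w*h); the cast to int64_t before squaring matters because for a 64x64 12-bit block the absolute sum can approach 64*64*4095, whose square does not fit in 32 bits. In isolation:

#include <stdint.h>

/* N * variance, with N = w * h a power of two and shift = log2(N). */
static unsigned int block_variance(unsigned int sse, int sum, int shift) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> shift);
}

For example, vp9_highbd_variance64x64_sse2 is instantiated with shift = 12 via VAR_FN(64, 64, 16, 12).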
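
Finally, the FN()/VAR_FN() wrappers never hand the assembly more than a 16-pixel-wide column at a time (hence the src + 16 / + 32 / + 48 calls for 32- and 64-wide blocks), and the 12-bit sub-pel variants additionally cap each call at 16 rows: 16*16 squared 12-bit differences stay just below 2^32 (256 * 4095 * 4095 = 4292870400), so the kernel's 32-bit SSE accumulator cannot wrap, and the per-strip results are added into a 64-bit total. A structural sketch, with an illustrative kernel callback standing in for the asm routine:

#include <stdint.h>

/* Illustrative kernel type: returns the signed error sum for one column
 * (at most 16 wide) starting at (x, y) and writes that column's 32-bit SSE. */
typedef int (*col_kernel_t)(int x, int y, int rows, unsigned int *sse);

static void visit_wxh(int w, int h, int bd, col_kernel_t kernel,
                      int64_t *se_total, uint64_t *sse_total) {
  const int max_rows = (bd == 12) ? 16 : h;  /* 12-bit: 16-row strips */
  int x, y;
  *se_total = 0;
  *sse_total = 0;
  for (y = 0; y < h; y += max_rows) {
    const int rows = (h - y < max_rows) ? h - y : max_rows;
    for (x = 0; x < w; x += 16) {            /* columns at +0, +16, +32, +48 */
      unsigned int sse;
      *se_total += kernel(x, y, rows, &sse);
      *sse_total += sse;
    }
  }
}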