From fce3cee8ddd8dda91553e4701c0a8081ff4bab52 Mon Sep 17 00:00:00 2001 From: Jim Bankoski Date: Mon, 2 May 2016 12:17:39 -0700 Subject: [PATCH] Move vpx_add_plane from codec to vpx_dsp and dedup. Change-Id: I12218d8331c0558c0587a66321e3ca46da7e5cc7 --- vp10/common/postproc.c | 29 +--------- vp10/common/vp10_rtcd_defs.pl | 7 --- vp10/common/x86/postproc_sse2.asm | 62 -------------------- vp8/common/mips/msa/postproc_msa.c | 52 +---------------- vp8/common/postproc.c | 51 +---------------- vp8/common/rtcd_defs.pl | 4 -- vp8/common/x86/postproc_mmx.asm | 62 -------------------- vp8/common/x86/postproc_sse2.asm | 62 -------------------- vp9/common/vp9_postproc.c | 30 +--------- vp9/common/vp9_rtcd_defs.pl | 7 --- vp9/common/x86/vp9_postproc_sse2.asm | 62 -------------------- vpx_dsp/mips/postproc_msa.c | 59 +++++++++++++++++++ vpx_dsp/postproc.c | 43 ++++++++++++++ vpx_dsp/vpx_dsp.mk | 7 +++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 9 +++ vpx_dsp/x86/postproc_mmx.asm | 84 ++++++++++++++++++++++++++++ vpx_dsp/x86/postproc_sse2.asm | 82 +++++++++++++++++++++++++++ 17 files changed, 291 insertions(+), 421 deletions(-) create mode 100644 vpx_dsp/mips/postproc_msa.c create mode 100644 vpx_dsp/postproc.c create mode 100644 vpx_dsp/x86/postproc_mmx.asm create mode 100644 vpx_dsp/x86/postproc_sse2.asm diff --git a/vp10/common/postproc.c b/vp10/common/postproc.c index a6ea9c0ef..e8a9f8131 100644 --- a/vp10/common/postproc.c +++ b/vp10/common/postproc.c @@ -13,6 +13,7 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "./vp10_rtcd.h" @@ -587,32 +588,6 @@ static void fillrd(struct postproc_state *state, int q, int a) { state->last_noise = a; } -void vp10_plane_add_noise_c(uint8_t *start, char *noise, - char blackclamp[16], - char whiteclamp[16], - char bothclamp[16], - unsigned int width, unsigned int height, int pitch) { - unsigned int i, j; - - // TODO(jbb): why does simd code use both but c doesn't, normalize and - // fix.. - (void) bothclamp; - for (i = 0; i < height; i++) { - uint8_t *pos = start + i * pitch; - char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT - - for (j = 0; j < width; j++) { - if (pos[j] < blackclamp[0]) - pos[j] = blackclamp[0]; - - if (pos[j] > 255 + whiteclamp[0]) - pos[j] = 255 + whiteclamp[0]; - - pos[j] += ref[j]; - } - } -} - static void swap_mi_and_prev_mi(VP10_COMMON *cm) { // Current mip will be the prev_mip for the next frame. MODE_INFO *temp = cm->postproc_state.prev_mip; @@ -727,7 +702,7 @@ int vp10_post_proc_frame(struct VP10Common *cm, fillrd(ppstate, 63 - q, noise_level); } - vp10_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp, + vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp, ppstate->whiteclamp, ppstate->bothclamp, ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride); } diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl index 9860baedf..f2414f811 100644 --- a/vp10/common/vp10_rtcd_defs.pl +++ b/vp10/common/vp10_rtcd_defs.pl @@ -70,10 +70,6 @@ add_proto qw/void vp10_post_proc_down_and_across/, "const uint8_t *src_ptr, uint specialize qw/vp10_post_proc_down_and_across sse2/; $vp10_post_proc_down_and_across_sse2=vp10_post_proc_down_and_across_xmm; -add_proto qw/void vp10_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; -specialize qw/vp10_plane_add_noise sse2/; -$vp10_plane_add_noise_sse2=vp10_plane_add_noise_wmt; - add_proto qw/void vp10_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; specialize qw/vp10_filter_by_weight16x16 sse2 msa/; @@ -326,9 +322,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp10_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"; specialize qw/vp10_highbd_post_proc_down_and_across/; - - add_proto qw/void vp10_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; - specialize qw/vp10_highbd_plane_add_noise/; } # diff --git a/vp10/common/x86/postproc_sse2.asm b/vp10/common/x86/postproc_sse2.asm index d5f8e927b..d477a65c2 100644 --- a/vp10/common/x86/postproc_sse2.asm +++ b/vp10/common/x86/postproc_sse2.asm @@ -624,68 +624,6 @@ sym(vp10_mbpost_proc_across_ip_xmm): %undef flimit4 -;void vp10_plane_add_noise_wmt (unsigned char *start, unsigned char *noise, -; unsigned char blackclamp[16], -; unsigned char whiteclamp[16], -; unsigned char bothclamp[16], -; unsigned int width, unsigned int height, int pitch) -global sym(vp10_plane_add_noise_wmt) PRIVATE -sym(vp10_plane_add_noise_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -.addnoise_loop: - call sym(LIBVPX_RAND) WRT_PLT - mov rcx, arg(1) ;noise - and rax, 0xff - add rcx, rax - - ; we rely on the fact that the clamping vectors are stored contiguously - ; in black/white/both order. Note that we have to reload this here because - ; rdx could be trashed by rand() - mov rdx, arg(2) ; blackclamp - - - mov rdi, rcx - movsxd rcx, dword arg(5) ;[Width] - mov rsi, arg(0) ;Pos - xor rax,rax - -.addnoise_nextset: - movdqu xmm1,[rsi+rax] ; get the source - - psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise - paddusb xmm1, [rdx+32] ;bothclamp - psubusb xmm1, [rdx+16] ;whiteclamp - - movdqu xmm2,[rdi+rax] ; get the noise for this line - paddb xmm1,xmm2 ; add it in - movdqu [rsi+rax],xmm1 ; store the result - - add rax,16 ; move to the next line - - cmp rax, rcx - jl .addnoise_nextset - - movsxd rax, dword arg(7) ; Pitch - add arg(0), rax ; Start += Pitch - sub dword arg(6), 1 ; Height -= 1 - jg .addnoise_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 rd42: diff --git a/vp8/common/mips/msa/postproc_msa.c b/vp8/common/mips/msa/postproc_msa.c index c88f30238..23dcde2eb 100644 --- a/vp8/common/mips/msa/postproc_msa.c +++ b/vp8/common/mips/msa/postproc_msa.c @@ -10,6 +10,7 @@ #include #include "./vp8_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vp8/common/mips/msa/vp8_macros_msa.h" static const int16_t vp8_rv_msa[] = @@ -798,54 +799,3 @@ void vp8_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, } } } - -void vp8_plane_add_noise_msa(uint8_t *start_ptr, char *noise, - char blackclamp[16], char whiteclamp[16], - char bothclamp[16], - uint32_t width, uint32_t height, - int32_t pitch) -{ - uint32_t i, j; - - for (i = 0; i < height / 2; ++i) - { - uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; - int8_t *ref0_ptr = (int8_t *) (noise + (rand() & 0xff)); - uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; - int8_t *ref1_ptr = (int8_t *) (noise + (rand() & 0xff)); - for (j = width / 16; j--;) - { - v16i8 temp00_s, temp01_s; - v16u8 temp00, temp01, black_clamp, white_clamp; - v16u8 pos0, ref0, pos1, ref1; - v16i8 const127 = __msa_ldi_b(127); - - pos0 = LD_UB(pos0_ptr); - ref0 = LD_UB(ref0_ptr); - pos1 = LD_UB(pos1_ptr); - ref1 = LD_UB(ref1_ptr); - black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); - white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); - temp00 = (pos0 < black_clamp); - pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); - temp01 = (pos1 < black_clamp); - pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); - XORI_B2_128_UB(pos0, pos1); - temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp00 = (v16u8)(temp00_s < pos0); - pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); - temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); - temp01 = (temp01_s < pos1); - pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); - XORI_B2_128_UB(pos0, pos1); - pos0 += ref0; - ST_UB(pos0, pos0_ptr); - pos1 += ref1; - ST_UB(pos1, pos1_ptr); - pos0_ptr += 16; - pos1_ptr += 16; - ref0_ptr += 16; - ref1_ptr += 16; - } - } -} diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index 322b61383..6baf00f1e 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -10,6 +10,7 @@ #include "vpx_config.h" +#include "vpx_dsp_rtcd.h" #include "vp8_rtcd.h" #include "vpx_scale_rtcd.h" #include "vpx_scale/yv12config.h" @@ -490,54 +491,6 @@ static void fillrd(struct postproc_state *state, int q, int a) state->last_noise = a; } -/**************************************************************************** - * - * ROUTINE : plane_add_noise_c - * - * INPUTS : unsigned char *Start starting address of buffer to add gaussian - * noise to - * unsigned int Width width of plane - * unsigned int Height height of plane - * int Pitch distance between subsequent lines of frame - * int q quantizer used to determine amount of noise - * to add - * - * OUTPUTS : None. - * - * RETURNS : void. - * - * FUNCTION : adds gaussian noise to a plane of pixels - * - * SPECIAL NOTES : None. - * - ****************************************************************************/ -void vp8_plane_add_noise_c(unsigned char *Start, char *noise, - char blackclamp[16], - char whiteclamp[16], - char bothclamp[16], - unsigned int Width, unsigned int Height, int Pitch) -{ - unsigned int i, j; - (void)bothclamp; - - for (i = 0; i < Height; i++) - { - unsigned char *Pos = Start + i * Pitch; - char *Ref = (char *)(noise + (rand() & 0xff)); - - for (j = 0; j < Width; j++) - { - if (Pos[j] < blackclamp[0]) - Pos[j] = blackclamp[0]; - - if (Pos[j] > 255 + whiteclamp[0]) - Pos[j] = 255 + whiteclamp[0]; - - Pos[j] += Ref[j]; - } - } -} - /* Blend the macro block with a solid colored square. Leave the * edges unblended to give distinction to macro blocks in areas * filled with the same color block. @@ -828,7 +781,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t fillrd(&oci->postproc_state, 63 - q, noise_level); } - vp8_plane_add_noise + vpx_plane_add_noise (oci->post_proc_buffer.y_buffer, oci->postproc_state.noise, oci->postproc_state.blackclamp, diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 6799c2787..b942d5bfa 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -167,10 +167,6 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { add_proto qw/void vp8_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; specialize qw/vp8_post_proc_down_and_across_mb_row sse2 msa/; - add_proto qw/void vp8_plane_add_noise/, "unsigned char *s, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int w, unsigned int h, int pitch"; - specialize qw/vp8_plane_add_noise mmx sse2 msa/; - $vp8_plane_add_noise_sse2=vp8_plane_add_noise_wmt; - add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; # no asm yet diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm index a2b16327f..1a89e7ead 100644 --- a/vp8/common/x86/postproc_mmx.asm +++ b/vp8/common/x86/postproc_mmx.asm @@ -241,68 +241,6 @@ sym(vp8_mbpost_proc_down_mmx): %undef flimit2 -;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, -; unsigned char blackclamp[16], -; unsigned char whiteclamp[16], -; unsigned char bothclamp[16], -; unsigned int Width, unsigned int Height, int Pitch) -global sym(vp8_plane_add_noise_mmx) PRIVATE -sym(vp8_plane_add_noise_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -.addnoise_loop: - call sym(LIBVPX_RAND) WRT_PLT - mov rcx, arg(1) ;noise - and rax, 0xff - add rcx, rax - - ; we rely on the fact that the clamping vectors are stored contiguously - ; in black/white/both order. Note that we have to reload this here because - ; rdx could be trashed by rand() - mov rdx, arg(2) ; blackclamp - - - mov rdi, rcx - movsxd rcx, dword arg(5) ;[Width] - mov rsi, arg(0) ;Pos - xor rax,rax - -.addnoise_nextset: - movq mm1,[rsi+rax] ; get the source - - psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise - paddusb mm1, [rdx+32] ;bothclamp - psubusb mm1, [rdx+16] ;whiteclamp - - movq mm2,[rdi+rax] ; get the noise for this line - paddb mm1,mm2 ; add it in - movq [rsi+rax],mm1 ; store the result - - add rax,8 ; move to the next line - - cmp rax, rcx - jl .addnoise_nextset - - movsxd rax, dword arg(7) ; Pitch - add arg(0), rax ; Start += Pitch - sub dword arg(6), 1 ; Height -= 1 - jg .addnoise_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 Blur: diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm index fed4ee5cc..de17afa5c 100644 --- a/vp8/common/x86/postproc_sse2.asm +++ b/vp8/common/x86/postproc_sse2.asm @@ -655,68 +655,6 @@ sym(vp8_mbpost_proc_across_ip_xmm): %undef flimit4 -;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise, -; unsigned char blackclamp[16], -; unsigned char whiteclamp[16], -; unsigned char bothclamp[16], -; unsigned int Width, unsigned int Height, int Pitch) -global sym(vp8_plane_add_noise_wmt) PRIVATE -sym(vp8_plane_add_noise_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -.addnoise_loop: - call sym(LIBVPX_RAND) WRT_PLT - mov rcx, arg(1) ;noise - and rax, 0xff - add rcx, rax - - ; we rely on the fact that the clamping vectors are stored contiguously - ; in black/white/both order. Note that we have to reload this here because - ; rdx could be trashed by rand() - mov rdx, arg(2) ; blackclamp - - - mov rdi, rcx - movsxd rcx, dword arg(5) ;[Width] - mov rsi, arg(0) ;Pos - xor rax,rax - -.addnoise_nextset: - movdqu xmm1,[rsi+rax] ; get the source - - psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise - paddusb xmm1, [rdx+32] ;bothclamp - psubusb xmm1, [rdx+16] ;whiteclamp - - movdqu xmm2,[rdi+rax] ; get the noise for this line - paddb xmm1,xmm2 ; add it in - movdqu [rsi+rax],xmm1 ; store the result - - add rax,16 ; move to the next line - - cmp rax, rcx - jl .addnoise_nextset - - movsxd rax, dword arg(7) ; Pitch - add arg(0), rax ; Start += Pitch - sub dword arg(6), 1 ; Height -= 1 - jg .addnoise_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 four8s: diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c index b685d813b..c04cc8f05 100644 --- a/vp9/common/vp9_postproc.c +++ b/vp9/common/vp9_postproc.c @@ -12,6 +12,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" #include "./vpx_scale_rtcd.h" #include "./vp9_rtcd.h" @@ -587,32 +588,6 @@ static void fillrd(struct postproc_state *state, int q, int a) { state->last_noise = a; } -void vp9_plane_add_noise_c(uint8_t *start, char *noise, - char blackclamp[16], - char whiteclamp[16], - char bothclamp[16], - unsigned int width, unsigned int height, int pitch) { - unsigned int i, j; - - // TODO(jbb): why does simd code use both but c doesn't, normalize and - // fix.. - (void) bothclamp; - for (i = 0; i < height; i++) { - uint8_t *pos = start + i * pitch; - char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT - - for (j = 0; j < width; j++) { - if (pos[j] < blackclamp[0]) - pos[j] = blackclamp[0]; - - if (pos[j] > 255 + whiteclamp[0]) - pos[j] = 255 + whiteclamp[0]; - - pos[j] += ref[j]; - } - } -} - static void swap_mi_and_prev_mi(VP9_COMMON *cm) { // Current mip will be the prev_mip for the next frame. MODE_INFO *temp = cm->postproc_state.prev_mip; @@ -726,8 +701,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, ppstate->last_noise != noise_level) { fillrd(ppstate, 63 - q, noise_level); } - - vp9_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp, + vpx_plane_add_noise(ppbuf->y_buffer, ppstate->noise, ppstate->blackclamp, ppstate->whiteclamp, ppstate->bothclamp, ppbuf->y_width, ppbuf->y_height, ppbuf->y_stride); } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 1cf636c1d..d7f5a2113 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -70,10 +70,6 @@ add_proto qw/void vp9_post_proc_down_and_across/, "const uint8_t *src_ptr, uint8 specialize qw/vp9_post_proc_down_and_across sse2/; $vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm; -add_proto qw/void vp9_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; -specialize qw/vp9_plane_add_noise sse2/; -$vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt; - add_proto qw/void vp9_filter_by_weight16x16/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight"; specialize qw/vp9_filter_by_weight16x16 sse2 msa/; @@ -169,9 +165,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_post_proc_down_and_across/, "const uint16_t *src_ptr, uint16_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit"; specialize qw/vp9_highbd_post_proc_down_and_across/; - - add_proto qw/void vp9_highbd_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; - specialize qw/vp9_highbd_plane_add_noise/; } # diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm index ec8bfdb18..430762815 100644 --- a/vp9/common/x86/vp9_postproc_sse2.asm +++ b/vp9/common/x86/vp9_postproc_sse2.asm @@ -624,68 +624,6 @@ sym(vp9_mbpost_proc_across_ip_xmm): %undef flimit4 -;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise, -; unsigned char blackclamp[16], -; unsigned char whiteclamp[16], -; unsigned char bothclamp[16], -; unsigned int width, unsigned int height, int pitch) -global sym(vp9_plane_add_noise_wmt) PRIVATE -sym(vp9_plane_add_noise_wmt): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - ; end prolog - -.addnoise_loop: - call sym(LIBVPX_RAND) WRT_PLT - mov rcx, arg(1) ;noise - and rax, 0xff - add rcx, rax - - ; we rely on the fact that the clamping vectors are stored contiguously - ; in black/white/both order. Note that we have to reload this here because - ; rdx could be trashed by rand() - mov rdx, arg(2) ; blackclamp - - - mov rdi, rcx - movsxd rcx, dword arg(5) ;[Width] - mov rsi, arg(0) ;Pos - xor rax,rax - -.addnoise_nextset: - movdqu xmm1,[rsi+rax] ; get the source - - psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise - paddusb xmm1, [rdx+32] ;bothclamp - psubusb xmm1, [rdx+16] ;whiteclamp - - movdqu xmm2,[rdi+rax] ; get the noise for this line - paddb xmm1,xmm2 ; add it in - movdqu [rsi+rax],xmm1 ; store the result - - add rax,16 ; move to the next line - - cmp rax, rcx - jl .addnoise_nextset - - movsxd rax, dword arg(7) ; Pitch - add arg(0), rax ; Start += Pitch - sub dword arg(6), 1 ; Height -= 1 - jg .addnoise_loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - SECTION_RODATA align 16 rd42: diff --git a/vpx_dsp/mips/postproc_msa.c b/vpx_dsp/mips/postproc_msa.c new file mode 100644 index 000000000..366770c0d --- /dev/null +++ b/vpx_dsp/mips/postproc_msa.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include "./macros_msa.h" + +void vpx_plane_add_noise_msa(uint8_t *start_ptr, char *noise, + char blackclamp[16], char whiteclamp[16], + char bothclamp[16], uint32_t width, + uint32_t height, int32_t pitch) { + uint32_t i, j; + + for (i = 0; i < height / 2; ++i) { + uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; + int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff)); + uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; + int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff)); + for (j = width / 16; j--;) { + v16i8 temp00_s, temp01_s; + v16u8 temp00, temp01, black_clamp, white_clamp; + v16u8 pos0, ref0, pos1, ref1; + v16i8 const127 = __msa_ldi_b(127); + + pos0 = LD_UB(pos0_ptr); + ref0 = LD_UB(ref0_ptr); + pos1 = LD_UB(pos1_ptr); + ref1 = LD_UB(ref1_ptr); + black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); + white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); + temp00 = (pos0 < black_clamp); + pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); + temp01 = (pos1 < black_clamp); + pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); + XORI_B2_128_UB(pos0, pos1); + temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); + temp00 = (v16u8)(temp00_s < pos0); + pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); + temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); + temp01 = (temp01_s < pos1); + pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); + XORI_B2_128_UB(pos0, pos1); + pos0 += ref0; + ST_UB(pos0, pos0_ptr); + pos1 += ref1; + ST_UB(pos1, pos1_ptr); + pos0_ptr += 16; + pos1_ptr += 16; + ref0_ptr += 16; + ref1_ptr += 16; + } + } +} diff --git a/vpx_dsp/postproc.c b/vpx_dsp/postproc.c new file mode 100644 index 000000000..1fa0204f4 --- /dev/null +++ b/vpx_dsp/postproc.c @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +void vpx_plane_add_noise_c(uint8_t *start, char *noise, + char blackclamp[16], + char whiteclamp[16], + char bothclamp[16], + unsigned int width, unsigned int height, int pitch) { + unsigned int i, j; + + // TODO(jbb): why does simd code use both but c doesn't, normalize and + // fix.. + (void) bothclamp; + for (i = 0; i < height; i++) { + uint8_t *pos = start + i * pitch; + char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT + + for (j = 0; j < width; j++) { + if (pos[j] < blackclamp[0]) + pos[j] = blackclamp[0]; + + if (pos[j] > 255 + whiteclamp[0]) + pos[j] = 255 + whiteclamp[0]; + + pos[j] += ref[j]; + } + } +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 581ec3a28..ef319a864 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -53,6 +53,13 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm endif # CONFIG_USE_X86INC endif # CONFIG_VP9_HIGHBITDEPTH +ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) +DSP_SRCS-yes += postproc.c +DSP_SRCS-$(HAVE_MSA) += mips/postproc_msa.c +DSP_SRCS-$(HAVE_MMX) += x86/postproc_mmx.asm +DSP_SRCS-$(HAVE_SSE2) += x86/postproc_sse2.asm +endif # CONFIG_POSTPROC + DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9ea80a098..f883ce553 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1907,6 +1907,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; } # CONFIG_VP9_HIGHBITDEPTH + +# +# Post Processing +# +if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { + add_proto qw/void vpx_plane_add_noise/, "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch"; + specialize qw/vpx_plane_add_noise mmx sse2 msa/; +} + } # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC 1; diff --git a/vpx_dsp/x86/postproc_mmx.asm b/vpx_dsp/x86/postproc_mmx.asm new file mode 100644 index 000000000..97039750d --- /dev/null +++ b/vpx_dsp/x86/postproc_mmx.asm @@ -0,0 +1,84 @@ +; +; Copyright (c) 2015 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +;void vpx_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int Width, unsigned int Height, int Pitch) +global sym(vpx_plane_add_noise_mmx) PRIVATE +sym(vpx_plane_add_noise_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movq mm1,[rsi+rax] ; get the source + + psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb mm1, [rdx+32] ;bothclamp + psubusb mm1, [rdx+16] ;whiteclamp + + movq mm2,[rdi+rax] ; get the noise for this line + paddb mm1,mm2 ; add it in + movq [rsi+rax],mm1 ; store the result + + add rax,8 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +Blur: + times 16 dw 16 + times 8 dw 64 + times 16 dw 16 + times 8 dw 0 + +rd: + times 4 dw 0x40 diff --git a/vpx_dsp/x86/postproc_sse2.asm b/vpx_dsp/x86/postproc_sse2.asm new file mode 100644 index 000000000..f4bc8932a --- /dev/null +++ b/vpx_dsp/x86/postproc_sse2.asm @@ -0,0 +1,82 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vpx_plane_add_noise_sse2(unsigned char *start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int width, unsigned int height, +; int pitch) +global sym(vpx_plane_add_noise_sse2) PRIVATE +sym(vpx_plane_add_noise_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(LIBVPX_RAND) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movdqu xmm1,[rsi+rax] ; get the source + + psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb xmm1, [rdx+32] ;bothclamp + psubusb xmm1, [rdx+16] ;whiteclamp + + movdqu xmm2,[rdi+rax] ; get the noise for this line + paddb xmm1,xmm2 ; add it in + movdqu [rsi+rax],xmm1 ; store the result + + add rax,16 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd42: + times 8 dw 0x04 +four8s: + times 4 dd 8 -- 2.49.0