From 84ae235450418f859536e878535b782ae4dccfee Mon Sep 17 00:00:00 2001 From: Aaron Watry Date: Mon, 18 Apr 2011 13:20:26 -0400 Subject: [PATCH] Initial OpenCL implementation of the VP8 decoder. Change-Id: I74c334af09f13473ce07bbac74b0f9ea57573347 Note: very slow, but functional. Encoder is untested, but should still work. --- .gitignore | 6 + build/make/configure.sh | 32 + configure | 7 + libs.mk | 12 + vp8/common/alloccommon.c | 12 +- vp8/common/blockd.h | 91 ++- vp8/common/filter.c | 100 ++- vp8/common/generic/systemdependent.c | 5 + vp8/common/idct.h | 4 + vp8/common/loopfilter.c | 14 +- vp8/common/loopfilter_filters.c | 2 +- vp8/common/mbpitch.c | 138 +++- vp8/common/onyxc_int.h | 1 - vp8/common/opencl/blockd_cl.c | 233 ++++++ vp8/common/opencl/blockd_cl.h | 64 ++ vp8/common/opencl/dynamic_cl.c | 106 +++ vp8/common/opencl/dynamic_cl.h | 253 ++++++ vp8/common/opencl/filter_cl.c | 824 ++++++++++++++++++++ vp8/common/opencl/filter_cl.cl | 562 +++++++++++++ vp8/common/opencl/filter_cl.h | 74 ++ vp8/common/opencl/idct_cl.h | 45 ++ vp8/common/opencl/idctllm_cl.c | 325 ++++++++ vp8/common/opencl/idctllm_cl.cl | 309 ++++++++ vp8/common/opencl/idctllm_cl.h | 26 + vp8/common/opencl/loopfilter.cl | 427 ++++++++++ vp8/common/opencl/loopfilter_cl.c | 457 +++++++++++ vp8/common/opencl/loopfilter_cl.h | 48 ++ vp8/common/opencl/loopfilter_filters_cl.c | 187 +++++ vp8/common/opencl/opencl_systemdependent.c | 41 + vp8/common/opencl/reconinter_cl.c | 641 +++++++++++++++ vp8/common/opencl/reconinter_cl.h | 25 + vp8/common/opencl/subpixel_cl.h | 46 ++ vp8/common/opencl/vp8_opencl.c | 342 ++++++++ vp8/common/opencl/vp8_opencl.h | 192 +++++ vp8/common/quant_common.c | 2 + vp8/common/recon.c | 30 +- vp8/common/reconinter.c | 104 ++- vp8/common/reconintra.c | 2 +- vp8/common/reconintra4x4.c | 17 +- vp8/common/swapyv12buffer.c | 18 +- vp8/decoder/arm/dequantize_arm.c | 8 +- vp8/decoder/decodframe.c | 183 +++-- vp8/decoder/dequantize.c | 4 +- vp8/decoder/detokenize.c | 10 +- 
vp8/decoder/generic/dsystemdependent.c | 5 + vp8/decoder/onyxd_if.c | 136 ++++ vp8/decoder/opencl/decodframe_cl.c | 357 +++++++++ vp8/decoder/opencl/decodframe_cl.h | 31 + vp8/decoder/opencl/dequantize_cl.c | 214 +++++ vp8/decoder/opencl/dequantize_cl.cl | 272 +++++++ vp8/decoder/opencl/dequantize_cl.h | 74 ++ vp8/decoder/opencl/idct_blk_cl.c | 196 +++++ vp8/decoder/opencl/opencl_systemdependent.c | 25 + vp8/decoder/opencl/vp8_decode_cl.c | 38 + vp8/decoder/opencl/vp8_decode_cl.h | 24 + vp8/decoder/threading.c | 34 +- vp8/decoder/x86/x86_dsystemdependent.c | 4 +- vp8/encoder/arm/quantize_arm.c | 6 +- vp8/encoder/asm_enc_offsets.c | 9 +- vp8/encoder/encodeintra.c | 7 +- vp8/encoder/encodemb.c | 14 +- vp8/{common => encoder}/invtrans.c | 19 +- vp8/{common => encoder}/invtrans.h | 4 +- vp8/encoder/onyx_if.c | 5 - vp8/encoder/pickinter.c | 4 +- vp8/encoder/quantize.c | 20 +- vp8/encoder/rdopt.c | 28 +- vp8/encoder/tokenize.c | 8 +- vp8/encoder/x86/quantize_sse2.asm | 8 +- vp8/encoder/x86/quantize_ssse3.asm | 4 +- vp8/encoder/x86/x86_csystemdependent.c | 13 +- vp8/vp8_common.mk | 32 +- vp8/vp8_dx_iface.c | 13 + vp8/vp8cx.mk | 2 + vp8/vp8dx.mk | 11 + vpx_ports/config.h | 2 +- vpx_ports/mem.h | 2 +- vpx_scale/generic/yv12config.c | 32 +- vpx_scale/yv12config.h | 11 + 79 files changed, 7384 insertions(+), 309 deletions(-) create mode 100644 vp8/common/opencl/blockd_cl.c create mode 100644 vp8/common/opencl/blockd_cl.h create mode 100644 vp8/common/opencl/dynamic_cl.c create mode 100644 vp8/common/opencl/dynamic_cl.h create mode 100644 vp8/common/opencl/filter_cl.c create mode 100644 vp8/common/opencl/filter_cl.cl create mode 100644 vp8/common/opencl/filter_cl.h create mode 100644 vp8/common/opencl/idct_cl.h create mode 100644 vp8/common/opencl/idctllm_cl.c create mode 100644 vp8/common/opencl/idctllm_cl.cl create mode 100644 vp8/common/opencl/idctllm_cl.h create mode 100644 vp8/common/opencl/loopfilter.cl create mode 100644 vp8/common/opencl/loopfilter_cl.c create mode 
100644 vp8/common/opencl/loopfilter_cl.h create mode 100644 vp8/common/opencl/loopfilter_filters_cl.c create mode 100644 vp8/common/opencl/opencl_systemdependent.c create mode 100644 vp8/common/opencl/reconinter_cl.c create mode 100644 vp8/common/opencl/reconinter_cl.h create mode 100644 vp8/common/opencl/subpixel_cl.h create mode 100644 vp8/common/opencl/vp8_opencl.c create mode 100644 vp8/common/opencl/vp8_opencl.h create mode 100644 vp8/decoder/opencl/decodframe_cl.c create mode 100644 vp8/decoder/opencl/decodframe_cl.h create mode 100644 vp8/decoder/opencl/dequantize_cl.c create mode 100644 vp8/decoder/opencl/dequantize_cl.cl create mode 100644 vp8/decoder/opencl/dequantize_cl.h create mode 100644 vp8/decoder/opencl/idct_blk_cl.c create mode 100644 vp8/decoder/opencl/opencl_systemdependent.c create mode 100644 vp8/decoder/opencl/vp8_decode_cl.c create mode 100644 vp8/decoder/opencl/vp8_decode_cl.h rename vp8/{common => encoder}/invtrans.c (71%) rename vp8/{common => encoder}/invtrans.h (93%) diff --git a/.gitignore b/.gitignore index ae616b28c..098611dd3 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,9 @@ /vpx_config.h /vpx_version.h TAGS +vpxdec +vpxenc +.project +.cproject +*.csv +*.oclpj diff --git a/build/make/configure.sh b/build/make/configure.sh index 3324be36e..d0eeed015 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -957,6 +957,38 @@ process_common_toolchain() { enabled rvct && check_add_cflags -Otime enabled small && check_add_cflags -O2 || check_add_cflags -O3 fi + + if enabled opencl; then + disable multithread + echo " disabling multithread" + soft_enable opencl #Provide output to make user comfortable + enable runtime_cpu_detect + + #Use dlopen() to load OpenCL when possible. 
+ case ${toolchain} in + *darwin10*) + check_add_cflags -D__APPLE__ + add_extralibs -framework OpenCL + ;; + *-win32-gcc) + if check_header dlfcn.h; then + add_extralibs -ldl + enable dlopen + else + #This shouldn't be a hard-coded path in the long term + add_extralibs -L/cygdrive/c/Windows/System32 -lOpenCL + fi + ;; + *) + if check_header dlfcn.h; then + add_extralibs -ldl + enable dlopen + else + add_extralibs -lOpenCL + fi + ;; + esac + fi # Position Independent Code (PIC) support, for building relocatable # shared objects diff --git a/configure b/configure index ab3936daf..019167383 100755 --- a/configure +++ b/configure @@ -40,6 +40,7 @@ Advanced options: ${toggle_runtime_cpu_detect} runtime cpu detection ${toggle_shared} shared library support ${toggle_small} favor smaller size over speed + ${toggle_opencl} support for OpenCL-assisted VP8 decoding (experimental) ${toggle_postproc_visualizer} macro block / block level visualizers Codecs: @@ -105,6 +106,7 @@ all_platforms="${all_platforms} x86-darwin8-gcc" all_platforms="${all_platforms} x86-darwin8-icc" all_platforms="${all_platforms} x86-darwin9-gcc" all_platforms="${all_platforms} x86-darwin9-icc" +all_platforms="${all_platforms} x86-darwin10-gcc" all_platforms="${all_platforms} x86-linux-gcc" all_platforms="${all_platforms} x86-linux-icc" all_platforms="${all_platforms} x86-solaris-gcc" @@ -211,6 +213,7 @@ HAVE_LIST=" alt_tree_layout pthread_h sys_mman_h + dlopen " CONFIG_LIST=" external_build @@ -250,6 +253,7 @@ CONFIG_LIST=" realtime_only shared small + opencl postproc_visualizer os_support " @@ -290,6 +294,7 @@ CMDLINE_SELECT=" realtime_only shared small + opencl postproc_visualizer " @@ -556,4 +561,6 @@ process "$@" cat < ${BUILD_PFX}vpx_config.c static const char* const cfg = "$CONFIGURE_ARGS"; const char *vpx_codec_build_config(void) {return cfg;} +static const char* const libdir = "$libdir"; +const char *vpx_codec_lib_dir(void) {return libdir;} EOF diff --git a/libs.mk b/libs.mk index 
2cb7f49ba..b0de72c86 100644 --- a/libs.mk +++ b/libs.mk @@ -123,6 +123,18 @@ endif else INSTALL-LIBS-yes += $(LIBSUBDIR)/libvpx.a INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a + +#Install the OpenCL kernels if CL enabled. +ifeq ($(CONFIG_OPENCL),yes) +INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/filter_cl.cl +INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/idctllm_cl.cl +INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/loopfilter.cl +#only install decoder CL files if VP8 decoder enabled +ifeq ($(CONFIG_VP8_DECODER),yes) +INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/decoder/opencl/dequantize_cl.cl +endif +endif #CONFIG_OPENCL=yes + endif CODEC_SRCS=$(call enabled,CODEC_SRCS) diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c index edef36094..456f144ec 100644 --- a/vp8/common/alloccommon.c +++ b/vp8/common/alloccommon.c @@ -130,32 +130,32 @@ void vp8_setup_version(VP8_COMMON *cm) case 0: cm->no_lpf = 0; cm->simpler_lpf = 0; - cm->use_bilinear_mc_filter = 0; + cm->mcomp_filter_type = SIXTAP; cm->full_pixel = 0; break; case 1: cm->no_lpf = 0; cm->simpler_lpf = 1; - cm->use_bilinear_mc_filter = 1; + cm->mcomp_filter_type = BILINEAR; cm->full_pixel = 0; break; case 2: cm->no_lpf = 1; cm->simpler_lpf = 0; - cm->use_bilinear_mc_filter = 1; + cm->mcomp_filter_type = BILINEAR; cm->full_pixel = 0; break; case 3: cm->no_lpf = 1; cm->simpler_lpf = 1; - cm->use_bilinear_mc_filter = 1; + cm->mcomp_filter_type = BILINEAR; cm->full_pixel = 1; break; default: /*4,5,6,7 are reserved for future use*/ cm->no_lpf = 0; cm->simpler_lpf = 0; - cm->use_bilinear_mc_filter = 0; + cm->mcomp_filter_type = SIXTAP; cm->full_pixel = 0; break; } @@ -170,7 +170,7 @@ void vp8_create_common(VP8_COMMON *oci) oci->mb_no_coeff_skip = 1; oci->no_lpf = 0; oci->simpler_lpf = 0; - oci->use_bilinear_mc_filter = 0; + oci->mcomp_filter_type = SIXTAP; oci->full_pixel = 0; oci->multi_token_partition = ONE_PARTITION; oci->clr_type = REG_YUV; diff --git a/vp8/common/blockd.h 
b/vp8/common/blockd.h index fc8e0722c..843644fc8 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -14,12 +14,17 @@ void vpx_log(const char *format, ...); -#include "vpx_ports/config.h" -#include "vpx_scale/yv12config.h" +#include "../../vpx_ports/config.h" +#include "../../vpx_scale/yv12config.h" #include "mv.h" #include "treecoder.h" #include "subpixel.h" -#include "vpx_ports/mem.h" +#include "../../vpx_ports/mem.h" + +#include "../../vpx_config.h" +#if CONFIG_OPENCL +#include "opencl/vp8_opencl.h" +#endif #define TRUE 1 #define FALSE 0 @@ -73,19 +78,19 @@ typedef enum typedef enum { - DC_PRED, /* average of above and left pixels */ - V_PRED, /* vertical prediction */ - H_PRED, /* horizontal prediction */ - TM_PRED, /* Truemotion prediction */ - B_PRED, /* block based prediction, each block has its own prediction mode */ - - NEARESTMV, - NEARMV, - ZEROMV, - NEWMV, - SPLITMV, - - MB_MODE_COUNT + DC_PRED = 0, /* average of above and left pixels */ + V_PRED = 1, /* vertical prediction */ + H_PRED = 2, /* horizontal prediction */ + TM_PRED = 3, /* Truemotion prediction */ + B_PRED = 4, /* block based prediction, each block has its own prediction mode */ + + NEARESTMV = 5, + NEARMV = 6, + ZEROMV = 7, + NEWMV = 8, + SPLITMV = 9, + + MB_MODE_COUNT = 10 } MB_PREDICTION_MODE; /* Macroblock level features */ @@ -187,24 +192,47 @@ typedef struct typedef struct { - short *qcoeff; - short *dqcoeff; - unsigned char *predictor; - short *diff; - short *reference; + short *qcoeff_base; + int qcoeff_offset; + + short *dqcoeff_base; + int dqcoeff_offset; + + unsigned char *predictor_base; + int predictor_offset; + + short *diff_base; + int diff_offset; short *dequant; +#if CONFIG_OPENCL + cl_command_queue cl_commands; //pointer to macroblock CL command queue + + cl_mem cl_diff_mem; + cl_mem cl_predictor_mem; + cl_mem cl_qcoeff_mem; + cl_mem cl_dqcoeff_mem; + cl_mem cl_eobs_mem; + + cl_mem cl_dequant_mem; //Block-specific, not shared + + cl_bool sixtap_filter; //Subpixel 
Prediction type (true=sixtap, false=bilinear) + +#endif + /* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */ - unsigned char **base_pre; + unsigned char **base_pre; //previous frame, same Macroblock, base pointer int pre; int pre_stride; - unsigned char **base_dst; + unsigned char **base_dst; //destination base pointer int dst; int dst_stride; - int eob; + int eob; //only used in encoder? Decoder uses MBD.eobs + + char *eobs_base; //beginning of MB.eobs B_MODE_INFO bmi; @@ -214,16 +242,26 @@ typedef struct { DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */ DECLARE_ALIGNED(16, unsigned char, predictor[384]); -/* not used DECLARE_ALIGNED(16, short, reference[384]); */ DECLARE_ALIGNED(16, short, qcoeff[400]); DECLARE_ALIGNED(16, short, dqcoeff[400]); DECLARE_ALIGNED(16, char, eobs[25]); +#if CONFIG_OPENCL + cl_command_queue cl_commands; //Each macroblock gets its own command queue. + cl_mem cl_diff_mem; + cl_mem cl_predictor_mem; + cl_mem cl_qcoeff_mem; + cl_mem cl_dqcoeff_mem; + cl_mem cl_eobs_mem; + + cl_bool sixtap_filter; +#endif + /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. 
*/ BLOCKD block[25]; YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */ - YV12_BUFFER_CONFIG dst; + YV12_BUFFER_CONFIG dst; /* Destination buffer for current frame */ MODE_INFO *mode_info_context; int mode_info_stride; @@ -273,6 +311,7 @@ typedef struct unsigned int frames_since_golden; unsigned int frames_till_alt_ref_frame; + vp8_subpix_fn_t subpixel_predict; vp8_subpix_fn_t subpixel_predict8x4; vp8_subpix_fn_t subpixel_predict8x8; diff --git a/vp8/common/filter.c b/vp8/common/filter.c index ae5952952..c97675e91 100644 --- a/vp8/common/filter.c +++ b/vp8/common/filter.c @@ -10,6 +10,29 @@ #include +#include + +#define REGISTER_FILTER 1 +#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max; + +#if REGISTER_FILTER +#define FILTER0 filter0 +#define FILTER1 filter1 +#define FILTER2 filter2 +#define FILTER3 filter3 +#define FILTER4 filter4 +#define FILTER5 filter5 +#else +#define FILTER0 vp8_filter[0] +#define FILTER1 vp8_filter[1] +#define FILTER2 vp8_filter[2] +#define FILTER3 vp8_filter[3] +#define FILTER4 vp8_filter[4] +#define FILTER5 vp8_filter[5] +#endif + +#define SRC_INCREMENT src_increment + #include "filter.h" #include "vpx_ports/mem.h" @@ -27,7 +50,6 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) = DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) = { - { 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */ { 0, -6, 123, 12, -1, 0 }, { 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */ @@ -49,35 +71,45 @@ static void filter_block2d_first_pass const short *vp8_filter ) { + unsigned int i, j; - int Temp; + int Temp; + +#if REGISTER_FILTER + short filter0 = vp8_filter[0]; + short filter1 = vp8_filter[1]; + short filter2 = vp8_filter[2]; + short filter3 = vp8_filter[3]; + short filter4 = vp8_filter[4]; + short filter5 = vp8_filter[5]; +#endif + + int ps2 = 2*(int)pixel_step; + int ps3 = 3*(int)pixel_step; + unsigned int src_increment = 
src_pixels_per_line - output_width; for (i = 0; i < output_height; i++) { for (j = 0; j < output_width; j++) { - Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + - ((int)src_ptr[0] * vp8_filter[2]) + - ((int)src_ptr[pixel_step] * vp8_filter[3]) + - ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + - ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + - (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + Temp = ((int)src_ptr[-1*ps2] * FILTER0); + Temp += ((int)src_ptr[-1*(int)pixel_step] * FILTER1) + + ((int)src_ptr[0] * FILTER2) + + ((int)src_ptr[pixel_step] * FILTER3) + + ((int)src_ptr[ps2] * FILTER4) + + ((int)src_ptr[ps3] * FILTER5) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ /* Normalize back to 0-255 */ Temp = Temp >> VP8_FILTER_SHIFT; - - if (Temp < 0) - Temp = 0; - else if (Temp > 255) - Temp = 255; + CLAMP(Temp, 0, 255); output_ptr[j] = Temp; src_ptr++; } /* Next row... */ - src_ptr += src_pixels_per_line - output_width; + src_ptr += SRC_INCREMENT; output_ptr += output_width; } } @@ -94,36 +126,45 @@ static void filter_block2d_second_pass const short *vp8_filter ) { - unsigned int i, j; - int Temp; + unsigned int i, j; + int Temp; + +#if REGISTER_FILTER + short filter0 = vp8_filter[0]; + short filter1 = vp8_filter[1]; + short filter2 = vp8_filter[2]; + short filter3 = vp8_filter[3]; + short filter4 = vp8_filter[4]; + short filter5 = vp8_filter[5]; +#endif + + int ps2 = ((int)pixel_step) << 1; + int ps3 = ps2 + (int)pixel_step; + unsigned int src_increment = src_pixels_per_line - output_width; for (i = 0; i < output_height; i++) { for (j = 0; j < output_width; j++) { /* Apply filter */ - Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) + - ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) + - ((int)src_ptr[0] * vp8_filter[2]) + - ((int)src_ptr[pixel_step] * vp8_filter[3]) + - ((int)src_ptr[2*pixel_step] * vp8_filter[4]) + - ((int)src_ptr[3*pixel_step] * vp8_filter[5]) + + Temp = 
((int)src_ptr[-1*ps2] * FILTER0) + + ((int)src_ptr[-1*(int)pixel_step] * FILTER1) + + ((int)src_ptr[0] * FILTER2) + + ((int)src_ptr[pixel_step] * FILTER3) + + ((int)src_ptr[ps2] * FILTER4) + + ((int)src_ptr[ps3] * FILTER5) + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ /* Normalize back to 0-255 */ Temp = Temp >> VP8_FILTER_SHIFT; - - if (Temp < 0) - Temp = 0; - else if (Temp > 255) - Temp = 255; + CLAMP(Temp, 0, 255); output_ptr[j] = (unsigned char)Temp; src_ptr++; } /* Start next row */ - src_ptr += src_pixels_per_line - output_width; + src_ptr += src_increment; output_ptr += output_pitch; } } @@ -167,6 +208,7 @@ void vp8_sixtap_predict_c filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter); } + void vp8_sixtap_predict8x8_c ( unsigned char *src_ptr, diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 5c6464772..5d9896a10 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -19,6 +19,7 @@ extern void vp8_arch_x86_common_init(VP8_COMMON *ctx); extern void vp8_arch_arm_common_init(VP8_COMMON *ctx); +extern void vp8_arch_opencl_common_init(VP8_COMMON *ctx); void vp8_machine_specific_config(VP8_COMMON *ctx) { @@ -82,4 +83,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) vp8_arch_arm_common_init(ctx); #endif +#if CONFIG_OPENCL && (ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL || ENABLE_CL_LOOPFILTER) + vp8_arch_opencl_common_init(ctx); +#endif + } diff --git a/vp8/common/idct.h b/vp8/common/idct.h index f5fd94dfd..3480cc9d1 100644 --- a/vp8/common/idct.h +++ b/vp8/common/idct.h @@ -31,6 +31,10 @@ #include "arm/idct_arm.h" #endif +#if CONFIG_OPENCL +#include "opencl/idct_cl.h" +#endif + #ifndef vp8_idct_idct1 #define vp8_idct_idct1 vp8_short_idct4x4llm_1_c #endif diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index 37c5b7740..f49e0caa6 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -13,6 +13,10 @@ #include 
"loopfilter.h" #include "onyxc_int.h" +#if CONFIG_OPENCL +#include "opencl/loopfilter_cl.h" +#endif + typedef unsigned char uc; @@ -312,6 +316,13 @@ void vp8_loop_filter_frame int i; unsigned char *y_ptr, *u_ptr, *v_ptr; +#if CONFIG_OPENCL && ENABLE_CL_LOOPFILTER + if ( cl_initialized == CL_SUCCESS ){ + vp8_loop_filter_frame_cl(cm,mbd,default_filt_lvl); + return; + } +#endif + mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ /* Note the baseline filter values for each segment */ @@ -394,6 +405,7 @@ void vp8_loop_filter_frame } +/* Encoder only... */ void vp8_loop_filter_frame_yonly ( VP8_COMMON *cm, @@ -489,7 +501,7 @@ void vp8_loop_filter_frame_yonly } - +/* Encoder only... */ void vp8_loop_filter_partial_frame ( VP8_COMMON *cm, diff --git a/vp8/common/loopfilter_filters.c b/vp8/common/loopfilter_filters.c index 694052924..b090b21a7 100644 --- a/vp8/common/loopfilter_filters.c +++ b/vp8/common/loopfilter_filters.c @@ -49,7 +49,6 @@ static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, } static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1) - { signed char ps0, qs0; signed char ps1, qs1; @@ -94,6 +93,7 @@ static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc * *op1 = u ^ 0x80; } + void vp8_loop_filter_horizontal_edge_c ( unsigned char *s, diff --git a/vp8/common/mbpitch.c b/vp8/common/mbpitch.c index 054042c0b..53adef2a7 100644 --- a/vp8/common/mbpitch.c +++ b/vp8/common/mbpitch.c @@ -11,6 +11,12 @@ #include "blockd.h" +#include "stdio.h" +#include "vpx_config.h" +#if CONFIG_OPENCL +#include "opencl/vp8_opencl.h" +#endif + typedef enum { PRED = 0, @@ -20,7 +26,6 @@ typedef enum static void setup_block ( BLOCKD *b, - int mv_stride, unsigned char **base, int Stride, int offset, @@ -49,81 +54,176 @@ static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs) int block; unsigned char **y, **u, **v; + unsigned char **buf_base; + int y_off, u_off, 
v_off; if (bs == DEST) { + buf_base = &x->dst.buffer_alloc; + y_off = x->dst.y_buffer - x->dst.buffer_alloc; + u_off = x->dst.u_buffer - x->dst.buffer_alloc; + v_off = x->dst.v_buffer - x->dst.buffer_alloc; y = &x->dst.y_buffer; u = &x->dst.u_buffer; v = &x->dst.v_buffer; + y_off = 0; + + //y = buf_base; + //y_off = x->dst.y_buffer - x->dst.buffer_alloc; + + u = buf_base; + v = buf_base; + + u_off = x->dst.u_buffer - x->dst.buffer_alloc; + v_off = x->dst.v_buffer - x->dst.buffer_alloc; + } else { + buf_base = &x->pre.buffer_alloc; y = &x->pre.y_buffer; u = &x->pre.u_buffer; v = &x->pre.v_buffer; + y_off = u_off = v_off = 0; + + //y = buf_base; + //y_off = x->pre.y_buffer - x->pre.buffer_alloc; + //u = buf_base; + //u_off = x->pre.u_buffer - x->pre.buffer_alloc; + //v = buf_base; + //v_off = x->pre.v_buffer - x->pre.buffer_alloc; } for (block = 0; block < 16; block++) /* y blocks */ { - setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride, - (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs); + setup_block(&x->block[block], y, x->dst.y_stride, + y_off + ((block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4), bs); } for (block = 16; block < 20; block++) /* U and V blocks */ { - setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride, - ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs); + int block_off = ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4; - setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride, - ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs); + setup_block(&x->block[block], u, x->dst.uv_stride, + u_off + block_off, bs); + + setup_block(&x->block[block+4], v, x->dst.uv_stride, + v_off + block_off, bs); } } void vp8_setup_block_dptrs(MACROBLOCKD *x) { int r, c; + unsigned int offset; + +#if CONFIG_OPENCL && !ONE_CQ_PER_MB + cl_command_queue y_cq, u_cq, v_cq; + int err; + if (cl_initialized == CL_SUCCESS){ + //Create command queue for Y/U/V Planes + y_cq = 
clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err); + if (!y_cq || err != CL_SUCCESS) { + printf("Error: Failed to create a command queue!\n"); + cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED); + } + u_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err); + if (!u_cq || err != CL_SUCCESS) { + printf("Error: Failed to create a command queue!\n"); + cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED); + } + v_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err); + if (!v_cq || err != CL_SUCCESS) { + printf("Error: Failed to create a command queue!\n"); + cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED); + } + } +#endif + /* 16 Y blocks */ for (r = 0; r < 4; r++) { for (c = 0; c < 4; c++) { - x->block[r*4+c].diff = &x->diff[r * 4 * 16 + c * 4]; - x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4; + offset = r * 4 * 16 + c * 4; + x->block[r*4+c].diff_offset = offset; + x->block[r*4+c].predictor_offset = offset; +#if CONFIG_OPENCL && !ONE_CQ_PER_MB + if (cl_initialized == CL_SUCCESS) + x->block[r*4+c].cl_commands = y_cq; +#endif } } + /* 4 U Blocks */ for (r = 0; r < 2; r++) { for (c = 0; c < 2; c++) { - x->block[16+r*2+c].diff = &x->diff[256 + r * 4 * 8 + c * 4]; - x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4; - + offset = 256 + r * 4 * 8 + c * 4; + x->block[16+r*2+c].diff_offset = offset; + x->block[16+r*2+c].predictor_offset = offset; + +#if CONFIG_OPENCL && !ONE_CQ_PER_MB + if (cl_initialized == CL_SUCCESS) + x->block[16+r*2+c].cl_commands = u_cq; +#endif } } + /* 4 V Blocks */ for (r = 0; r < 2; r++) { for (c = 0; c < 2; c++) { - x->block[20+r*2+c].diff = &x->diff[320+ r * 4 * 8 + c * 4]; - x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4; - + offset = 320+ r * 4 * 8 + c * 4; + x->block[20+r*2+c].diff_offset = offset; + x->block[20+r*2+c].predictor_offset = offset; + +#if CONFIG_OPENCL && !ONE_CQ_PER_MB + if (cl_initialized == CL_SUCCESS) + x->block[20+r*2+c].cl_commands = v_cq; 
+#endif } } - x->block[24].diff = &x->diff[384]; + x->block[24].diff_offset = 384; for (r = 0; r < 25; r++) { - x->block[r].qcoeff = x->qcoeff + r * 16; - x->block[r].dqcoeff = x->dqcoeff + r * 16; + x->block[r].qcoeff_base = x->qcoeff; + x->block[r].qcoeff_offset = r * 16; + x->block[r].dqcoeff_base = x->dqcoeff; + x->block[r].dqcoeff_offset = r * 16; + + x->block[r].predictor_base = x->predictor; + x->block[r].diff_base = x->diff; + x->block[r].eobs_base = x->eobs; + +#if CONFIG_OPENCL + if (cl_initialized == CL_SUCCESS){ + /* Copy command queue reference from macroblock */ +#if ONE_CQ_PER_MB + x->block[r].cl_commands = x->cl_commands; +#endif + + /* Set up CL memory buffers as appropriate */ + x->block[r].cl_diff_mem = x->cl_diff_mem; + x->block[r].cl_dqcoeff_mem = x->cl_dqcoeff_mem; + x->block[r].cl_eobs_mem = x->cl_eobs_mem; + x->block[r].cl_predictor_mem = x->cl_predictor_mem; + x->block[r].cl_qcoeff_mem = x->cl_qcoeff_mem; + } + + //Copy filter type to block. + x->block[r].sixtap_filter = x->sixtap_filter; +#endif } + } void vp8_build_block_doffsets(MACROBLOCKD *x) { - /* handle the destination pitch features */ setup_macroblock(x, DEST); setup_macroblock(x, PRED); diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index c8c227787..8c2234e07 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -120,7 +120,6 @@ typedef struct VP8Common int mb_no_coeff_skip; int no_lpf; int simpler_lpf; - int use_bilinear_mc_filter; int full_pixel; int base_qindex; diff --git a/vp8/common/opencl/blockd_cl.c b/vp8/common/opencl/blockd_cl.c new file mode 100644 index 000000000..44b9a9346 --- /dev/null +++ b/vp8/common/opencl/blockd_cl.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "../../decoder/onyxd_int.h" +#include "../../../vpx_ports/config.h" +#include "../../common/idct.h" +#include "blockd_cl.h" +#include "../../decoder/opencl/dequantize_cl.h" + + +int vp8_cl_mb_prep(MACROBLOCKD *x, int flags){ + int err; + + if (cl_initialized != CL_SUCCESS){ + return cl_initialized; + } + + //Copy all blockd.cl_*_mem objects + if (flags & DIFF) + VP8_CL_SET_BUF(x->cl_commands, x->cl_diff_mem, sizeof(cl_short)*400, x->diff, + ,err + ); + + if (flags & PREDICTOR) + VP8_CL_SET_BUF(x->cl_commands, x->cl_predictor_mem, sizeof(cl_uchar)*384, x->predictor, + ,err + ); + + if (flags & QCOEFF) + VP8_CL_SET_BUF(x->cl_commands, x->cl_qcoeff_mem, sizeof(cl_short)*400, x->qcoeff, + ,err + ); + + if (flags & DQCOEFF) + VP8_CL_SET_BUF(x->cl_commands, x->cl_dqcoeff_mem, sizeof(cl_short)*400, x->dqcoeff, + ,err + ); + + if (flags & EOBS) + VP8_CL_SET_BUF(x->cl_commands, x->cl_eobs_mem, sizeof(cl_char)*25, x->eobs, + ,err + ); + + if (flags & PRE_BUF){ + VP8_CL_SET_BUF(x->cl_commands, x->pre.buffer_mem, x->pre.buffer_size, x->pre.buffer_alloc, + ,err + ); + } + + if (flags & DST_BUF){ + VP8_CL_SET_BUF(x->cl_commands, x->dst.buffer_mem, x->dst.buffer_size, x->dst.buffer_alloc, + ,err + ); + } + + + return CL_SUCCESS; +} + +int vp8_cl_mb_finish(MACROBLOCKD *x, int flags){ + int err; + + if (cl_initialized != CL_SUCCESS){ + return cl_initialized; + } + + if (flags & DIFF){ + err = clEnqueueReadBuffer(x->cl_commands, x->cl_diff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->diff, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & PREDICTOR){ + err = clEnqueueReadBuffer(x->cl_commands, x->cl_predictor_mem, CL_FALSE, 0, sizeof(cl_uchar)*384, x->predictor, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from 
GPU!\n", + , err + ); + } + + if (flags & QCOEFF){ + err = clEnqueueReadBuffer(x->cl_commands, x->cl_qcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->qcoeff, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & DQCOEFF){ + err = clEnqueueReadBuffer(x->cl_commands, x->cl_dqcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->dqcoeff, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & EOBS){ + err = clEnqueueReadBuffer(x->cl_commands, x->cl_eobs_mem, CL_FALSE, 0, sizeof(cl_char)*25, x->eobs, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & PRE_BUF){ + err = clEnqueueReadBuffer(x->cl_commands, x->pre.buffer_mem, CL_FALSE, + 0, x->pre.buffer_size, x->pre.buffer_alloc, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & DST_BUF){ + err = clEnqueueReadBuffer(x->cl_commands, x->dst.buffer_mem, CL_FALSE, + 0, x->dst.buffer_size, x->dst.buffer_alloc, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + + return CL_SUCCESS; +} + +int vp8_cl_block_prep(BLOCKD *b, int flags){ + int err; + + if (cl_initialized != CL_SUCCESS){ + return cl_initialized; + } + + //Copy all blockd.cl_*_mem objects + if (flags & DIFF) + VP8_CL_SET_BUF(b->cl_commands, b->cl_diff_mem, sizeof(cl_short)*400, b->diff_base, + ,err + ); + + if (flags & PREDICTOR) + VP8_CL_SET_BUF(b->cl_commands, b->cl_predictor_mem, sizeof(cl_uchar)*384, b->predictor_base, + ,err + ); + + if (flags & QCOEFF) + VP8_CL_SET_BUF(b->cl_commands, b->cl_qcoeff_mem, sizeof(cl_short)*400, b->qcoeff_base, + ,err + ); + + if (flags & DQCOEFF) + VP8_CL_SET_BUF(b->cl_commands, 
b->cl_dqcoeff_mem, sizeof(cl_short)*400, b->dqcoeff_base, + ,err + ); + + if (flags & EOBS) + VP8_CL_SET_BUF(b->cl_commands, b->cl_eobs_mem, sizeof(cl_char)*25, b->eobs_base, + ,err + ); + + if (flags & DEQUANT) + VP8_CL_SET_BUF(b->cl_commands, b->cl_dequant_mem, sizeof(cl_short)*16 ,b->dequant, + ,err + ); + + return CL_SUCCESS; +} + +int vp8_cl_block_finish(BLOCKD *b, int flags){ + int err; + + if (cl_initialized != CL_SUCCESS){ + return cl_initialized; + } + + if (flags & DIFF){ + err = clEnqueueReadBuffer(b->cl_commands, b->cl_diff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->diff_base, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & PREDICTOR){ + err = clEnqueueReadBuffer(b->cl_commands, b->cl_predictor_mem, CL_FALSE, 0, sizeof(cl_uchar)*384, b->predictor_base, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & QCOEFF){ + err = clEnqueueReadBuffer(b->cl_commands, b->cl_qcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->qcoeff_base, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & DQCOEFF){ + err = clEnqueueReadBuffer(b->cl_commands, b->cl_dqcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->dqcoeff_base, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & EOBS){ + err = clEnqueueReadBuffer(b->cl_commands, b->cl_eobs_mem, CL_FALSE, 0, sizeof(cl_char)*25, b->eobs_base, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + if (flags & DEQUANT){ + err = clEnqueueReadBuffer(b->cl_commands, b->cl_dequant_mem, CL_FALSE, 0, sizeof(cl_short)*16 ,b->dequant, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( b->cl_commands, 
err != CL_SUCCESS, + "Error: Failed to read from GPU!\n", + , err + ); + } + + return CL_SUCCESS; +} diff --git a/vp8/common/opencl/blockd_cl.h b/vp8/common/opencl/blockd_cl.h new file mode 100644 index 000000000..fda4147fa --- /dev/null +++ b/vp8/common/opencl/blockd_cl.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef BLOCKD_OPENCL_H +#define BLOCKD_OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp8_opencl.h" +#include "../blockd.h" + +#define DIFF 0x0001 +#define PREDICTOR 0x0002 +#define QCOEFF 0x0004 +#define DQCOEFF 0x0008 +#define EOBS 0x0010 +#define DEQUANT 0x0020 +#define PRE_BUF 0x0040 +#define DST_BUF 0x0080 + +#define BLOCK_COPY_ALL 0xffff + +/* +#define BLOCK_MEM_SIZE 6 +enum { + DIFF_MEM = 0, + PRED_MEM = 1, + QCOEFF_MEM = 2, + DQCOEFF_MEM = 3, + EOBS_MEM = 4, + DEQUANT_MEM = 5 +} BLOCK_MEM_TYPES; + + +struct cl_block_mem{ + cl_mem gpu_mem; + size_t size; + void *host_mem; +}; + +typedef struct cl_block_mem block_mem; +*/ + +extern int vp8_cl_block_finish(BLOCKD *b, int flags); +extern int vp8_cl_block_prep(BLOCKD *b, int flags); + +extern int vp8_cl_mb_prep(MACROBLOCKD *x, int flags); +extern int vp8_cl_mb_finish(MACROBLOCKD *x, int flags); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/vp8/common/opencl/dynamic_cl.c b/vp8/common/opencl/dynamic_cl.c new file mode 100644 index 000000000..aa9aa3f1b --- /dev/null +++ b/vp8/common/opencl/dynamic_cl.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8_opencl.h" + +#include + +CL_FUNCTIONS cl; +void *dll = NULL; +int cl_loaded = VP8_CL_NOT_INITIALIZED; + +int close_cl(){ + int ret = dlclose(dll); + + if (ret != 0) + fprintf(stderr, "Error closing OpenCL library: %s", dlerror()); + + return ret; +} + +int load_cl(char *lib_name){ + + //printf("Loading OpenCL library\n"); + dll = dlopen(lib_name, RTLD_NOW|RTLD_LOCAL); + if (dll != NULL){ + //printf("Found CL library\n"); + } else { + //printf("Didn't find CL library\n"); + return VP8_CL_TRIED_BUT_FAILED; + } + + CL_LOAD_FN("clGetPlatformIDs", cl.getPlatformIDs); + CL_LOAD_FN("clGetPlatformInfo", cl.getPlatformInfo); + CL_LOAD_FN("clGetDeviceIDs", cl.getDeviceIDs); + CL_LOAD_FN("clGetDeviceInfo", cl.getDeviceInfo); + CL_LOAD_FN("clCreateContext", cl.createContext); +// CL_LOAD_FN("clCreateContextFromType", cl.createContextFromType); +// CL_LOAD_FN("clRetainContext", cl.retainContext); + CL_LOAD_FN("clReleaseContext", cl.releaseContext); +// CL_LOAD_FN("clGetContextInfo", cl.getContextInfo); + CL_LOAD_FN("clCreateCommandQueue", cl.createCommandQueue); +// CL_LOAD_FN("clRetainCommandQueue", cl.retainCommandQueue); + CL_LOAD_FN("clReleaseCommandQueue", cl.releaseCommandQueue); +// CL_LOAD_FN("clGetCommandQueueInfo", cl.getCommandQueue); + CL_LOAD_FN("clCreateBuffer", cl.createBuffer); +// CL_LOAD_FN("clCreateImage2D", cl.createImage2D); +// CL_LOAD_FN("clCreateImage3D", cl.createImage3D); +// CL_LOAD_FN("clRetainMemObject", cl.retainMemObject); + CL_LOAD_FN("clReleaseMemObject", cl.releaseMemObject); +// CL_LOAD_FN("clGetSupportedImageFormats", cl.getSupportedImageFormats); +// CL_LOAD_FN("clGetMemObjectInfo", 
cl.getMemObjectInfo); +// CL_LOAD_FN("clGetImageInfo", cl.getImageInfo); +// CL_LOAD_FN("clCreateSampler", cl.createSampler); +// CL_LOAD_FN("clRetainSampler", cl.retainSampler); +// CL_LOAD_FN("clReleaseSampler", cl.releaseSampler); +// CL_LOAD_FN("clGetSamplerInfo", cl.getSamplerInfo); + CL_LOAD_FN("clCreateProgramWithSource", cl.createProgramWithSource); +// CL_LOAD_FN("clCreateProgramWithBinary", cl.createProgramWithBinary); +// CL_LOAD_FN("clRetainProgram", cl.retainProgram); + CL_LOAD_FN("clReleaseProgram", cl.releaseProgram); + CL_LOAD_FN("clBuildProgram", cl.buildProgram); +// CL_LOAD_FN("clUnloadCompiler", cl.unloadCompiler); + CL_LOAD_FN("clGetProgramInfo", cl.getProgramInfo); + CL_LOAD_FN("clGetProgramBuildInfo", cl.getProgramBuildInfo); + CL_LOAD_FN("clCreateKernel", cl.createKernel); +// CL_LOAD_FN("clCreateKernelsInProgram", cl.createKernelsInProgram); +// CL_LOAD_FN("clRetainKernel", cl.retainKernel); + CL_LOAD_FN("clReleaseKernel", cl.releaseKernel); + CL_LOAD_FN("clSetKernelArg", cl.setKernelArg); +// CL_LOAD_FN("clGetKernelInfo", cl.getKernelInfo); + CL_LOAD_FN("clGetKernelWorkGroupInfo", cl.getKernelWorkGroupInfo); +// CL_LOAD_FN("clWaitForEvents", cl.waitForEvents); +// CL_LOAD_FN("clGetEventInfo", cl.getEventInfo); +// CL_LOAD_FN("clRetainEvent", cl.retainEvent); +// CL_LOAD_FN("clReleaseEvent", cl.releaseEvent); +// CL_LOAD_FN("clGetEventProfilingInfo", cl.getEventProfilingInfo); + CL_LOAD_FN("clFlush", cl.flush); + CL_LOAD_FN("clFinish", cl.finish); + CL_LOAD_FN("clEnqueueReadBuffer", cl.enqueueReadBuffer); + CL_LOAD_FN("clEnqueueWriteBuffer", cl.enqueueWriteBuffer); + CL_LOAD_FN("clEnqueueCopyBuffer", cl.enqueueCopyBuffer); +// CL_LOAD_FN("clEnqueueReadImage", cl.enqueueReadImage); +// CL_LOAD_FN("clEnqueueWriteImage", cl.enqueueWriteImage); +// CL_LOAD_FN("clEnqueueCopyImage", cl.enqueueCopyImage); +// CL_LOAD_FN("clEnqueueCopyImageToBuffer", cl.enqueueCopyImageToBuffer); +// CL_LOAD_FN("clEnqueueCopyBufferToImage", 
cl.enqueueCopyBufferToImage); +// CL_LOAD_FN("clEnqueueMapBuffer", cl.enqueueMapBuffer); +// CL_LOAD_FN("clEnqueueMapImage", cl.enqueueMapImage); +// CL_LOAD_FN("clEnqueueUnmapMemObject", cl.enqueueUnmapMemObject); + CL_LOAD_FN("clEnqueueNDRangeKernel", cl.enqueueNDRAngeKernel); +// CL_LOAD_FN("clEnqueueTask", cl.enqueueTask); +// CL_LOAD_FN("clEnqueueNativeKernel", cl.enqueueNativeKernel); +// CL_LOAD_FN("clEnqueueMarker", cl.enqueueMarker); +// CL_LOAD_FN("clEnqueueWaitForEvents", cl.enqueueWaitForEvents); + CL_LOAD_FN("clEnqueueBarrier", cl.enqueueBarrier); +// CL_LOAD_FN("clGetExtensionFunctionAddress", cl.getExtensionFunctionAddress); + + return CL_SUCCESS; +} diff --git a/vp8/common/opencl/dynamic_cl.h b/vp8/common/opencl/dynamic_cl.h new file mode 100644 index 000000000..b082a5126 --- /dev/null +++ b/vp8/common/opencl/dynamic_cl.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef DYNAMIC_CL_H +#define DYNAMIC_CL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#include + +int load_cl(char *lib_name); +int close_cl(); + +extern int cl_loaded; + +typedef cl_int(*fn_clGetPlatformIDs_t)(cl_uint, cl_platform_id *, cl_uint *); +typedef cl_int(*fn_clGetPlatformInfo_t)(cl_platform_id, cl_platform_info, size_t, void *, size_t *); +typedef cl_int(*fn_clGetDeviceIDs_t)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); +typedef cl_int(*fn_clGetDeviceInfo_t)(cl_device_id, cl_device_info, size_t, void *, size_t *); +typedef cl_context(*fn_clCreateContext_t)(const cl_context_properties *, cl_uint, const cl_device_id *, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *); +typedef cl_context(*fn_clCreateContextFromType_t)(const cl_context_properties *, cl_device_type, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *); +typedef cl_int(*fn_clRetainContext_t)(cl_context); +typedef cl_int(*fn_clReleaseContext_t)(cl_context); +typedef cl_int(*fn_clGetContextInfo_t)(cl_context, cl_context_info, size_t, void *, size_t *); +typedef cl_command_queue(*fn_clCreateCommandQueue_t)(cl_context, cl_device_id, cl_command_queue_properties, cl_int *); +typedef cl_int(*fn_clRetainCommandQueue_t)(cl_command_queue); +typedef cl_int(*fn_clReleaseCommandQueue_t)(cl_command_queue); +typedef cl_int(*fn_clGetCommandQueueInfo_t)(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *); +typedef cl_mem(*fn_clCreateBuffer_t)(cl_context, cl_mem_flags, size_t, void *, cl_int *); +typedef cl_mem(*fn_clCreateImage2D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, void *, cl_int *); +typedef cl_mem(*fn_clCreateImage3D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *); +typedef cl_int(*fn_clRetainMemObject_t)(cl_mem); +typedef 
cl_int(*fn_clReleaseMemObject_t)(cl_mem); +typedef cl_int(*fn_clGetSupportedImageFormats_t)(cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format *, cl_uint *); +typedef cl_int(*fn_clGetMemObjectInfo_t)(cl_mem, cl_mem_info, size_t, void *, size_t *); +typedef cl_int(*fn_clGetImageInfo_t)(cl_mem, cl_image_info, size_t, void *, size_t *); +typedef cl_sampler(*fn_clCreateSampler_t)(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *); +typedef cl_int(*fn_clRetainSampler_t)(cl_sampler); +typedef cl_int(*fn_clReleaseSampler_t)(cl_sampler); +typedef cl_int(*fn_clGetSamplerInfo_t)(cl_sampler, cl_sampler_info, size_t, void *, size_t *); +typedef cl_program(*fn_clCreateProgramWithSource_t)(cl_context, cl_uint, const char **, const size_t *, cl_int *); +typedef cl_program(*fn_clCreateProgramWithBinary_t)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *); +typedef cl_int(*fn_clRetainProgram_t)(cl_program); +typedef cl_int(*fn_clReleaseProgram_t)(cl_program); +typedef cl_int(*fn_clBuildProgram_t)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program,void*), void *); +typedef cl_int(*fn_clUnloadCompiler_t)(void); +typedef cl_int(*fn_clGetProgramInfo_t)(cl_program, cl_program_info, size_t, void *, size_t *); +typedef cl_int(*fn_clGetProgramBuildInfo_t)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); +typedef cl_kernel(*fn_clCreateKernel_t)(cl_program, const char *, cl_int *); +typedef cl_int(*fn_clCreateKernelsInProgram_t)(cl_program, cl_uint, cl_kernel *, cl_uint *); +typedef cl_int(*fn_clRetainKernel_t)(cl_kernel); +typedef cl_int(*fn_clReleaseKernel_t)(cl_kernel); +typedef cl_int(*fn_clSetKernelArg_t)(cl_kernel, cl_uint, size_t, const void *); +typedef cl_int(*fn_clGetKernelInfo_t)(cl_kernel, cl_kernel_info, size_t, void *, size_t *); +typedef cl_int(*fn_clGetKernelWorkGroupInfo_t)(cl_kernel, cl_device_id, cl_kernel_work_group_info, 
size_t, void *, size_t *); +typedef cl_int(*fn_clWaitForEvents_t)(cl_uint, const cl_event *); +typedef cl_int(*fn_clGetEventInfo_t)(cl_event, cl_event_info, size_t, void *, size_t *); +typedef cl_int(*fn_clRetainEvent_t)(cl_event); +typedef cl_int(*fn_clReleaseEvent_t)(cl_event); +typedef cl_int(*fn_clGetEventProfilingInfo_t)(cl_event, cl_profiling_info, size_t, void *, size_t *); +typedef cl_int(*fn_clFlush_t)(cl_command_queue); +typedef cl_int(*fn_clFinish_t)(cl_command_queue); +typedef cl_int(*fn_clEnqueueReadBuffer_t)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueWriteBuffer_t)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueCopyBuffer_t)(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueReadImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueWriteImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueCopyImage_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueCopyImageToBuffer_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, size_t, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueCopyBufferToImage_t)(cl_command_queue, cl_mem, cl_mem, size_t, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); +typedef void*(*fn_clEnqueueMapBuffer_t)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *); +typedef void*(*fn_clEnqueueMapImage_t)(cl_command_queue, cl_mem, cl_bool, 
cl_map_flags, const size_t *, const size_t *, size_t *, size_t *, cl_uint, const cl_event *, cl_event *, cl_int *); +typedef cl_int(*fn_clEnqueueUnmapMemObject_t)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueNDRangeKernel_t)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueTask_t)(cl_command_queue, cl_kernel, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueNativeKernel_t)(cl_command_queue, void (*user_func)(void *), void *, size_t, cl_uint, const cl_mem *, const void **, cl_uint, const cl_event *, cl_event *); +typedef cl_int(*fn_clEnqueueMarker_t)(cl_command_queue, cl_event *); +typedef cl_int(*fn_clEnqueueWaitForEvents_t)(cl_command_queue, cl_uint, const cl_event *); +typedef cl_int(*fn_clEnqueueBarrier_t)(cl_command_queue); +typedef void*(*fn_clGetExtensionFunctionAddress_t)(const char *); + +typedef struct CL_FUNCTIONS { + fn_clGetPlatformIDs_t getPlatformIDs; + fn_clGetPlatformInfo_t getPlatformInfo; + fn_clGetDeviceIDs_t getDeviceIDs; + fn_clGetDeviceInfo_t getDeviceInfo; + fn_clCreateContext_t createContext; + fn_clCreateContextFromType_t createContextFromType; + fn_clRetainContext_t retainContext; + fn_clReleaseContext_t releaseContext; + fn_clGetContextInfo_t getContextInfo; + fn_clCreateCommandQueue_t createCommandQueue; + fn_clRetainCommandQueue_t retainCommandQueue; + fn_clReleaseCommandQueue_t releaseCommandQueue; + fn_clGetCommandQueueInfo_t getCommandQueue; + fn_clCreateBuffer_t createBuffer; + fn_clCreateImage2D_t createImage2D; + fn_clCreateImage3D_t createImage3D; + fn_clRetainMemObject_t retainMemObject; + fn_clReleaseMemObject_t releaseMemObject; + fn_clGetSupportedImageFormats_t getSupportedImageFormats; + fn_clGetMemObjectInfo_t getMemObjectInfo; + fn_clGetImageInfo_t getImageInfo; + fn_clCreateSampler_t createSampler; + fn_clRetainSampler_t retainSampler; + 
fn_clReleaseSampler_t releaseSampler; + fn_clGetSamplerInfo_t getSamplerInfo; + fn_clCreateProgramWithSource_t createProgramWithSource; + fn_clCreateProgramWithBinary_t createProgramWithBinary; + fn_clRetainProgram_t retainProgram; + fn_clReleaseProgram_t releaseProgram; + fn_clBuildProgram_t buildProgram; + fn_clUnloadCompiler_t unloadCompiler; + fn_clGetProgramInfo_t getProgramInfo; + fn_clGetProgramBuildInfo_t getProgramBuildInfo; + fn_clCreateKernel_t createKernel; + fn_clCreateKernelsInProgram_t createKernelsInProgram; + fn_clRetainKernel_t retainKernel; + fn_clReleaseKernel_t releaseKernel; + fn_clSetKernelArg_t setKernelArg; + fn_clGetKernelInfo_t getKernelInfo; + fn_clGetKernelWorkGroupInfo_t getKernelWorkGroupInfo; + fn_clWaitForEvents_t waitForEvents; + fn_clGetEventInfo_t getEventInfo; + fn_clRetainEvent_t retainEvent; + fn_clReleaseEvent_t releaseEvent; + fn_clGetEventProfilingInfo_t getEventProfilingInfo; + fn_clFlush_t flush; + fn_clFinish_t finish; + fn_clEnqueueReadBuffer_t enqueueReadBuffer; + fn_clEnqueueWriteBuffer_t enqueueWriteBuffer; + fn_clEnqueueCopyBuffer_t enqueueCopyBuffer; + fn_clEnqueueReadImage_t enqueueReadImage; + fn_clEnqueueWriteImage_t enqueueWriteImage; + fn_clEnqueueCopyImage_t enqueueCopyImage; + fn_clEnqueueCopyImageToBuffer_t enqueueCopyImageToBuffer; + fn_clEnqueueCopyBufferToImage_t enqueueCopyBufferToImage; + fn_clEnqueueMapBuffer_t enqueueMapBuffer; + fn_clEnqueueMapImage_t enqueueMapImage; + fn_clEnqueueUnmapMemObject_t enqueueUnmapMemObject; + fn_clEnqueueNDRangeKernel_t enqueueNDRAngeKernel; + fn_clEnqueueTask_t enqueueTask; + fn_clEnqueueNativeKernel_t enqueueNativeKernel; + fn_clEnqueueMarker_t enqueueMarker; + fn_clEnqueueWaitForEvents_t enqueueWaitForEvents; + fn_clEnqueueBarrier_t enqueueBarrier; + fn_clGetExtensionFunctionAddress_t getExtensionFunctionAddress; +} CL_FUNCTIONS; + +extern CL_FUNCTIONS cl; + +#define clGetPlatformIDs cl.getPlatformIDs +#define clGetPlatformInfo cl.getPlatformInfo +#define 
clGetDeviceIDs cl.getDeviceIDs +#define clGetDeviceInfo cl.getDeviceInfo +#define clCreateContext cl.createContext +#define clCreateContextFromType cl.createContextFromType +#define clRetainContext cl.retainContext +#define clReleaseContext cl.releaseContext +#define clGetContextInfo cl.getContextInfo +#define clCreateCommandQueue cl.createCommandQueue +#define clRetainCommandQueue cl.retainCommandQueue +#define clReleaseCommandQueue cl.releaseCommandQueue +#define clGetCommandQueueInfo cl.getCommandQueue +#define clCreateBuffer cl.createBuffer +#define clCreateSubBuffer cl.createSubBuffer +#define clCreateImage2D cl.createImage2D +#define clCreateImage3D cl.createImage3D +#define clRetainMemObject cl.retainMemObject +#define clReleaseMemObject cl.releaseMemObject +#define clGetSupportedImageFormats cl.getSupportedImageFormats +#define clGetMemObjectInfo cl.getMemObjectInfo +#define clGetImageInfo cl.getImageInfo +#define clSetMemObjectDestructorCallback cl.setMemObjectDestructorCallback +#define clCreateSampler cl.createSampler +#define clRetainSampler cl.retainSampler +#define clReleaseSampler cl.releaseSampler +#define clGetSamplerInfo cl.getSamplerInfo +#define clCreateProgramWithSource cl.createProgramWithSource +#define clCreateProgramWithBinary cl.createProgramWithBinary +#define clRetainProgram cl.retainProgram +#define clReleaseProgram cl.releaseProgram +#define clBuildProgram cl.buildProgram +#define clUnloadCompiler cl.unloadCompiler +#define clGetProgramInfo cl.getProgramInfo +#define clGetProgramBuildInfo cl.getProgramBuildInfo +#define clCreateKernel cl.createKernel +#define clCreateKernelsInProgram cl.createKernelsInProgram +#define clRetainKernel cl.retainKernel +#define clReleaseKernel cl.releaseKernel +#define clSetKernelArg cl.setKernelArg +#define clGetKernelInfo cl.getKernelInfo +#define clGetKernelWorkGroupInfo cl.getKernelWorkGroupInfo +#define clWaitForEvents cl.waitForEvents +#define clGetEventInfo cl.getEventInfo +#define clCreateUserEvent 
cl.createUserEvent +#define clRetainEvent cl.retainEvent +#define clReleaseEvent cl.releaseEvent +#define clSetUserEventStatus cl.setUserEventStatus +#define clSetEventCallback cl.setEventCallback +#define clGetEventProfilingInfo cl.getEventProfilingInfo +#define clFlush cl.flush +#define clFinish cl.finish +#define clEnqueueReadBuffer cl.enqueueReadBuffer +#define clEnqueueReadBufferRect cl.enqueueReadBufferRect +#define clEnqueueWriteBuffer cl.enqueueWriteBuffer +#define clEnqueueWriteBufferRect cl.enqueueWriteBufferRect +#define clEnqueueCopyBuffer cl.enqueueCopyBuffer +#define clEnqueueCopyBufferRect cl.enqueueCopyBufferRect +#define clEnqueueReadImage cl.enqueueReadImage +#define clEnqueueWriteImage cl.enqueueWriteImage +#define clEnqueueCopyImage cl.enqueueCopyImage +#define clEnqueueCopyImageToBuffer cl.enqueueCopyImageToBuffer +#define clEnqueueCopyBufferToImage cl.enqueueCopyBufferToImage +#define clEnqueueMapBuffer cl.enqueueMapBuffer +#define clEnqueueMapImage cl.enqueueMapImage +#define clEnqueueUnmapMemObject cl.enqueueUnmapMemObject +#define clEnqueueNDRangeKernel cl.enqueueNDRAngeKernel +#define clEnqueueTask cl.enqueueTask +#define clEnqueueNativeKernel cl.enqueueNativeKernel +#define clEnqueueMarker cl.enqueueMarker +#define clEnqueueWaitForEvents cl.enqueueWaitForEvents +#define clEnqueueBarrier cl.enqueueBarrier +#define clGetExtensionFunctionAddress cl.getExtensionFunctionAddress + +#define CL_LOAD_FN(name, ref) \ + ref = dlsym(dll,name); \ + if (ref == NULL){ \ + dlclose(dll); \ + return CL_INVALID_PLATFORM; \ + } + + +#ifdef __cplusplus +} +#endif + +#endif /* DYNAMIC_CL_H */ diff --git a/vp8/common/opencl/filter_cl.c b/vp8/common/opencl/filter_cl.c new file mode 100644 index 000000000..c8ea91eea --- /dev/null +++ b/vp8/common/opencl/filter_cl.c @@ -0,0 +1,824 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include + +//ACW: Remove me after debugging. +#include +#include + +#include "vp8_opencl.h" +#include "filter_cl.h" +#include "../blockd.h" + +#define SIXTAP_FILTER_LEN 6 + +const char *filterCompileOptions = "-Ivp8/common/opencl -DVP8_FILTER_WEIGHT=128 -DVP8_FILTER_SHIFT=7 -DFILTER_OFFSET"; +const char *filter_cl_file_name = "vp8/common/opencl/filter_cl.cl"; + +#define STATIC_MEM 1 +#if STATIC_MEM +static cl_mem int_mem = NULL; +#endif + +void cl_destroy_filter(){ + + if (cl_data.filter_program) + clReleaseProgram(cl_data.filter_program); + + //VP8_CL_RELEASE_KERNEL(cl_data.vp8_block_variation_kernel); +#if !TWO_PASS_SIXTAP + VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x8_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x4_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict16x16_kernel); +#else + VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_first_pass_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_second_pass_kernel); +#endif + //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict4x4_kernel); + //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x4_kernel); + //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x8_kernel); + //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict16x16_kernel); + +#if MEM_COPY_KERNEL + VP8_CL_RELEASE_KERNEL(cl_data.vp8_memcpy_kernel); +#endif + + VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_first_pass_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_second_pass_kernel); + +#if STATIC_MEM + if (int_mem != NULL) + clReleaseMemObject(int_mem); + int_mem = NULL; +#endif + + 
cl_data.filter_program = NULL; +} + +int cl_init_filter() { + int err; + + + // Create the filter compute program from the file-defined source code + if ( cl_load_program(&cl_data.filter_program, filter_cl_file_name, + filterCompileOptions) != CL_SUCCESS ) + return VP8_CL_TRIED_BUT_FAILED; + + // Create the compute kernel in the program we wish to run +#if TWO_PASS_SIXTAP + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_first_pass_kernel,"vp8_filter_block2d_first_pass_kernel"); + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_second_pass_kernel,"vp8_filter_block2d_second_pass_kernel"); + VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_first_pass_kernel,vp8_filter_block2d_first_pass_kernel_size); + VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_second_pass_kernel,vp8_filter_block2d_second_pass_kernel_size); +#else + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict_kernel,"vp8_sixtap_predict_kernel"); + VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict_kernel,vp8_sixtap_predict_kernel_size); + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x8_kernel,"vp8_sixtap_predict8x8_kernel"); + VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x8_kernel,vp8_sixtap_predict8x8_kernel_size); + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x4_kernel,"vp8_sixtap_predict8x4_kernel"); + VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x4_kernel,vp8_sixtap_predict8x4_kernel_size); + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict16x16_kernel,"vp8_sixtap_predict16x16_kernel"); + VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict16x16_kernel,vp8_sixtap_predict16x16_kernel_size); +#endif + + //VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_first_pass_kernel,vp8_filter_block2d_bil_first_pass_kernel_size); + //VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_second_pass_kernel,vp8_filter_block2d_bil_second_pass_kernel_size); + 
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_first_pass_kernel,"vp8_filter_block2d_bil_first_pass_kernel"); + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_second_pass_kernel,"vp8_filter_block2d_bil_second_pass_kernel"); + + + //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict4x4_kernel,"vp8_bilinear_predict4x4_kernel"); + //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x4_kernel,"vp8_bilinear_predict8x4_kernel"); + //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x8_kernel,"vp8_bilinear_predict8x8_kernel"); + //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict16x16_kernel,"vp8_bilinear_predict16x16_kernel"); + +#if MEM_COPY_KERNEL + VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_memcpy_kernel,"vp8_memcpy_kernel"); + VP8_CL_CALC_LOCAL_SIZE(vp8_memcpy_kernel,vp8_memcpy_kernel_size); +#endif + +#if STATIC_MEM + VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,err); +#endif + + return CL_SUCCESS; +} + +void vp8_filter_block2d_first_pass_cl( + cl_command_queue cq, + cl_mem src_mem, + int src_offset, + cl_mem int_mem, + unsigned int src_pixels_per_line, + unsigned int int_height, + unsigned int int_width, + int xoffset +){ + int err; + size_t global = int_width*int_height; + size_t local = cl_data.vp8_filter_block2d_first_pass_kernel_size; + if (local > global) + local = global; + + err = clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 0, sizeof (cl_mem), &src_mem); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 1, sizeof (int), &src_offset); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 2, sizeof (cl_mem), &int_mem); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 3, sizeof (cl_uint), &src_pixels_per_line); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 4, sizeof (cl_uint), &int_height); + err |= 
clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 5, sizeof (cl_int), &int_width); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 6, sizeof (int), &xoffset); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to set kernel arguments!\n", + , + ); + + /* Execute the kernel */ + err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_first_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to execute kernel!\n", + printf("err = %d\n",err);, + ); +} + +void vp8_filter_block2d_second_pass_cl( + cl_command_queue cq, + cl_mem int_mem, + int int_offset, + cl_mem dst_mem, + int dst_offset, + int dst_pitch, + unsigned int output_height, + unsigned int output_width, + int yoffset +){ + int err; + size_t global = output_width*output_height; + size_t local = cl_data.vp8_filter_block2d_second_pass_kernel_size; + if (local > global){ + //printf("Local is now %ld\n",global); + local = global; + } + + /* Set kernel arguments */ + err = clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 0, sizeof (cl_mem), &int_mem); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 1, sizeof (int), &int_offset); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 2, sizeof (cl_mem), &dst_mem); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 3, sizeof (int), &dst_offset); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 4, sizeof (int), &dst_pitch); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 5, sizeof (int), &output_width); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 6, sizeof (int), &output_width); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 7, sizeof (int), &output_height); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 8, sizeof (int), &output_width); + err |= 
clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 9, sizeof (int), &yoffset); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to set kernel arguments!\n", + , + ); + + /* Execute the kernel */ + err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_second_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to execute kernel!\n", + printf("err = %d\n",err);, + ); +} + +void vp8_sixtap_single_pass( + cl_command_queue cq, + cl_kernel kernel, + size_t local, + size_t global, + cl_mem src_mem, + cl_mem dst_mem, + unsigned char *src_base, + int src_offset, + size_t src_len, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_base, + int dst_offset, + int dst_pitch, + size_t dst_len +){ + int err; + +#if !STATIC_MEM + cl_mem int_mem; +#endif + + int free_src = 0, free_dst = 0; + + if (local > global){ + local = global; + } + + /* Make space for kernel input/output data. + * Initialize the buffer as well if needed. 
+ */
+    if (src_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
+        src_offset = 2;
+        free_src = 1;
+    } else {
+        src_offset -= 2*src_pixels_per_line;
+    }
+
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
+        free_dst = 1;
+    }
+
+#if !STATIC_MEM
+    CL_CREATE_BUF( cq, int_mem,, sizeof(cl_int)*FData_height*FData_width, NULL,, );
+#endif
+
+    err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(kernel, 1, sizeof (int), &src_offset);
+    err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &src_pixels_per_line);
+    err |= clSetKernelArg(kernel, 3, sizeof (cl_int), &xoffset);
+    err |= clSetKernelArg(kernel, 4, sizeof (cl_int), &yoffset);
+    err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &dst_offset);
+    err |= clSetKernelArg(kernel, 7, sizeof (int), &dst_pitch);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        ,
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( cq, kernel, 1, NULL, &global, &local , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+    if (free_src == 1)
+        clReleaseMemObject(src_mem);
+
+    if (free_dst == 1){
+        /* Read back the result data from the device */
+        err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",
+            ,
+        );
+        clReleaseMemObject(dst_mem);
+    }
+}
+
+/*
+ * Two-pass six-tap prediction on command queue `cq`.
+ *
+ * vp8_filter_block2d_first_pass_cl() is handed src_mem, int_mem and xoffset;
+ * vp8_filter_block2d_second_pass_cl() is then handed int_mem (at int_offset),
+ * dst_mem and yoffset.
+ *
+ * src_mem == NULL: VP8_CL_CREATE_BUF is invoked for src_len bytes with
+ *     (src_base - 2), src_offset is reset to 2, and the buffer is released
+ *     before returning.  Otherwise src_offset is moved back by two rows
+ *     (2 * src_pixels_per_line) and the caller's buffer is used as-is.
+ * dst_mem == NULL: VP8_CL_CREATE_BUF is invoked for dst_len + dst_offset
+ *     bytes with dst_base; after the second pass the same number of bytes
+ *     is read back into dst_base and the buffer is released.
+ *
+ * NOTE(review): when STATIC_MEM is non-zero, int_mem is not declared in this
+ * function and must come from an enclosing scope -- confirm.  The !STATIC_MEM
+ * branch uses CL_CREATE_BUF while the rest of the file uses
+ * VP8_CL_CREATE_BUF -- confirm that macro exists before enabling that path.
+ */
+void vp8_sixtap_run_cl(
+    cl_command_queue cq,
+    cl_mem src_mem,
+    cl_mem dst_mem,
+    unsigned char *src_base,
+    int src_offset,
+    size_t src_len,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    size_t dst_len,
+    unsigned int FData_height,
+    unsigned int FData_width,
+    unsigned int output_height,
+    unsigned int output_width,
+    int int_offset
+)
+{
+    int err;
+
+#if !STATIC_MEM
+    cl_mem int_mem;
+#endif
+
+    int free_src = 0, free_dst = 0;
+
+    /* Make space for kernel input/output data.
+     * Initialize the buffer as well if needed.
+     */
+    if (src_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
+        src_offset = 2;
+        free_src = 1;
+    } else {
+        src_offset -= 2*src_pixels_per_line;
+    }
+
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
+        free_dst = 1;
+    }
+
+#if !STATIC_MEM
+    CL_CREATE_BUF( cq, int_mem,, sizeof(cl_int)*FData_height*FData_width, NULL,, );
+#endif
+
+    vp8_filter_block2d_first_pass_cl(
+        cq, src_mem, src_offset, int_mem, src_pixels_per_line,
+        FData_height, FData_width, xoffset
+    );
+
+    vp8_filter_block2d_second_pass_cl(cq,int_mem,int_offset,dst_mem,dst_offset,dst_pitch,
+        output_height,output_width,yoffset);
+
+    /* Releasing only drops our reference; the commands queued above still
+     * hold the buffer until they have finished. */
+    if (free_src == 1)
+        clReleaseMemObject(src_mem);
+
+    if (free_dst == 1){
+        /* Read back the result data from the device.
+         * The read is blocking (CL_TRUE): dst_base is ordinary host memory
+         * that the caller may touch as soon as we return, and a non-blocking
+         * read gives no guarantee that the copy has completed by then. */
+        err = clEnqueueReadBuffer(cq, dst_mem, CL_TRUE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",
+            ,
+        );
+        clReleaseMemObject(dst_mem);
+    }
+
+#if !STATIC_MEM
+    clReleaseMemObject(int_mem);
+#endif
+}
+
+void vp8_sixtap_predict4x4_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+
+    int output_width=4, output_height=4, FData_height=9, FData_width=4;
+
+    //Size of output to transfer
+    int dst_len = DST_LEN(dst_pitch,output_height,output_width);
+    int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
+
+#if
TWO_PASS_SIXTAP + int int_offset = 8; + unsigned char *src_ptr = src_base + src_offset; + + vp8_sixtap_run_cl(cq, src_mem, dst_mem, + (src_ptr-2*src_pixels_per_line),src_offset, src_len, + src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset, + dst_pitch,dst_len,FData_height,FData_width,output_height, + output_width,int_offset + ); +#else + vp8_sixtap_single_pass( + cq, + cl_data.vp8_sixtap_predict_kernel, + cl_data.vp8_sixtap_predict_kernel_size, + FData_height*FData_width, + src_mem, + dst_mem, + src_base, + src_offset, + src_len, + src_pixels_per_line, + xoffset, + yoffset, + dst_base, + dst_offset, + dst_pitch, + dst_len + ); +#endif + + + return; +} + +void vp8_sixtap_predict8x8_cl +( + cl_command_queue cq, + unsigned char *src_base, + cl_mem src_mem, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_base, + cl_mem dst_mem, + int dst_offset, + int dst_pitch +) { + int output_width=8, output_height=8, FData_height=13, FData_width=8; + + //Size of output to transfer + int dst_len = DST_LEN(dst_pitch,output_height,output_width); + int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line); + +#if TWO_PASS_SIXTAP + int int_offset = 16; + unsigned char *src_ptr = src_base + src_offset; + + vp8_sixtap_run_cl(cq, src_mem, dst_mem, + (src_ptr-2*src_pixels_per_line),src_offset, src_len, + src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset, + dst_pitch,dst_len,FData_height,FData_width,output_height, + output_width,int_offset + ); +#else + vp8_sixtap_single_pass( + cq, + cl_data.vp8_sixtap_predict8x8_kernel, + cl_data.vp8_sixtap_predict8x8_kernel_size, + FData_height*FData_width, + src_mem, + dst_mem, + src_base, + src_offset, + src_len, + src_pixels_per_line, + xoffset, + yoffset, + dst_base, + dst_offset, + dst_pitch, + dst_len + ); +#endif + + return; +} + +void vp8_sixtap_predict8x4_cl +( + cl_command_queue cq, + unsigned char *src_base, + cl_mem src_mem, + int src_offset, + int 
src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_base, + cl_mem dst_mem, + int dst_offset, + int dst_pitch +) { + + int output_width=8, output_height=4, FData_height=9, FData_width=8; + + //Size of output to transfer + int dst_len = DST_LEN(dst_pitch,output_height,output_width); + int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line); + +#if TWO_PASS_SIXTAP + int int_offset = 16; + unsigned char *src_ptr = src_base + src_offset; + + vp8_sixtap_run_cl(cq, src_mem, dst_mem, + (src_ptr-2*src_pixels_per_line),src_offset, src_len, + src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset, + dst_pitch,dst_len,FData_height,FData_width,output_height, + output_width,int_offset + ); +#else + vp8_sixtap_single_pass( + cq, + cl_data.vp8_sixtap_predict8x4_kernel, + cl_data.vp8_sixtap_predict8x4_kernel_size, + FData_height*FData_width, + src_mem, + dst_mem, + src_base, + src_offset, + src_len, + src_pixels_per_line, + xoffset, + yoffset, + dst_base, + dst_offset, + dst_pitch, + dst_len + ); +#endif + + return; +} + +void vp8_sixtap_predict16x16_cl +( + cl_command_queue cq, + unsigned char *src_base, + cl_mem src_mem, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_base, + cl_mem dst_mem, + int dst_offset, + int dst_pitch +) { + + int output_width=16, output_height=16, FData_height=21, FData_width=16; + + //Size of output to transfer + int dst_len = DST_LEN(dst_pitch,output_height,output_width); + int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line); + +#if TWO_PASS_SIXTAP + int int_offset = 32; + unsigned char *src_ptr = src_base + src_offset; + + vp8_sixtap_run_cl(cq, src_mem, dst_mem, + (src_ptr-2*src_pixels_per_line),src_offset, src_len, + src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset, + dst_pitch,dst_len,FData_height,FData_width,output_height, + output_width,int_offset + ); +#else + vp8_sixtap_single_pass( + cq, + 
cl_data.vp8_sixtap_predict16x16_kernel, + cl_data.vp8_sixtap_predict16x16_kernel_size, + FData_height*FData_width, + src_mem, + dst_mem, + src_base, + src_offset, + src_len, + src_pixels_per_line, + xoffset, + yoffset, + dst_base, + dst_offset, + dst_pitch, + dst_len + ); +#endif + + return; + +} + + + +void vp8_filter_block2d_bil_first_pass_cl( + cl_command_queue cq, + unsigned char *src_base, + cl_mem src_mem, + int src_offset, + cl_mem int_mem, + int src_pixels_per_line, + int height, + int width, + int xoffset +) +{ + int err; + size_t global = width*height; + int free_src = 0; + + if (src_mem == NULL){ + int src_len = BIL_SRC_LEN(width,height,src_pixels_per_line); + + /*Make space for kernel input/output data. Initialize the buffer as well if needed. */ + VP8_CL_CREATE_BUF(cq, src_mem, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, + sizeof (unsigned char) * src_len, src_base+src_offset,, + ); + src_offset = 0; //Set to zero as long as src_mem starts at base+offset + free_src = 1; + } + + err = clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 0, sizeof (cl_mem), &src_mem); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 1, sizeof (int), &src_offset); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 2, sizeof (cl_mem), &int_mem); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 3, sizeof (int), &src_pixels_per_line); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 4, sizeof (int), &height); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 5, sizeof (int), &width); + err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 6, sizeof (int), &xoffset); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to set kernel arguments!\n", + , + ); + + /* Execute the kernel */ + err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_bil_first_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL); + 
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+    if (free_src == 1)
+        clReleaseMemObject(src_mem);
+}
+
+
+/*
+ * Enqueue the bilinear second-pass kernel on `cq`.
+ *
+ * The kernel is given int_mem as input, dst_mem/dst_offset/dst_pitch as
+ * output, and height, width and yoffset; it is launched with
+ * width * height work-items.
+ *
+ * dst_mem == NULL: VP8_CL_CREATE_BUF is invoked for
+ *     DST_LEN(dst_pitch, height, width) + dst_offset bytes with dst_base;
+ *     after the kernel is enqueued the same number of bytes is read back
+ *     into dst_base and the temporary buffer is released.
+ */
+void vp8_filter_block2d_bil_second_pass_cl(
+    cl_command_queue cq,
+    cl_mem int_mem,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch,
+    int height,
+    int width,
+    int yoffset
+)
+{
+    int err;
+    size_t global = width*height;
+
+    //Size of output data
+    int dst_len = DST_LEN(dst_pitch,height,width);
+
+    int free_dst = 0;
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF(cq, dst_mem, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,
+            sizeof (unsigned char) * dst_len + dst_offset, dst_base,,
+        );
+        free_dst = 1;
+    }
+
+    err = clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 0, sizeof (cl_mem), &int_mem);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 1, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 2, sizeof (int), &dst_offset);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 3, sizeof (int), &dst_pitch);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 4, sizeof (int), &height);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 5, sizeof (int), &width);
+    err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 6, sizeof (int), &yoffset);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        ,
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_bil_second_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+    if (free_dst == 1){
+        /* Read back the result data from the device.
+         * The read is blocking (CL_TRUE): dst_base is ordinary host memory
+         * that the caller may touch as soon as we return, and a non-blocking
+         * read gives no guarantee that the copy has completed by then. */
+        err = clEnqueueReadBuffer(cq, dst_mem, CL_TRUE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",
+            ,
+        );
+        clReleaseMemObject(dst_mem);
+    }
+
+}
+
+/*
+ * 4x4 bilinear prediction: first pass over (height + 1) x width values with
+ * xoffset into int_mem, second pass over height x width values with yoffset
+ * into the destination.
+ *
+ * NOTE(review): when STATIC_MEM is non-zero, int_mem is not declared here and
+ * must come from an enclosing scope -- confirm.  In the !STATIC_MEM branch
+ * VP8_CL_CREATE_BUF is handed NULL for both the command queue and the flags
+ * argument, unlike every other call in this file -- confirm that is intended
+ * before enabling that path.
+ */
+void vp8_bilinear_predict4x4_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+
+    const int height = 4, width = 4;
+
+#if !STATIC_MEM
+    int err;
+    cl_mem int_mem = NULL;
+    VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+
+#if !STATIC_MEM
+    clReleaseMemObject(int_mem);
+#endif
+
+}
+
+void vp8_bilinear_predict8x8_cl
+(
+    cl_command_queue cq,
+    unsigned char *src_base,
+    cl_mem src_mem,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    unsigned char *dst_base,
+    cl_mem dst_mem,
+    int dst_offset,
+    int dst_pitch
+) {
+
+    const int height = 8, width = 8;
+
+#if !STATIC_MEM
+    int err;
+    cl_mem int_mem = NULL;
+    VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+
+    /* then 1-D vertically...
*/ + vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset); + +#if !STATIC_MEM + clReleaseMemObject(int_mem); +#endif + +} + +void vp8_bilinear_predict8x4_cl +( + cl_command_queue cq, + unsigned char *src_base, + cl_mem src_mem, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_base, + cl_mem dst_mem, + int dst_offset, + int dst_pitch +) { + + const int height = 4, width = 8; + +#if !STATIC_MEM + int err; + cl_mem int_mem = NULL; + VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,); +#endif + + /* First filter 1-D horizontally... */ + vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset); + + /* then 1-D vertically... */ + vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset); + +#if !STATIC_MEM + clReleaseMemObject(int_mem); +#endif + +} + +void vp8_bilinear_predict16x16_cl +( + cl_command_queue cq, + unsigned char *src_base, + cl_mem src_mem, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + unsigned char *dst_base, + cl_mem dst_mem, + int dst_offset, + int dst_pitch +) { + + const int height = 16, width = 16; + +#if !STATIC_MEM + int err; + cl_mem int_mem = NULL; + VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,); +#endif + + /* First filter 1-D horizontally... */ + vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset); + + /* then 1-D vertically... 
*/ + vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset); + +#if !STATIC_MEM + clReleaseMemObject(int_mem); +#endif + +} diff --git a/vp8/common/opencl/filter_cl.cl b/vp8/common/opencl/filter_cl.cl new file mode 100644 index 000000000..e8aadaa17 --- /dev/null +++ b/vp8/common/opencl/filter_cl.cl @@ -0,0 +1,562 @@ +#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable +#pragma OPENCL EXTENSION cl_amd_printf : enable + +__constant int bilinear_filters[8][2] = { + { 128, 0}, + { 112, 16}, + { 96, 32}, + { 80, 48}, + { 64, 64}, + { 48, 80}, + { 32, 96}, + { 16, 112} +}; + +__constant short sub_pel_filters[8][8] = { + //These were originally 8x6, but are padded for vector ops + { 0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */ + { 0, -6, 123, 12, -1, 0, 0, 0}, + { 2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */ + { 0, -9, 93, 50, -6, 0, 0, 0}, + { 3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */ + { 0, -6, 50, 93, -9, 0, 0, 0}, + { 1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */ + { 0, -1, 12, 123, -6, 0, 0, 0}, +}; + + +kernel void vp8_filter_block2d_first_pass_kernel( + __global unsigned char *src_base, + int src_offset, + __global int *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + int filter_offset +){ + uint tid = get_global_id(0); + + global unsigned char *src_ptr = &src_base[src_offset]; + //Note that src_offset will be reset later, which is why we use it now + + int Temp; + + __constant short *vp8_filter = sub_pel_filters[filter_offset]; + + if (tid < (output_width*output_height)){ + src_offset = tid + (tid/output_width * (src_pixels_per_line - output_width)); + + Temp = (int)(src_ptr[src_offset - 2] * vp8_filter[0]) + + (int)(src_ptr[src_offset - 1] * vp8_filter[1]) + + (int)(src_ptr[src_offset] * vp8_filter[2]) + + 
(int)(src_ptr[src_offset + 1] * vp8_filter[3]) + + (int)(src_ptr[src_offset + 2] * vp8_filter[4]) + + (int)(src_ptr[src_offset + 3] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if ( Temp > 255 ) + Temp = 255; + + output_ptr[tid] = Temp; + } + +} + +kernel void vp8_filter_block2d_second_pass_kernel +( + __global int *src_base, + int src_offset, + __global unsigned char *output_base, + int output_offset, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + int filter_offset +) { + + uint i = get_global_id(0); + + global int *src_ptr = &src_base[src_offset]; + global unsigned char *output_ptr = &output_base[output_offset]; + + int out_offset; //Not same as output_offset... + int Temp; + int PS2 = 2*(int)pixel_step; + int PS3 = 3*(int)pixel_step; + + unsigned int src_increment = src_pixels_per_line - output_width; + + __constant short *vp8_filter = sub_pel_filters[filter_offset]; + + if (i < (output_width * output_height)){ + out_offset = i/output_width; + src_offset = out_offset; + + src_offset = i + (src_offset * src_increment); + out_offset = i%output_width + (out_offset * output_pitch); + + /* Apply filter */ + Temp = ((int)src_ptr[src_offset - PS2] * vp8_filter[0]) + + ((int)src_ptr[src_offset -(int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[src_offset] * vp8_filter[2]) + + ((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) + + ((int)src_ptr[src_offset + PS2] * vp8_filter[4]) + + ((int)src_ptr[src_offset + PS3] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[out_offset] = (unsigned char)Temp; + } +} + + +kernel void vp8_filter_block2d_bil_first_pass_kernel( + __global unsigned char 
*src_base, + int src_offset, + __global int *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + int filter_offset +) +{ + uint tid = get_global_id(0); + + if (tid < output_width * output_height){ + global unsigned char *src_ptr = &src_base[src_offset]; + + unsigned int i, j; + __constant int *vp8_filter = bilinear_filters[filter_offset]; + + unsigned int out_row,out_offset; + int src_increment = src_pixels_per_line - output_width; + + i = tid / output_width; + j = tid % output_width; + + src_offset = i*(output_width+src_increment) + j; + out_row = output_width * i; + + out_offset = out_row + j; + + /* Apply bilinear filter */ + output_ptr[out_offset] = (((int)src_ptr[src_offset] * vp8_filter[0]) + + ((int)src_ptr[src_offset+1] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT; + } +} + +kernel void vp8_filter_block2d_bil_second_pass_kernel +( + __global int *src_ptr, + __global unsigned char *output_base, + int output_offset, + int output_pitch, + unsigned int output_height, + unsigned int output_width, + int filter_offset +) +{ + + uint tid = get_global_id(0); + + if (tid < output_width * output_height){ + global unsigned char *output_ptr = &output_base[output_offset]; + + unsigned int i, j; + int Temp; + __constant int *vp8_filter = bilinear_filters[filter_offset]; + + int out_offset; + int src_offset; + + i = tid / output_width; + j = tid % output_width; + + src_offset = i*(output_width) + j; + out_offset = i*output_pitch + j; + + /* Apply filter */ + Temp = ((int)src_ptr[src_offset] * vp8_filter[0]) + + ((int)src_ptr[src_offset+output_width] * vp8_filter[1]) + + (VP8_FILTER_WEIGHT / 2); + + output_ptr[out_offset++] = (unsigned int)(Temp >> VP8_FILTER_SHIFT); + } +} + + + + +//Called from reconinter_cl.c +kernel void vp8_memcpy_kernel( + global unsigned char *src_base, + int src_offset, + int src_stride, + global unsigned char *dst_base, + int dst_offset, + int dst_stride, + int num_bytes, 
+    int num_iter
+){
+
+    int i,r;
+    global unsigned char *src = &src_base[src_offset];
+    global unsigned char *dst = &dst_base[dst_offset];
+    src_offset = dst_offset = 0;
+
+    r = get_global_id(1);
+    if (r < get_global_size(1)){
+        i = get_global_id(0);
+        if (i < get_global_size(0)){
+            src_offset = r*src_stride + i;
+            dst_offset = r*dst_stride + i;
+            dst[dst_offset] = src[src_offset];
+        }
+    }
+}
+
+//Not used currently.
+/*
+ * Each work-item with id < size/2 stores newval into one short of mem,
+ * starting at element index `offset`.
+ *
+ * The guard treats size/2 as the element count (presumably size is a byte
+ * count -- confirm against the caller once one exists).  The store used to
+ * index with tid/2, so neighbouring work-items hit the same element and only
+ * the first half of the guarded range was written; it now indexes with tid.
+ */
+void vp8_memset_short(
+    global short *mem,
+    int offset,
+    short newval,
+    unsigned int size
+)
+{
+    /* unsigned, so the comparison against size/2 is not mixed-sign */
+    uint tid = get_global_id(0);
+
+    if (tid < (size/2)){
+        mem[offset+tid] = newval;
+    }
+}
+
+
+
+/*
+ * Combined bilinear prediction kernels (4x4, 8x8, 8x4, 16x16).
+ *
+ * Each one runs the first pass over (Height + 1) x Width values with xoffset
+ * into int_mem and then the second pass over Height x Width values with
+ * yoffset into dst_base.
+ *
+ * The second pass reads int_mem entries that other work-items wrote in the
+ * first pass, so a barrier separates the two -- the same reason
+ * vp8_filter_block2d_first_pass() ends with one.  A barrier only orders
+ * work-items of one work-group; these kernels therefore need to be enqueued
+ * as a single work-group.
+ */
+__kernel void vp8_bilinear_predict4x4_kernel
+(
+    __global unsigned char *src_base,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    __global int *int_mem
+)
+{
+    int Height = 4, Width = 4;
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+
+    /* make the first-pass writes to int_mem visible before anyone reads them */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+}
+
+__kernel void vp8_bilinear_predict8x8_kernel
+(
+    __global unsigned char *src_base,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    __global int *int_mem
+)
+{
+    int Height = 8, Width = 8;
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+
+    /* make the first-pass writes to int_mem visible before anyone reads them */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+
+}
+
+__kernel void vp8_bilinear_predict8x4_kernel
+(
+    __global unsigned char *src_base,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    __global int *int_mem
+)
+{
+    int Height = 4, Width = 8;
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+
+    /* make the first-pass writes to int_mem visible before anyone reads them */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+}
+
+__kernel void vp8_bilinear_predict16x16_kernel
+(
+    __global unsigned char *src_base,
+    int src_offset,
+    int src_pixels_per_line,
+    int xoffset,
+    int yoffset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int dst_pitch,
+    __global int *int_mem
+)
+{
+
+    int Height = 16, Width = 16;
+
+    /* First filter 1-D horizontally... */
+    vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
+
+    /* make the first-pass writes to int_mem visible before anyone reads them */
+    barrier(CLK_GLOBAL_MEM_FENCE);
+
+    /* then 1-D vertically... */
+    vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
+
+}
+
+void vp8_filter_block2d_first_pass(
+    global unsigned char *src_base,
+    int src_offset,
+    local int *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    int filter_offset
+){
+    uint tid = get_global_id(0);
+    uint i = tid;
+
+    int nthreads = get_global_size(0);
+    int ngroups = nthreads / get_local_size(0);
+
+    global unsigned char *src_ptr = &src_base[src_offset];
+    //Note that src_offset will be reset later, which is why we capture it now
+
+    int Temp;
+
+    __constant short *vp8_filter = sub_pel_filters[filter_offset];
+
+    if (tid < (output_width*output_height)){
+        short filter0 = vp8_filter[0];
+        short filter1 = vp8_filter[1];
+        short filter2 = vp8_filter[2];
+        short filter3 = vp8_filter[3];
+        short filter4 = vp8_filter[4];
+        short filter5 = vp8_filter[5];
+
+        if (ngroups > 1){
+            //This is generally only true on Apple CPU-CL, which gives a group
+            //size of 1, regardless of the CPU core count.
+ for (i=0; i < output_width*output_height; i++){ + src_offset = i + (i/output_width * (src_pixels_per_line - output_width)); + + Temp = (int)(src_ptr[src_offset - 2] * filter0) + + (int)(src_ptr[src_offset - 1] * filter1) + + (int)(src_ptr[src_offset] * filter2) + + (int)(src_ptr[src_offset + 1] * filter3) + + (int)(src_ptr[src_offset + 2] * filter4) + + (int)(src_ptr[src_offset + 3] * filter5) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp >>= VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if ( Temp > 255 ) + Temp = 255; + + output_ptr[i] = Temp; + } + } else { + src_offset = i + (i/output_width * (src_pixels_per_line - output_width)); + + Temp = (int)(src_ptr[src_offset - 2] * filter0) + + (int)(src_ptr[src_offset - 1] * filter1) + + (int)(src_ptr[src_offset] * filter2) + + (int)(src_ptr[src_offset + 1] * filter3) + + (int)(src_ptr[src_offset + 2] * filter4) + + (int)(src_ptr[src_offset + 3] * filter5) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp >>= VP8_FILTER_SHIFT; + + if (Temp < 0) + Temp = 0; + else if ( Temp > 255 ) + Temp = 255; + + output_ptr[i] = Temp; + } + } + + //Add a fence so that no 2nd pass stuff starts before 1st pass writes are done. + barrier(CLK_LOCAL_MEM_FENCE); +} + +void vp8_filter_block2d_second_pass +( + local int *src_ptr, + global unsigned char *output_base, + int output_offset, + int output_pitch, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + int filter_offset +) { + + global unsigned char *output_ptr = &output_base[output_offset]; + + int out_offset; //Not same as output_offset... 
+ int src_offset; + int Temp; + int PS2 = 2*(int)pixel_step; + int PS3 = 3*(int)pixel_step; + + unsigned int src_increment = src_pixels_per_line - output_width; + + uint i = get_global_id(0); + + __constant short *vp8_filter = sub_pel_filters[filter_offset]; + + if (i < (output_width * output_height)){ + out_offset = i/output_width; + src_offset = out_offset; + + src_offset = i + (src_offset * src_increment); + out_offset = i%output_width + (out_offset * output_pitch); + + /* Apply filter */ + Temp = ((int)src_ptr[src_offset - PS2] * vp8_filter[0]) + + ((int)src_ptr[src_offset -(int)pixel_step] * vp8_filter[1]) + + ((int)src_ptr[src_offset] * vp8_filter[2]) + + ((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) + + ((int)src_ptr[src_offset + PS2] * vp8_filter[4]) + + ((int)src_ptr[src_offset + PS3] * vp8_filter[5]) + + (VP8_FILTER_WEIGHT >> 1); /* Rounding */ + + /* Normalize back to 0-255 */ + Temp = Temp >> VP8_FILTER_SHIFT; + if (Temp < 0) + Temp = 0; + else if (Temp > 255) + Temp = 255; + + output_ptr[out_offset] = (unsigned char)Temp; + } +} + +__kernel void vp8_sixtap_predict_kernel +( + __global unsigned char *src_ptr, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + __global unsigned char *dst_ptr, + int dst_offset, + int dst_pitch +) +{ + + local int FData[9*4]; + + /* First filter 1-D horizontally... */ + vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 4, xoffset); + + /* then filter vertically... */ + vp8_filter_block2d_second_pass(&FData[8], dst_ptr, dst_offset, dst_pitch, 4, 4, 4, 4, yoffset); +} + +__kernel void vp8_sixtap_predict8x8_kernel +( + __global unsigned char *src_ptr, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + __global unsigned char *dst_ptr, + int dst_offset, + int dst_pitch +) +{ + local int FData[13*16]; /* Temp data bufffer used in filtering */ + + /* First filter 1-D horizontally... 
*/ + vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 13, 8, xoffset); + + /* then filter vertically... */ + vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 8, 8, yoffset); + +} + +__kernel void vp8_sixtap_predict8x4_kernel +( + __global unsigned char *src_ptr, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + __global unsigned char *dst_ptr, + int dst_offset, + int dst_pitch +) +{ + local int FData[13*16]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 8, xoffset); + + /* then filter verticaly... */ + vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 4, 8, yoffset); +} + +__kernel void vp8_sixtap_predict16x16_kernel +( + __global unsigned char *src_ptr, + int src_offset, + int src_pixels_per_line, + int xoffset, + int yoffset, + __global unsigned char *dst_ptr, + int dst_offset, + int dst_pitch +) +{ + local int FData[21*24]; /* Temp data buffer used in filtering */ + + /* First filter 1-D horizontally... */ + vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 21, 16, xoffset); + + /* then filter verticaly... */ + vp8_filter_block2d_second_pass(&FData[32], dst_ptr, dst_offset, dst_pitch, 16, 16, 16, 16, yoffset); + + return; +} diff --git a/vp8/common/opencl/filter_cl.h b/vp8/common/opencl/filter_cl.h new file mode 100644 index 000000000..1edcf51cc --- /dev/null +++ b/vp8/common/opencl/filter_cl.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_CL_H_
+#define FILTER_CL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp8_opencl.h"
+
+/* Rounding weight and normalisation shift: 128 == 1 << 7. */
+#define VP8_FILTER_WEIGHT 128
+#define VP8_FILTER_SHIFT 7
+
+#define REGISTER_FILTER 1
+/* Statement macro: clamps the lvalue x into [min, max] in place.
+ * It expands to a complete if/else statement that already ends in ';', so it
+ * cannot be used as an expression and must be braced when it is the body of
+ * an outer if/else.  x, min and max are evaluated more than once. */
+#define CLAMP(x,min,max) if ((x) < (min)) (x) = (min); else if ( (x) > (max) ) (x) = (max);
+#define PRE_CALC_PIXEL_STEPS 1
+#define PRE_CALC_SRC_INCREMENT 1
+
+/* PS2 / PS3: two and three pixel steps, either taken from pre-computed
+ * variables or computed at the point of use. */
+#if PRE_CALC_PIXEL_STEPS
+#define PS2 two_pixel_steps
+#define PS3 three_pixel_steps
+#else
+#define PS2 (2*(int)pixel_step)
+#define PS3 (3*(int)pixel_step)
+#endif
+
+/* FILTER0..FILTER5: the six filter taps, either from local copies
+ * (filter0..filter5) or read from vp8_filter[] on every use. */
+#if REGISTER_FILTER
+#define FILTER0 filter0
+#define FILTER1 filter1
+#define FILTER2 filter2
+#define FILTER3 filter3
+#define FILTER4 filter4
+#define FILTER5 filter5
+#else
+#define FILTER0 vp8_filter[0]
+#define FILTER1 vp8_filter[1]
+#define FILTER2 vp8_filter[2]
+#define FILTER3 vp8_filter[3]
+#define FILTER4 vp8_filter[4]
+#define FILTER5 vp8_filter[5]
+#endif
+
+#if PRE_CALC_SRC_INCREMENT
+#define SRC_INCREMENT src_increment
+#else
+#define SRC_INCREMENT (src_pixels_per_line - output_width)
+#endif
+
+#define FILTER_OFFSET //Filter data stored as CL constant memory
+#define FILTER_REF sub_pel_filters[filter_offset]
+
+extern const char *filterCompileOptions;
+extern const char *filter_cl_file_name;
+
+//Copy the -2*pixel_step (and ps*3) bytes because the filter algorithm
+//accesses negative indexes
+//Every macro argument is parenthesised so that expression arguments
+//(e.g. a pitch passed as "a + b") keep their intended precedence.
+#define SIXTAP_SRC_LEN(out_width,out_height,src_px) ((out_width)*(out_height) + (((out_width)*(out_height)-1)/(out_width))*((src_px) - (out_width)) + 5)
+#define BIL_SRC_LEN(out_width,out_height,src_px) ((out_height) * (src_px) + (out_width))
+#define DST_LEN(dst_pitch,dst_height,dst_width) ((dst_pitch) * (dst_height) + (dst_width))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FILTER_CL_H_ */
diff --git a/vp8/common/opencl/idct_cl.h b/vp8/common/opencl/idct_cl.h
new file mode 100644
index 000000000..7358a11c8
--- /dev/null
+++ b/vp8/common/opencl/idct_cl.h
@@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef IDCT_OPENCL_H +#define IDCT_OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp8_opencl.h" +#include "vp8/common/blockd.h" + +#define prototype_second_order_cl(sym) \ + void sym(BLOCKD *b) + +#define prototype_idct_cl(sym) \ + void sym(BLOCKD *b, int pitch) + +#define prototype_idct_scalar_add_cl(sym) \ + void sym(BLOCKD *b, cl_int use_diff, int diff_offset, int qcoeff_offset, \ + int pred_offset, unsigned char *output, cl_mem out_mem, int out_offset, size_t out_size, \ + int pitch, int stride)\ + + +extern prototype_idct_cl(vp8_short_idct4x4llm_1_cl); +extern prototype_idct_cl(vp8_short_idct4x4llm_cl); +extern prototype_idct_scalar_add_cl(vp8_dc_only_idct_add_cl); + +extern prototype_second_order_cl(vp8_short_inv_walsh4x4_1_cl); +extern prototype_second_order_cl(vp8_short_inv_walsh4x4_cl); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/vp8/common/opencl/idctllm_cl.c b/vp8/common/opencl/idctllm_cl.c new file mode 100644 index 000000000..bfee80b28 --- /dev/null +++ b/vp8/common/opencl/idctllm_cl.c @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include + +//ACW: Remove me after debugging. 
+#include
+#include
+/* NOTE(review): the header names on the two #include lines above appear to
+ * have been lost; printf() is used in this file, so <stdio.h> is presumably
+ * one of them - confirm against the upstream file. */
+
+#include "idct_cl.h"
+#include "idctllm_cl.h"
+#include "blockd_cl.h"
+
+/*
+ * Release the IDCT OpenCL program (if one was loaded) and the kernels that
+ * cl_init_idct() creates. The program handle is reset to NULL afterwards.
+ * The two idct4x4llm kernels are not released because their creation is
+ * disabled in cl_init_idct().
+ */
+void cl_destroy_idct(){
+
+    if (cl_data.idct_program)
+        clReleaseProgram(cl_data.idct_program);
+
+    cl_data.idct_program = NULL;
+
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dc_only_idct_add_kernel);
+    //VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_idct4x4llm_1_kernel);
+    //VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_idct4x4llm_kernel);
+
+}
+
+/*
+ * Load the IDCT/Walsh OpenCL program and create the kernels used by the
+ * decoder.
+ *
+ * Returns VP8_CL_TRIED_BUT_FAILED if the program cannot be loaded, and
+ * CL_SUCCESS once every kernel-creation macro has run.
+ * NOTE(review): VP8_CL_CREATE_KERNEL is defined elsewhere; it presumably
+ * assigns `err` and returns early on failure - confirm before removing `err`.
+ */
+int cl_init_idct() {
+    int err;
+
+    // Create the IDCT compute program from the file-defined source code
+    if (cl_load_program(&cl_data.idct_program, idctllm_cl_file_name,
+            idctCompileOptions) != CL_SUCCESS)
+        return VP8_CL_TRIED_BUT_FAILED;
+
+    // Create the compute kernel in the program we wish to run
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1_kernel,"vp8_short_inv_walsh4x4_1_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1st_pass_kernel,"vp8_short_inv_walsh4x4_1st_pass_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_2nd_pass_kernel,"vp8_short_inv_walsh4x4_2nd_pass_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_dc_only_idct_add_kernel,"vp8_dc_only_idct_add_kernel");
+
+    ////idct4x4llm kernels are only useful for the encoder
+    //VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_1_kernel,"vp8_short_idct4x4llm_1_kernel");
+    //VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_kernel,"vp8_short_idct4x4llm_kernel");
+
+    return CL_SUCCESS;
+}
+
+/* Arguments and result are parenthesised so expressions such as
+ * max(a & b, c) expand with the intended precedence. Each argument is still
+ * evaluated more than once, so do not pass expressions with side effects. */
+#define max(x,y) ((x) > (y) ? (x) : (y))
+//#define NO_CL
+
+/* Only useful for encoder... Untested...
*/
+/*
+ * Full 4x4 inverse DCT of block b on the OpenCL device.
+ *
+ * Input is the 16 dequantized coefficients at
+ * b->dqcoeff_base + b->dqcoeff_offset; the result is written to
+ * b->diff_base[b->diff_offset], where `pitch` is the output row pitch in
+ * bytes (the kernel uses pitch/2 shorts per row).
+ *
+ * When cl_initialized != CL_SUCCESS the C implementation
+ * vp8_short_idct4x4llm_c() is called instead. The same C routine is passed
+ * to the VP8_CL_* macros as the failure fallback.
+ * NOTE(review): those macros are defined elsewhere; they presumably run the
+ * fallback and return on error - confirm, and confirm whether src_mem/dst_mem
+ * need releasing on those paths.
+ * NOTE(review): creation of vp8_short_idct4x4llm_kernel is commented out in
+ * cl_init_idct(), so this path cannot succeed until that is re-enabled.
+ */
+void vp8_short_idct4x4llm_cl(BLOCKD *b, int pitch)
+{
+    int err;
+
+    short *input = b->dqcoeff_base + b->dqcoeff_offset;
+    short *output = &b->diff_base[b->diff_offset];
+
+    cl_mem src_mem, dst_mem;
+
+    //1 instance for now. This should be split into 2-pass * 4 thread.
+    size_t global = 1;
+
+    if (cl_initialized != CL_SUCCESS){
+        vp8_short_idct4x4llm_c(input,output,pitch);
+        return;
+    }
+
+    VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
+        sizeof(short)*16, input,
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    /* 4 rows of 4 shorts, rows pitch/2 shorts apart: 3 full strides + 4 */
+    VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+        sizeof(short)*(4+(pitch/2)*3), output,
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 1, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 2, sizeof (int), &pitch);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    /* Read back the result data from the device.
+     * The read is blocking: this function returns right after it, and the
+     * caller expects `output` to be populated, exactly as with the C path. */
+    err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_TRUE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to read output array!\n",
+        vp8_short_idct4x4llm_c(input,output,pitch),
+    );
+
+    clReleaseMemObject(src_mem);
+    clReleaseMemObject(dst_mem);
+
+    return;
+}
+
+/* Only useful for encoder... Untested...
*/
+/*
+ * DC-only 4x4 inverse DCT of block b on the OpenCL device.
+ *
+ * Only the first dequantized coefficient is uploaded (sizeof(short)); the
+ * kernel is launched with 4 work-items and the result is read back into
+ * b->diff_base[b->diff_offset]. `pitch` is the output row pitch in bytes.
+ * Falls back to vp8_short_idct4x4llm_1_c() when cl_initialized != CL_SUCCESS;
+ * the same routine is handed to the VP8_CL_* macros as the error fallback.
+ * NOTE(review): creation of vp8_short_idct4x4llm_1_kernel is commented out in
+ * cl_init_idct(), so the CL path cannot succeed until that is re-enabled.
+ */
+void vp8_short_idct4x4llm_1_cl(BLOCKD *b, int pitch)
+{
+    int err;
+    size_t global = 4;
+
+    short *input = b->dqcoeff_base + b->dqcoeff_offset;
+    short *output = &b->diff_base[b->diff_offset];
+
+    cl_mem src_mem, dst_mem;
+
+    if (cl_initialized != CL_SUCCESS){
+        vp8_short_idct4x4llm_1_c(input,output,pitch);
+        return;
+    }
+
+    printf("vp8_short_idct4x4llm_1_cl\n");
+
+    VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
+        sizeof(short), input,
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    /* 4 rows of 4 shorts, rows pitch/2 shorts apart: 3 full strides + 4 */
+    VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+        sizeof(short)*(4+(pitch/2)*3), output,
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 0, sizeof (cl_mem), &src_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 1, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 2, sizeof (int), &pitch);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_1_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    /* Read back the result data from the device.
+     * Blocking, because the caller uses `output` as soon as we return. */
+    err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_TRUE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to read output array!\n",
+        vp8_short_idct4x4llm_1_c(input,output,pitch),
+    );
+
+    clReleaseMemObject(src_mem);
+    clReleaseMemObject(dst_mem);
+
+    return;
+
+}
+
+/*
+ * Enqueue the DC-only IDCT + add-to-predictor kernel for one 4x4 block
+ * (16 work-items, one per output pixel).
+ *
+ * use_diff      passed to the kernel; 1 selects the DC value from the diff
+ *               buffer, anything else from qcoeff * dequant[0]
+ * diff_offset   element offset into b->cl_diff_mem
+ * qcoeff_offset element offset into b->cl_qcoeff_mem
+ * pred_offset   byte offset into b->cl_predictor_mem
+ * dst_base      host destination; only used when dst_mem is NULL
+ * dst_mem       device destination, or NULL to have a temporary buffer of
+ *               dest_size bytes created from dst_base, read back into
+ *               dst_base after the kernel, and released
+ * dst_offset    byte offset of the block inside the destination
+ * pitch/stride  predictor / destination row strides in bytes
+ *
+ * Unlike the other entry points there is no cl_initialized check and no C
+ * fallback is passed to the VP8_CL_* macros.
+ */
+void vp8_dc_only_idct_add_cl(BLOCKD *b, cl_int use_diff, int diff_offset,
+        int qcoeff_offset, int pred_offset,
+        unsigned char *dst_base, cl_mem dst_mem, int dst_offset, size_t dest_size,
+        int pitch, int stride
+)
+{
+
+    int err;
+    size_t global = 16;
+
+    int free_mem = 0;
+    //cl_mem dest_mem = NULL;
+
+    if (dst_mem == NULL){
+        VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+            dest_size, dst_base,,
+        );
+        free_mem = 1;
+    }
+
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_predictor_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 1, sizeof (int), &pred_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 2, sizeof (cl_mem), &dst_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 3, sizeof (int), &dst_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 4, sizeof (int), &pitch);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 5, sizeof (int), &stride);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 6, sizeof (cl_int), &use_diff);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 7, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 8, sizeof (int), &diff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 9, sizeof (cl_mem), &b->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 10, sizeof (int), &qcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 11, sizeof (cl_mem), &b->cl_dequant_mem);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",,
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_dc_only_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+
+    if (free_mem == 1){
+        /* Read back the result data from the device.
+         * Blocking: the temporary buffer is released and the function
+         * returns immediately afterwards, so dst_base must be complete. */
+        err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_TRUE, 0,
+                dest_size, dst_base, 0, NULL, NULL);
+
+        VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",,
+        );
+
+        clReleaseMemObject(dst_mem);
+    }
+
+    return;
+}
+
+/*
+ * Inverse 4x4 Walsh-Hadamard transform of block b, run as two kernels
+ * (first pass, then second pass) of 4 work-items each. Both operate on the
+ * device buffers b->cl_dqcoeff_mem / b->cl_diff_mem at b->dqcoeff_offset /
+ * b->diff_offset; nothing is read back to the host here.
+ * Falls back to vp8_short_inv_walsh4x4_c() on the host buffers when
+ * cl_initialized != CL_SUCCESS, and passes the same routine to the
+ * VP8_CL_CHECK_SUCCESS macro as the error fallback.
+ */
+void vp8_short_inv_walsh4x4_cl(BLOCKD *b)
+{
+    int err;
+    size_t global = 4;
+
+    if (cl_initialized != CL_SUCCESS){
+        vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset,&b->diff_base[b->diff_offset]);
+        return;
+    }
+
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 1, sizeof(int), &b->dqcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 2, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 3, sizeof(int), &b->diff_offset);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+    );
+
+    //Second pass
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 0, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 1, sizeof(int), &b->diff_offset);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+    );
+
+    return;
+}
+
+/*
+ * DC-only inverse Walsh-Hadamard transform of block b: a single kernel of
+ * 4 work-items on b->cl_dqcoeff_mem / b->cl_diff_mem. Nothing is read back
+ * to the host here. Falls back to vp8_short_inv_walsh4x4_1_c() on the host
+ * buffers when cl_initialized != CL_SUCCESS, and passes it to the
+ * VP8_CL_CHECK_SUCCESS macro as the error fallback.
+ */
+void vp8_short_inv_walsh4x4_1_cl(BLOCKD *b)
+{
+
+    int err;
+    size_t global = 4;
+
+    if (cl_initialized != CL_SUCCESS){
+        vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
+            &b->diff_base[b->diff_offset]);
+        return;
+    }
+
+    //Set arguments and run kernel
+    err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 1, sizeof (int), &b->dqcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 2, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 3, sizeof (int), &b->diff_offset);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
+            &b->diff_base[b->diff_offset]),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_1_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);
+        vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
+            &b->diff_base[b->diff_offset]),
+    );
+
+    return;
+}
diff --git a/vp8/common/opencl/idctllm_cl.cl b/vp8/common/opencl/idctllm_cl.cl
new file mode 100644
index 000000000..6a8439a37
--- /dev/null
+++ b/vp8/common/opencl/idctllm_cl.cl
@@ -0,0 +1,309 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL
EXTENSION cl_amd_printf : enable
+
+/* Fixed-point constants of the VP8 inverse DCT; products are shifted
+ * right by 16 after multiplication. */
+__constant int cospi8sqrt2minus1 = 20091;
+__constant int sinpi8sqrt2 = 35468;
+__constant int rounding = 0;
+
+
+kernel void vp8_short_idct4x4llm_1st_pass_kernel(global short*,global short *,int);
+kernel void vp8_short_idct4x4llm_2nd_pass_kernel(global short*,int);
+
+
+/* Full 4x4 inverse DCT: runs both passes back to back in one work-item.
+ * `pitch` is the output row pitch in bytes. */
+__kernel void vp8_short_idct4x4llm_kernel(
+    __global short *input,
+    __global short *output,
+    int pitch
+){
+    vp8_short_idct4x4llm_1st_pass_kernel(input,output,pitch);
+    vp8_short_idct4x4llm_2nd_pass_kernel(output,pitch);
+}
+
+/* First (vertical) IDCT pass: for each of the 4 columns, combines input
+ * elements 0/4/8/12 and writes 4 outputs spaced pitch/2 shorts apart. */
+__kernel void vp8_short_idct4x4llm_1st_pass_kernel(
+    __global short *ip,
+    __global short *op,
+    int pitch
+)
+{
+    int i;
+    int a1, b1, c1, d1;
+
+    int temp1, temp2;
+    int shortpitch = pitch >> 1;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[8];
+        b1 = ip[0] - ip[8];
+
+        temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+        temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+        temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+        d1 = temp1 + temp2;
+
+        op[shortpitch*0] = a1 + d1;
+        op[shortpitch*3] = a1 - d1;
+
+        op[shortpitch*1] = b1 + c1;
+        op[shortpitch*2] = b1 - c1;
+
+        ip++;
+        op++;
+    }
+
+    return;
+}
+
+/* Second (horizontal) IDCT pass, in place on `output`: for each of the 4
+ * rows (pitch/2 shorts apart) combines elements 0..3 and stores the result
+ * rounded with (x + 4) >> 3. */
+__kernel void vp8_short_idct4x4llm_2nd_pass_kernel(
+    __global short *output,
+    int pitch
+)
+{
+    int i;
+    int a1, b1, c1, d1;
+
+    int temp1, temp2;
+    int shortpitch = pitch >> 1;
+    __global short *ip = output;
+    __global short *op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[2];
+        b1 = ip[0] - ip[2];
+
+        temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
+        temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
+        temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+        d1 = temp1 + temp2;
+
+        op[0] = (a1 + d1 + 4) >> 3;
+        op[3] = (a1 - d1 + 4) >> 3;
+
+        op[1] = (b1 + c1 + 4) >> 3;
+        op[2] = (b1 - c1 + 4) >> 3;
+
+        ip += shortpitch;
+        op += shortpitch;
+    }
+
+    return;
+}
+
+/* DC-only IDCT: every output element is (input[0] + 4) >> 3.
+ * Work-items 0..3 each fill one output row of 4 shorts. */
+__kernel void vp8_short_idct4x4llm_1_kernel(
+    __global short *input,
+    __global short *output,
+    int pitch
+)
+{
+    int a1;
+    int out_offset;
+    int shortpitch = pitch >> 1;
+
+    //short4 a;
+    a1 = ((input[0] + 4) >> 3);
+    //a = a1;
+
+    int tid = get_global_id(0);
+    if (tid < 4){
+        out_offset = shortpitch * tid;
+
+        //vstore4(a,0,&output[out_offset];
+        output[out_offset] = a1;
+        output[out_offset+1] = a1;
+        output[out_offset+2] = a1;
+        output[out_offset+3] = a1;
+    }
+}
+
+/* DC-only IDCT added to the predictor. Work-items 0..15 each produce one
+ * pixel of the 4x4 block: the DC value (from diff_base when use_diff == 1,
+ * otherwise qcoeff * dequant[0]) is rounded with (x + 4) >> 3, added to the
+ * predictor pixel, clamped to 0..255 and stored in the destination.
+ * pitch is the predictor row stride, stride the destination row stride. */
+__kernel void vp8_dc_only_idct_add_kernel(
+    __global unsigned char *pred_base,
+    int pred_offset,
+    __global unsigned char *dst_base,
+    int dst_offset,
+    int pitch,
+    int stride,
+    int use_diff,
+    global short *diff_base,
+    int diff_offset,
+    global short *qcoeff_base,
+    int qcoeff_offset,
+    global short *dequant
+)
+{
+    int r, c;
+    //int pred_offset;
+    global unsigned char *pred_ptr = &pred_base[pred_offset];
+    global unsigned char *dst_ptr = &dst_base[dst_offset];
+
+    int tid = get_global_id(0);
+
+    int a1;
+
+    if (tid < 16){
+
+        if (use_diff == 1){
+            a1 = diff_base[diff_offset];
+        } else {
+            a1 = qcoeff_base[qcoeff_offset] * dequant[0];
+        }
+        a1 = (a1 + 4)>>3;
+
+        r = tid / 4;
+        c = tid % 4;
+
+        /* pred_ptr already includes the block offset, so pred_offset is
+         * reused here as the row offset inside the block */
+        pred_offset = r * pitch;
+        dst_offset += r * stride;
+        int a = a1 + pred_ptr[pred_offset + c] ;
+
+        if (a < 0)
+            a = 0;
+        else if (a > 255)
+            a = 255;
+
+        dst_base[dst_offset + c] = (unsigned char) a ;
+    }
+}
+
+
+/* First pass of the inverse Walsh-Hadamard transform. Work-items 0..3 each
+ * process one column: elements tid, tid+4, tid+8, tid+12 of the input at
+ * src_base + src_offset are combined and written to the same positions of
+ * the output at output_base + out_offset. */
+__kernel void vp8_short_inv_walsh4x4_1st_pass_kernel(
+    __global short *src_base,
+    int src_offset,
+    __global short *output_base,
+    int out_offset
+)
+{
+
+    __global short *input = src_base + src_offset;
+    /* The output must be addressed with out_offset (not src_offset): the
+     * second pass reads this data back from output_base + out_offset. */
+    __global short *output = output_base + out_offset;
+    int tid = get_global_id(0);
+
+#define VEC_WALSH 0
+#if VEC_WALSH
+    //4-short vectors to calculate things in
+    short4 a,b,c,d, a2v, b2v, c2v, d2v, a1t, b1t, c1t, d1t;
+    short16 out;
+
+    if (tid == 0){
+        //first pass loop in vector form
+        a = vload4(0,input) + vload4(3,input);
+        b = vload4(1,input) + vload4(2,input);
+        c = vload4(1,input) - vload4(2,input);
+        d = vload4(0,input) - vload4(3,input);
+        vstore4(a + b, 0, output);
+        vstore4(c + d, 1, output);
+        vstore4(a - b, 2, output);
+        vstore4(d - c, 3, output);
+
+        return;
+
+        //2nd pass
+        a = (short4)(output[0], output[4], output[8], output[12]);
+        b = (short4)(output[1], output[5], output[9], output[13]);
+        c = (short4)(output[1], output[5], output[9], output[13]);
+        d = (short4)(output[0], output[4], output[8], output[12]);
+        a1t = (short4)(output[3], output[7], output[11], output[15]);
+        b1t = (short4)(output[2], output[6], output[10], output[14]);
+        c1t = (short4)(output[2], output[6], output[10], output[14]);
+        d1t = (short4)(output[3], output[7], output[11], output[15]);
+
+        a = a + a1t + (short)3;
+        b = b + b1t;
+        c = c - c1t;
+        d = d - d1t + (short)3;
+
+        a2v = (a + b) >> (short)3;
+        b2v = (c + d) >> (short)3;
+        c2v = (a - b) >> (short)3;
+        d2v = (d - c) >> (short)3;
+
+        out.s048c = a2v;
+        out.s159d = b2v;
+        out.s26ae = c2v;
+        out.s37bf = d2v;
+        vstore16(out,0,output);
+    }
+#else
+
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    global short *ip = input;
+    global short *op = output;
+
+    int offset;
+
+    if (tid < 4){
+        offset = tid;
+        a1 = ip[offset] + ip[offset + 12];
+        b1 = ip[offset + 4] + ip[offset + 8];
+        c1 = ip[offset + 4] - ip[offset + 8];
+        d1 = ip[offset] - ip[offset + 12];
+
+        op[offset] = a1 + b1;
+        op[offset + 4] = c1 + d1;
+        op[offset + 8] = a1 - b1;
+        op[offset + 12] = d1 - c1;
+    }
+#endif
+}
+
+/* Second pass of the inverse Walsh-Hadamard transform, in place on
+ * output_base + out_offset. Work-items 0..3 each process one row of 4
+ * shorts and store the results rounded with (x + 3) >> 3. */
+__kernel void vp8_short_inv_walsh4x4_2nd_pass_kernel(
+    __global short *output_base,
+    int out_offset
+)
+{
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+
+    __global short *output = output_base + out_offset;
+    int tid = get_global_id(0);
+    int offset = 0;
+
+    if (tid < 4){
+        offset = 4*tid;
+        a1 = output[offset] + output[offset + 3];
+        b1 = output[offset + 1] + output[offset + 2];
+        c1 = output[offset + 1] - output[offset + 2];
+        d1 = output[offset + 0] - output[offset + 3];
+
+        a2 = a1 + b1;
+        b2 = c1 + d1;
+        c2 = a1 - b1;
+        d2 = d1 - c1;
+
+        output[offset + 0] = (a2 + 3) >> 3;
+        output[offset + 1] = (b2 + 3) >> 3;
+        output[offset + 2] = (c2 + 3) >> 3;
+        output[offset + 3] = (d2 + 3) >> 3;
+    }
+}
+
+/* DC-only inverse Walsh-Hadamard transform: all 16 outputs are
+ * (input[0] + 3) >> 3. Work-items 0..3 each store one group of 4 shorts. */
+__kernel void vp8_short_inv_walsh4x4_1_kernel(
+    __global short *src_data,
+    int src_offset,
+    __global short *dst_data,
+    int dst_offset
+){
+    int a1;
+    int tid = get_global_id(0);
+    //short16 a;
+    int i;
+    short4 a;
+    __global short *input = src_data + src_offset;
+    __global short *output = dst_data + dst_offset;
+
+    if (tid < 4)
+    {
+        a1 = ((input[0] + 3) >> 3);
+        a = (short)a1; //Set all elements of vector to a1
+        vstore4(a, tid, output);
+    }
+}
diff --git a/vp8/common/opencl/idctllm_cl.h b/vp8/common/opencl/idctllm_cl.h
new file mode 100644
index 000000000..3156e96bd
--- /dev/null
+++ b/vp8/common/opencl/idctllm_cl.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* Include guard: this header defines objects (see below), so including it
+ * twice in one translation unit would otherwise be a redefinition error. */
+#ifndef IDCTLLM_CL_H_
+#define IDCTLLM_CL_H_
+
+#include "vpx_config.h"
+#include "vp8_opencl.h"
+#include "vp8/common/blockd.h"
+
+#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max;
+
+//External functions that are fallbacks if CL is unavailable
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride);
+extern void vp8_short_inv_walsh4x4_c(short *input, short *output);
+extern void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+
+/* Build options and source path handed to cl_load_program() for the IDCT
+ * program.
+ * NOTE(review): these are definitions with external linkage, so this header
+ * may only be included from a single .c file; consider moving them into
+ * idctllm_cl.c and declaring them extern here. */
+const char *idctCompileOptions = "-Ivp8/common/opencl";
+const char *idctllm_cl_file_name = "vp8/common/opencl/idctllm_cl.cl";
+
+#endif /* IDCTLLM_CL_H_ */
+
diff --git a/vp8/common/opencl/loopfilter.cl b/vp8/common/opencl/loopfilter.cl
new file mode 100644
index 000000000..ea854797c
--- /dev/null
+++ b/vp8/common/opencl/loopfilter.cl
@@ -0,0 +1,427 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+typedef unsigned char uc;
+typedef signed char sc;
+
+/* Forward declarations of the helpers defined after the kernels. */
+__inline signed char vp8_filter_mask(sc, sc, uc, uc, uc, uc, uc, uc, uc, uc);
+__inline signed char vp8_simple_filter_mask(signed char, signed char, uc, uc, uc, uc);
+__inline signed char vp8_hevmask(signed char, uc, uc, uc, uc);
+__inline signed char vp8_signed_char_clamp(int);
+
+__inline void vp8_mbfilter(signed char mask,signed char hev,global uc *op2,
+    global uc *op1,global uc *op0,global uc *oq0,global uc *oq1,global uc *oq2);
+
+void vp8_simple_filter(signed char mask,global uc *base, int op1_off,int op0_off,int oq0_off,int oq1_off);
+
+
+/* Loop-filter limit/threshold tables, 16 signed chars per table. */
+typedef struct
+{
+    signed char lim[16];
+    signed char flim[16];
+    signed char thr[16];
+    signed char mbflim[16];
+    signed char mbthr[16];
+    signed char uvlim[16];
+    signed char uvflim[16];
+    signed char uvthr[16];
+    signed char uvmbflim[16];
+    signed char uvmbthr[16];
+} loop_filter_info;
+
+
+ + +void vp8_filter( + signed char mask, + signed char hev, + global uc *base, + int op1_off, + int op0_off, + int oq0_off, + int oq1_off +) +{ + + global uc *op1 = &base[op1_off]; + global uc *op0 = &base[op0_off]; + global uc *oq0 = &base[oq0_off]; + global uc *oq1 = &base[oq1_off]; + + signed char ps0, qs0; + signed char ps1, qs1; + signed char vp8_filter, Filter1, Filter2; + signed char u; + + ps1 = (signed char) * op1 ^ 0x80; + ps0 = (signed char) * op0 ^ 0x80; + qs0 = (signed char) * oq0 ^ 0x80; + qs1 = (signed char) * oq1 ^ 0x80; + + /* add outer taps if we have high edge variance */ + vp8_filter = vp8_signed_char_clamp(ps1 - qs1); + vp8_filter &= hev; + + /* inner taps */ + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); + vp8_filter &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 + * if it equals 4 we'll set to adjust by -1 to account for the fact + * we'd round 3 the other way + */ + Filter1 = vp8_signed_char_clamp(vp8_filter + 4); + Filter2 = vp8_signed_char_clamp(vp8_filter + 3); + Filter1 >>= 3; + Filter2 >>= 3; + u = vp8_signed_char_clamp(qs0 - Filter1); + *oq0 = u ^ 0x80; + u = vp8_signed_char_clamp(ps0 + Filter2); + *op0 = u ^ 0x80; + vp8_filter = Filter1; + + /* outer tap adjustments */ + vp8_filter += 1; + vp8_filter >>= 1; + vp8_filter &= ~hev; + + u = vp8_signed_char_clamp(qs1 - vp8_filter); + *oq1 = u ^ 0x80; + u = vp8_signed_char_clamp(ps1 + vp8_filter); + *op1 = u ^ 0x80; +} + + +kernel void vp8_loop_filter_horizontal_edge_kernel +( + global unsigned char *s_base, + int s_off, + int p, /* pitch */ + global signed char *flimit, + global signed char *limit, + global signed char *thresh, + int off_stride +) +{ + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = get_global_id(0); + + if (i < get_global_size(0)){ + s_off += i; + + mask = vp8_filter_mask(limit[i], flimit[i], s_base[s_off - 4*p], + s_base[s_off - 3*p], s_base[s_off - 2*p], s_base[s_off - p], + s_base[s_off], 
s_base[s_off + p], s_base[s_off + 2*p], + s_base[s_off + 3*p]); + + hev = vp8_hevmask(thresh[i], s_base[s_off - 2*p], s_base[s_off - p], + s_base[s_off], s_base[s_off+p]); + + vp8_filter(mask, hev, s_base, s_off - 2 * p, s_off - p, s_off, + s_off + p); + } +} + + +kernel void vp8_loop_filter_vertical_edge_kernel +( + global unsigned char *s_base, + int s_off, + int p, + global signed char *flimit, + global signed char *limit, + global signed char *thresh, + int off_stride +) +{ + + int hev = 0; /* high edge variance */ + signed char mask = 0; + int i = get_global_id(0); + + if ( i < get_global_size(0) ){ + s_off += p * i; + mask = vp8_filter_mask(limit[i], flimit[i], + s_base[s_off-4], s_base[s_off-3], s_base[s_off-2], + s_base[s_off-1], s_base[s_off], s_base[s_off+1], + s_base[s_off+2], s_base[s_off+3]); + + hev = vp8_hevmask(thresh[i], s_base[s_off-2], s_base[s_off-1], + s_base[s_off], s_base[s_off+1]); + + vp8_filter(mask, hev, s_base, s_off - 2, s_off - 1, s_off, s_off + 1); + + } +} + + +kernel void vp8_mbloop_filter_horizontal_edge_kernel +( + global unsigned char *s_base, + int s_off, + int p, + global signed char *flimit, + global signed char *limit, + global signed char *thresh, + int off_stride +) +{ + + global uc *s = s_base+s_off; + + signed char hev = 0; /* high edge variance */ + signed char mask = 0; + int i = get_global_id(0); + + if (i < get_global_size(0)){ + s += i; + + mask = vp8_filter_mask(limit[i], flimit[i], + s[-4*p], s[-3*p], s[-2*p], s[-1*p], + s[0*p], s[1*p], s[2*p], s[3*p]); + + hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]); + + vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p); + + } +} + + +kernel void vp8_mbloop_filter_vertical_edge_kernel +( + global unsigned char *s_base, + int s_off, + int p, + global signed char *flimit, + global signed char *limit, + global signed char *thresh, + int off_stride +) +{ + + global uc *s = s_base + s_off; + + signed char hev = 0; /* high edge variance 
*/ + signed char mask = 0; + int i = get_global_id(0); + + if (i < get_global_size(0)){ + s += p * i; + + mask = vp8_filter_mask(limit[i], flimit[i], + s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]); + + hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]); + + vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2); + + } +} + + +kernel void vp8_loop_filter_simple_horizontal_edge_kernel +( + global unsigned char *s_base, + int s_off, + int p, + global const signed char *flimit, + global const signed char *limit, + global const signed char *thresh, + int off_stride +) +{ + + signed char mask = 0; + int i = get_global_id(0); + (void) thresh; + + if (i < get_global_size(0)) + { + s_off += i; + mask = vp8_simple_filter_mask(limit[i], flimit[i], s_base[s_off-2*p], s_base[s_off-p], s_base[s_off], s_base[s_off+p]); + vp8_simple_filter(mask, s_base, s_off - 2 * p, s_off - 1 * p, s_off, s_off + 1 * p); + } +} + + +kernel void vp8_loop_filter_simple_vertical_edge_kernel +( + global unsigned char *s_base, + int s_off, + int p, + global signed char *flimit, + global signed char *limit, + global signed char *thresh, + int off_stride +) +{ + + signed char mask = 0; + int i = get_global_id(0); + (void) thresh; + + if (i < get_global_size(0)){ + s_off += p * i; + mask = vp8_simple_filter_mask(limit[i], flimit[i], s_base[s_off-2], s_base[s_off-1], s_base[s_off], s_base[s_off+1]); + vp8_simple_filter(mask, s_base, s_off - 2, s_off - 1, s_off, s_off + 1); + } + +} + + + +//Inline and non-kernel functions follow. 
+ +__inline void vp8_mbfilter( + signed char mask, + signed char hev, + global uc *op2, + global uc *op1, + global uc *op0, + global uc *oq0, + global uc *oq1, + global uc *oq2 +) +{ + signed char s, u; + signed char vp8_filter, Filter1, Filter2; + signed char ps2 = (signed char) * op2 ^ 0x80; + signed char ps1 = (signed char) * op1 ^ 0x80; + signed char ps0 = (signed char) * op0 ^ 0x80; + signed char qs0 = (signed char) * oq0 ^ 0x80; + signed char qs1 = (signed char) * oq1 ^ 0x80; + signed char qs2 = (signed char) * oq2 ^ 0x80; + + /* add outer taps if we have high edge variance */ + vp8_filter = vp8_signed_char_clamp(ps1 - qs1); + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0)); + vp8_filter &= mask; + + Filter2 = vp8_filter; + Filter2 &= hev; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(Filter2 + 4); + Filter2 = vp8_signed_char_clamp(Filter2 + 3); + Filter1 >>= 3; + Filter2 >>= 3; + qs0 = vp8_signed_char_clamp(qs0 - Filter1); + ps0 = vp8_signed_char_clamp(ps0 + Filter2); + + + /* only apply wider filter if not high edge variance */ + vp8_filter &= ~hev; + Filter2 = vp8_filter; + + /* roughly 3/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7); + s = vp8_signed_char_clamp(qs0 - u); + *oq0 = s ^ 0x80; + s = vp8_signed_char_clamp(ps0 + u); + *op0 = s ^ 0x80; + + /* roughly 2/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7); + s = vp8_signed_char_clamp(qs1 - u); + *oq1 = s ^ 0x80; + s = vp8_signed_char_clamp(ps1 + u); + *op1 = s ^ 0x80; + + /* roughly 1/7th difference across boundary */ + u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7); + s = vp8_signed_char_clamp(qs2 - u); + *oq2 = s ^ 0x80; + s = vp8_signed_char_clamp(ps2 + u); + *op2 = s ^ 0x80; +} + + +__inline signed char vp8_signed_char_clamp(int t) +{ + t = (t < -128 ? -128 : t); + t = (t > 127 ? 
127 : t); + return (signed char) t; +} + + +/* is there high variance internal edge ( 11111111 yes, 00000000 no) */ +__inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1) +{ + signed char hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +__inline signed char vp8_filter_mask( + signed char limit, + signed char flimit, + uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3) +{ + signed char mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1; + mask = ~mask; + return mask; +} + +/* should we apply any filter at all ( 11111111 yes, 00000000 no) */ +__inline signed char vp8_simple_filter_mask( + signed char limit, + signed char flimit, + uc p1, + uc p0, + uc q0, + uc q1 +) +{ + signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1; + return mask; +} + +void vp8_simple_filter( + signed char mask, + global uc *base, + int op1_off, + int op0_off, + int oq0_off, + int oq1_off +) +{ + + global uc *op1 = base + op1_off; + global uc *op0 = base + op0_off; + global uc *oq0 = base + oq0_off; + global uc *oq1 = base + oq1_off; + + signed char vp8_filter, Filter1, Filter2; + signed char p1 = (signed char) * op1 ^ 0x80; + signed char p0 = (signed char) * op0 ^ 0x80; + signed char q0 = (signed char) * oq0 ^ 0x80; + signed char q1 = (signed char) * oq1 ^ 0x80; + signed char u; + + vp8_filter = vp8_signed_char_clamp(p1 - q1); + vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0)); + vp8_filter &= mask; + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + Filter1 = vp8_signed_char_clamp(vp8_filter + 4); + 
Filter1 >>= 3; + u = vp8_signed_char_clamp(q0 - Filter1); + *oq0 = u ^ 0x80; /* flip the top bit again to store an unsigned pixel */ + + Filter2 = vp8_signed_char_clamp(vp8_filter + 3); + Filter2 >>= 3; + u = vp8_signed_char_clamp(p0 + Filter2); + *op0 = u ^ 0x80; +} \ No newline at end of file diff --git a/vp8/common/opencl/loopfilter_cl.c b/vp8/common/opencl/loopfilter_cl.c new file mode 100644 index 000000000..e04279b23 --- /dev/null +++ b/vp8/common/opencl/loopfilter_cl.c @@ -0,0 +1,457 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "../../../vpx_ports/config.h" +#include "loopfilter_cl.h" +#include "../onyxc_int.h" + +#include "vpx_config.h" +#include "vp8_opencl.h" +#include "blockd_cl.h" +/* Build options and kernel source path passed to cl_load_program() by cl_init_loop_filter(). */ +const char *loopFilterCompileOptions = "-Ivp8/common/opencl"; +const char *loop_filter_cl_file_name = "vp8/common/opencl/loopfilter.cl"; + +typedef unsigned char uc; +/* Non-OpenCL frame loop filter; handed to VP8_CL_SET_BUF in vp8_loop_filter_frame_cl(), presumably as the fallback path - confirm. */ +extern void vp8_loop_filter_frame +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +); +/* Edge-kernel wrappers defined in loopfilter_filters_cl.c. */ +prototype_loopfilter_cl(vp8_loop_filter_horizontal_edge_cl); +prototype_loopfilter_cl(vp8_loop_filter_vertical_edge_cl); +prototype_loopfilter_cl(vp8_mbloop_filter_horizontal_edge_cl); +prototype_loopfilter_cl(vp8_mbloop_filter_vertical_edge_cl); +prototype_loopfilter_cl(vp8_loop_filter_simple_horizontal_edge_cl); +prototype_loopfilter_cl(vp8_loop_filter_simple_vertical_edge_cl); + +/* Horizontal MB filtering: runs the macroblock-edge wrapper on the Y plane (count 2) and on the U and V planes (count 1 each), using lfi->mbflim; simpler_lpf is ignored. */ +void vp8_loop_filter_mbh_cl( + MACROBLOCKD *x, + cl_mem buf_base, + int y_off, + int u_off, + int v_off, + int y_stride, + int uv_stride, + loop_filter_info *lfi, + int simpler_lpf +) +{ + (void) simpler_lpf; + + vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, y_off, 
y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1); + vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1); + vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1); +} +/* Simple horizontal MB filtering: only the Y plane is filtered, so u_off, v_off and uv_stride are not used. */ +void vp8_loop_filter_mbhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1); +} + +/* Vertical MB Filtering: same plane/count pattern as the horizontal MB filter (Y with count 2, U and V with count 1), using the vertical-edge wrapper and lfi->mbflim. */ +void vp8_loop_filter_mbv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + + vp8_mbloop_filter_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1); + vp8_mbloop_filter_vertical_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1); + vp8_mbloop_filter_vertical_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1); +} +/* Simple vertical MB filtering: only the Y plane is filtered, so u_off, v_off and uv_stride are not used. */ +void vp8_loop_filter_mbvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) uv_stride; + (void) simpler_lpf; + vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1); +} + +/* Horizontal B Filtering: filters the interior luma edges 4, 8 and 12 rows below y_off and the interior chroma edge 4 rows below u_off and v_off, using lfi->flim rather than lfi->mbflim. */ +void vp8_loop_filter_bh_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + + vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_horizontal_edge_cl(x, buf_base, 
y_off + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_horizontal_edge_cl(x, buf_base, u_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1); + vp8_loop_filter_horizontal_edge_cl(x, buf_base, v_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1); + +} +/* Simple horizontal B filtering: the three interior luma edges (rows 4, 8, 12) only; the chroma offsets and uv_stride are not used. */ +void vp8_loop_filter_bhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) uv_stride; + (void) simpler_lpf; + + vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); +} + +/* Vertical B Filtering: filters the interior luma edges 4, 8 and 12 columns right of y_off and the interior chroma edge 4 columns right of u_off and v_off, using lfi->flim. */ +void vp8_loop_filter_bv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) simpler_lpf; + + vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + + vp8_loop_filter_vertical_edge_cl(x, buf_base, u_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1); + vp8_loop_filter_vertical_edge_cl(x, buf_base, v_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1); +} +/* Simple vertical B filtering: the three interior luma edges (columns 4, 8, 12) only; the chroma offsets and uv_stride are not used. */ +void vp8_loop_filter_bvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off, + int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) +{ + (void) uv_stride; + (void) simpler_lpf; + + vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 4, y_stride, lfi->flim, lfi->lim, 
lfi->thr, 2, 1); + vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); + vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1); +} +/* Fills lim, mbflim, flim and thr (16 identical entries each) in cm->lf_info for every filter level 0..MAX_LOOP_FILTER, using cm->sharpness_level and cm->frame_type. */ +void vp8_init_loop_filter_cl(VP8_COMMON *cm) +{ + loop_filter_info *lfi = cm->lf_info; + int sharpness_lvl = cm->sharpness_level; + int frame_type = cm->frame_type; + int i, j; + + int block_inside_limit = 0; + int HEVThresh; /* written to thr[]; steps up with the filter level, with different steps for key frames */ + const int yhedge_boost = 2; /* added to the filter level for the macroblock-edge limit (mbflim) */ + + /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */ + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + + if (frame_type == KEY_FRAME) + { + if (filt_lvl >= 40) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + else + { + if (filt_lvl >= 40) + HEVThresh = 3; + else if (filt_lvl >= 20) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + + /* Set loop filter parameters that control sharpness: the level is halved when sharpness_lvl > 0 and halved again when it is > 4. */ + block_inside_limit = filt_lvl >> (sharpness_lvl > 0); + block_inside_limit = block_inside_limit >> (sharpness_lvl > 4); + + if (sharpness_lvl > 0) + { + if (block_inside_limit > (9 - sharpness_lvl)) + block_inside_limit = (9 - sharpness_lvl); + } + + if (block_inside_limit < 1) + block_inside_limit = 1; /* the interior limit is never allowed to drop below 1 */ + + for (j = 0; j < 16; j++) + { + lfi[i].lim[j] = block_inside_limit; + lfi[i].mbflim[j] = filt_lvl + yhedge_boost; + lfi[i].flim[j] = filt_lvl; + lfi[i].thr[j] = HEVThresh; + } + } +} + +/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding + * each frame. Check last_frame_type to skip the function most of times. This variant rewrites only the thr entries. + */ +void vp8_frame_init_loop_filter_cl(loop_filter_info *lfi, int frame_type) +{ + int HEVThresh; + int i, j; + + /* For each possible value for the loop filter fill out a "loop_filter_info" entry. 
*/ + for (i = 0; i <= MAX_LOOP_FILTER; i++) + { + int filt_lvl = i; + + if (frame_type == KEY_FRAME) + { + if (filt_lvl >= 40) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + else + { + if (filt_lvl >= 40) + HEVThresh = 3; + else if (filt_lvl >= 20) + HEVThresh = 2; + else if (filt_lvl >= 15) + HEVThresh = 1; + else + HEVThresh = 0; + } + + for (j = 0; j < 16; j++) + { + lfi[i].thr[j] = HEVThresh; /* thr is the only field that depends on the frame type */ + } + } +} + +/* Adds the reference-frame delta and the mode delta to *filter_level and clamps the result to 0..MAX_LOOP_FILTER; leaves *filter_level untouched unless mbd->mode_ref_lf_delta_enabled is set. */ +//This might not need to be copied from loopfilter.c +void vp8_adjust_mb_lf_value_cl(MACROBLOCKD *mbd, int *filter_level) +{ + MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi; + + if (mbd->mode_ref_lf_delta_enabled) + { + /* Apply delta for reference frame */ + *filter_level += mbd->ref_lf_deltas[mbmi->ref_frame]; + + /* Apply delta for mode */ + if (mbmi->ref_frame == INTRA_FRAME) + { + /* Only the split mode BPRED has a further special case */ + if (mbmi->mode == B_PRED) + *filter_level += mbd->mode_lf_deltas[0]; + } + else + { + /* Zero motion mode */ + if (mbmi->mode == ZEROMV) + *filter_level += mbd->mode_lf_deltas[1]; + + /* Split MB motion mode */ + else if (mbmi->mode == SPLITMV) + *filter_level += mbd->mode_lf_deltas[3]; + + /* All other inter motion modes (Nearest, Near, New) */ + else + *filter_level += mbd->mode_lf_deltas[2]; + } + + /* Range check */ + if (*filter_level > MAX_LOOP_FILTER) + *filter_level = MAX_LOOP_FILTER; + else if (*filter_level < 0) + *filter_level = 0; + } +} + + +//Start of externally callable functions. 
+/* Loads the loop filter program from loop_filter_cl_file_name and creates its six kernels in cl_data. Returns CL_SUCCESS, or VP8_CL_TRIED_BUT_FAILED when cl_load_program() fails. err is not referenced directly here; presumably VP8_CL_CREATE_KERNEL uses it - confirm. */ +int cl_init_loop_filter() { + int err; + + // Create the filter compute program from the file-defined source code + if ( cl_load_program(&cl_data.loop_filter_program, loop_filter_cl_file_name, + loopFilterCompileOptions) != CL_SUCCESS ) + return VP8_CL_TRIED_BUT_FAILED; + + // Create the compute kernels in the program we wish to run + VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_horizontal_edge_kernel,"vp8_loop_filter_horizontal_edge_kernel"); + VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_vertical_edge_kernel,"vp8_loop_filter_vertical_edge_kernel"); + VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_horizontal_edge_kernel,"vp8_mbloop_filter_horizontal_edge_kernel"); + VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_vertical_edge_kernel,"vp8_mbloop_filter_vertical_edge_kernel"); + VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_horizontal_edge_kernel,"vp8_loop_filter_simple_horizontal_edge_kernel"); + VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_vertical_edge_kernel,"vp8_loop_filter_simple_vertical_edge_kernel"); + + return CL_SUCCESS; +} +/* Releases the loop filter program (when one is set) and the six kernels, then clears cl_data.loop_filter_program. */ +void cl_destroy_loop_filter(){ + + if (cl_data.loop_filter_program) + clReleaseProgram(cl_data.loop_filter_program); + + VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_horizontal_edge_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_vertical_edge_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_horizontal_edge_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_vertical_edge_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_horizontal_edge_kernel); + VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_vertical_edge_kernel); + + cl_data.loop_filter_program = NULL; +} + +/* Writes one starting filter level per segment into baseline_filter_level[0..MAX_MB_SEGMENTS-1]: default_filt_lvl for every segment when segmentation is disabled, otherwise a per-segment value derived from the MB_LVL_ALT_LF segment data. */ +void vp8_loop_filter_set_baselines_cl(MACROBLOCKD *mbd, int default_filt_lvl, int *baseline_filter_level){ + int alt_flt_enabled = mbd->segmentation_enabled; + int i; + + if (alt_flt_enabled) +
{ + for (i = 0; i < MAX_MB_SEGMENTS; i++) + { + /* Abs value: the segment data is used as the filter level itself (no clamping on this path) */ + if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) + baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + /* Delta Value: the segment data is added to default_filt_lvl */ + else + { + baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i]; + baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */ + } + } + } + else + { + for (i = 0; i < MAX_MB_SEGMENTS; i++) + baseline_filter_level[i] = default_filt_lvl; + } +} +/* Loop-filters cm->frame_to_show one macroblock at a time through the OpenCL edge-filter wrappers, then reads the device buffer back into post->buffer_alloc. */ +void vp8_loop_filter_frame_cl +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +) +{ + YV12_BUFFER_CONFIG *post = cm->frame_to_show; + loop_filter_info *lfi = cm->lf_info; + FRAME_TYPE frame_type = cm->frame_type; + LOOPFILTERTYPE filter_type = cm->filter_type; + + int mb_row; + int mb_col; + + int baseline_filter_level[MAX_MB_SEGMENTS]; + int filter_level; + int alt_flt_enabled = mbd->segmentation_enabled; + + int err; + unsigned char *buf_base; + int y_off, u_off, v_off; /* plane positions as offsets into the frame allocation instead of pointers */ + //unsigned char *y_ptr, *u_ptr, *v_ptr; + + mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */ + + /* Note the baseline filter values for each segment */ + vp8_loop_filter_set_baselines_cl(mbd, default_filt_lvl, baseline_filter_level); + + /* Initialize the loop filter for this frame. 
*/ + if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level)) + vp8_init_loop_filter_cl(cm); + else if (frame_type != cm->last_frame_type) + vp8_frame_init_loop_filter_cl(lfi, frame_type); + + /* Set up the buffer offsets: each plane start is expressed relative to post->buffer_alloc, because the filters take a cl_mem plus an offset */ + + buf_base = post->buffer_alloc; + y_off = post->y_buffer - buf_base; + u_off = post->u_buffer - buf_base; + v_off = post->v_buffer - buf_base; + + VP8_CL_SET_BUF(mbd->cl_commands, post->buffer_mem, post->buffer_size, post->buffer_alloc, + vp8_loop_filter_frame(cm,mbd,default_filt_lvl),); + + /* vp8_filter each macro block */ + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) + { + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) + { + int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; + + filter_level = baseline_filter_level[Segment]; + + /* Apply any context driven MB level adjustment to the segment + * baseline. NOTE(review): this calls vp8_adjust_mb_lf_value() and uses + * its return value, not the vp8_adjust_mb_lf_value_cl() copy defined + * earlier in this file - confirm that is intended. + */ + filter_level = vp8_adjust_mb_lf_value(mbd, filter_level); + + if (filter_level) + { + if (mb_col > 0){ /* left macroblock edge: not filtered in the first column */ + if (filter_type == NORMAL_LOOPFILTER) + vp8_loop_filter_mbv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + else + vp8_loop_filter_mbvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + } + + if (mbd->mode_info_context->mbmi.dc_diff > 0){ /* interior vertical block edges, only when dc_diff is positive */ + if (filter_type == NORMAL_LOOPFILTER) + vp8_loop_filter_bv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + else + vp8_loop_filter_bvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + } + + /* don't apply across umv border */ + if (mb_row > 0){ + if (filter_type == NORMAL_LOOPFILTER) + vp8_loop_filter_mbh_cl(mbd, 
post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + else + vp8_loop_filter_mbhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + } + + if (mbd->mode_info_context->mbmi.dc_diff > 0){ /* interior horizontal block edges, only when dc_diff is positive */ + if (filter_type == NORMAL_LOOPFILTER) + vp8_loop_filter_bh_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + else + vp8_loop_filter_bhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + } + } + + y_off += 16; /* one macroblock to the right: 16 luma and 8 chroma pixels */ + u_off += 8; + v_off += 8; + + mbd->mode_info_context++; /* step to next MB */ + } + + y_off += post->y_stride * 16 - post->y_width; /* back by one plane width and down one macroblock row */ + u_off += post->uv_stride * 8 - post->uv_width; + v_off += post->uv_stride * 8 - post->uv_width; + + mbd->mode_info_context++; /* Skip border mb */ + } + /* The read below is enqueued non-blocking (CL_FALSE); VP8_CL_FINISH presumably waits for it to complete - confirm. */ + //Retrieve buffer contents + err = clEnqueueReadBuffer(mbd->cl_commands, post->buffer_mem, CL_FALSE, 0, post->buffer_size, post->buffer_alloc, 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS(mbd->cl_commands, err != CL_SUCCESS, + "Error: Failed to read loop filter output!\n", + , + ); + + VP8_CL_FINISH(mbd->cl_commands); +} diff --git a/vp8/common/opencl/loopfilter_cl.h b/vp8/common/opencl/loopfilter_cl.h new file mode 100644 index 000000000..0ed72aad2 --- /dev/null +++ b/vp8/common/opencl/loopfilter_cl.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef loopfilter_cl_h +#define loopfilter_cl_h + +#include "../../../vpx_ports/mem.h" + +#include "../onyxc_int.h" +#include "blockd_cl.h" +#include "../loopfilter.h" +/* Signature of the edge-kernel wrappers: the frame is addressed as a cl_mem plus a byte offset, followed by the pitch, the three threshold arrays and the two work-size counts. */ +#define prototype_loopfilter_cl(sym) \ + void sym(MACROBLOCKD*, cl_mem src_base, int src_offset, \ + int pitch, const signed char *flimit, \ + const signed char *limit, const signed char *thresh, int count, int block_cnt) +/* Pointer-based per-macroblock signature. NOTE(review): the block filters visible in loopfilter_cl.c (vp8_loop_filter_*_cl) take a cl_mem and offsets instead, and none of the vp8_lf_*_cl names declared below are defined there - confirm these declarations are still needed. */ +#define prototype_loopfilter_block_cl(sym) \ + void sym(MACROBLOCKD*, unsigned char *y, unsigned char *u, unsigned char *v,\ + int ystride, int uv_stride, loop_filter_info *lfi, int simpler) +/* OpenCL implementation of the whole-frame loop filter. */ +extern void vp8_loop_filter_frame_cl +( + VP8_COMMON *cm, + MACROBLOCKD *mbd, + int default_filt_lvl +); + +extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_v_cl); +extern prototype_loopfilter_block_cl(vp8_lf_normal_b_v_cl); +extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_h_cl); +extern prototype_loopfilter_block_cl(vp8_lf_normal_b_h_cl); +extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_v_cl); +extern prototype_loopfilter_block_cl(vp8_lf_simple_b_v_cl); +extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_h_cl); +extern prototype_loopfilter_block_cl(vp8_lf_simple_b_h_cl); +/* Function-pointer type with the pointer-based per-macroblock signature. */ +typedef prototype_loopfilter_block_cl((*vp8_lf_block_cl_fn_t)); + +#endif diff --git a/vp8/common/opencl/loopfilter_filters_cl.c b/vp8/common/opencl/loopfilter_filters_cl.c new file mode 100644 index 000000000..656552f86 --- /dev/null +++ b/vp8/common/opencl/loopfilter_filters_cl.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include + +#include + +#include "vpx_ports/config.h" +#include "vp8_opencl.h" +#include "blockd_cl.h" + +//#include "loopfilter_cl.h" +//#include "../onyxc_int.h" + +typedef unsigned char uc; +/* Shared launcher for the loop filter kernels: creates three 16-byte buffers from flimit, limit and thresh via VP8_CL_CREATE_BUF (presumably copying the host data - confirm), sets kernel arguments 0-6, enqueues a 2-D range of count x block_cnt work items, releases the three buffers and then calls VP8_CL_FINISH on the queue. */ +static void vp8_loop_filter_cl_run( + cl_command_queue cq, + cl_kernel kernel, + cl_mem buf_mem, + int s_off, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count, + int block_cnt +){ + size_t global[] = {count,block_cnt}; /* 2-D global work size; the wrappers pass count already multiplied by 8 */ + int err; + + cl_mem flimit_mem; + cl_mem limit_mem; + cl_mem thresh_mem; + + VP8_CL_CREATE_BUF(cq, flimit_mem, , sizeof(uc)*16, flimit,, ); + VP8_CL_CREATE_BUF(cq, limit_mem, , sizeof(uc)*16, limit,, ); + VP8_CL_CREATE_BUF(cq, thresh_mem, , sizeof(uc)*16, thresh,, ); + + err = 0; /* NOTE(review): redundant, overwritten by the next statement */ + err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &buf_mem); + err |= clSetKernelArg(kernel, 1, sizeof (cl_int), &s_off); + err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &p); + err |= clSetKernelArg(kernel, 3, sizeof (cl_mem), &flimit_mem); + err |= clSetKernelArg(kernel, 4, sizeof (cl_mem), &limit_mem); + err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &thresh_mem); + err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &block_cnt); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to set kernel arguments!\n",, + ); + + /* Execute the kernel; the NULL local size leaves the work-group size to the OpenCL implementation */ + err = clEnqueueNDRangeKernel(cq, kernel, 2, NULL, global, NULL , 0, NULL, NULL); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to execute kernel!\n", + printf("err = %d\n",err);, + ); + + clReleaseMemObject(flimit_mem); + clReleaseMemObject(limit_mem); + clReleaseMemObject(thresh_mem); + + VP8_CL_FINISH(cq); +} +/* Normal filter, horizontal interior edge: launches the matching kernel with count*8 work items in dimension 0. */ +void vp8_loop_filter_horizontal_edge_cl +( + MACROBLOCKD *x, + cl_mem s_base, + int s_off, + int p, /* pitch */ + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count, + int block_cnt +) +{ + vp8_loop_filter_cl_run(x->cl_commands, + cl_data.vp8_loop_filter_horizontal_edge_kernel, 
s_base, s_off, + p, flimit, limit, thresh, count*8, block_cnt + ); +} +/* Normal filter, vertical interior edge: same launch as above with the vertical-edge kernel. */ +void vp8_loop_filter_vertical_edge_cl +( + MACROBLOCKD *x, + cl_mem s_base, + int s_off, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count, + int block_cnt +) +{ + vp8_loop_filter_cl_run(x->cl_commands, + cl_data.vp8_loop_filter_vertical_edge_kernel, s_base, s_off, + p, flimit, limit, thresh, count*8, block_cnt + ); +} +/* Normal filter, horizontal macroblock edge: launches the mbloop horizontal kernel with count*8 work items in dimension 0. */ +void vp8_mbloop_filter_horizontal_edge_cl +( + MACROBLOCKD *x, + cl_mem s_base, + int s_off, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count, + int block_cnt +) +{ + vp8_loop_filter_cl_run(x->cl_commands, + cl_data.vp8_mbloop_filter_horizontal_edge_kernel, s_base, s_off, + p, flimit, limit, thresh, count*8, block_cnt + ); +} + +/* Normal filter, vertical macroblock edge: launches the mbloop vertical kernel with count*8 work items in dimension 0. */ +void vp8_mbloop_filter_vertical_edge_cl +( + MACROBLOCKD *x, + cl_mem s_base, + int s_off, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count, + int block_cnt +) +{ + vp8_loop_filter_cl_run(x->cl_commands, + cl_data.vp8_mbloop_filter_vertical_edge_kernel, s_base, s_off, + p, flimit, limit, thresh, count*8, block_cnt + ); +} +/* Simple filter, horizontal edge: launches the simple horizontal kernel with count*8 work items in dimension 0. */ +void vp8_loop_filter_simple_horizontal_edge_cl +( + MACROBLOCKD *x, + cl_mem s_base, + int s_off, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count, + int block_cnt +) +{ + vp8_loop_filter_cl_run(x->cl_commands, + cl_data.vp8_loop_filter_simple_horizontal_edge_kernel, s_base, s_off, + p, flimit, limit, thresh, count*8, block_cnt + ); +} +/* Simple filter, vertical edge: launches the simple vertical kernel with count*8 work items in dimension 0. */ +void vp8_loop_filter_simple_vertical_edge_cl +( + MACROBLOCKD *x, + cl_mem s_base, + int s_off, + int p, + const signed char *flimit, + const signed char *limit, + const signed char *thresh, + int count, + int block_cnt +) +{ + vp8_loop_filter_cl_run(x->cl_commands, + cl_data.vp8_loop_filter_simple_vertical_edge_kernel, s_base, s_off, + p, flimit, limit, thresh, 
count*8, block_cnt + ); +} diff --git a/vp8/common/opencl/opencl_systemdependent.c b/vp8/common/opencl/opencl_systemdependent.c new file mode 100644 index 000000000..b2551d047 --- /dev/null +++ b/vp8/common/opencl/opencl_systemdependent.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/config.h" +#include "../subpixel.h" +#include "subpixel_cl.h" +#include "../onyxc_int.h" +#include "vp8_opencl.h" + +#if HAVE_DLOPEN +#include "dynamic_cl.h" +#endif +/* Sets cl_initialized. With HAVE_DLOPEN the OpenCL library is first loaded by name through load_cl(), and cl_common_init() runs only if that load returned CL_SUCCESS (otherwise cl_initialized becomes VP8_CL_TRIED_BUT_FAILED); without HAVE_DLOPEN cl_common_init() is called directly. ctx is not used. */ +void vp8_arch_opencl_common_init(VP8_COMMON *ctx) +{ + +#if HAVE_DLOPEN + +#if WIN32 //Windows .dll has no lib prefix and no extension + cl_loaded = load_cl("OpenCL"); +#else //But *nix needs full name + cl_loaded = load_cl("libOpenCL.so"); +#endif + + if (cl_loaded == CL_SUCCESS) + cl_initialized = cl_common_init(); + else + cl_initialized = VP8_CL_TRIED_BUT_FAILED; + +#else //!HAVE_DLOPEN (e.g. Apple) + cl_initialized = cl_common_init(); +#endif + +} diff --git a/vp8/common/opencl/reconinter_cl.c b/vp8/common/opencl/reconinter_cl.c new file mode 100644 index 000000000..fb2f83e9c --- /dev/null +++ b/vp8/common/opencl/reconinter_cl.c @@ -0,0 +1,641 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +//For the decoder, all subpixel prediction is done in this file. +// +//Need to determine some sort of mechanism for easily determining SIXTAP/BILINEAR +//and what arguments to feed into the kernels. These kernels SHOULD be 2-pass, +//and ideally there'd be a data structure that determined what static arguments +//to pass in. +// +//Also, the only external functions being called here are the subpixel prediction +//functions. Hopefully this means no worrying about when to copy data back/forth. + +#include "../../../vpx_ports/config.h" +//#include "../recon.h" +#include "../subpixel.h" +//#include "../blockd.h" +//#include "../reconinter.h" +#if CONFIG_RUNTIME_CPU_DETECT +//#include "../onyxc_int.h" +#endif + +#include "vp8_opencl.h" +#include "filter_cl.h" +#include "reconinter_cl.h" +#include "blockd_cl.h" + +#include + +/* use this define on systems where unaligned int reads and writes are + * not allowed, i.e. ARM architectures + */ +/*#define MUST_BE_ALIGNED*/ + +static const int bbb[4] = {0, 2, 8, 10}; +/* Host-side strided copy: copies num_iter rows of num_bytes bytes each from src_base + src_offset (row pitch src_stride) to dst_base + dst_offset (row pitch dst_stride). */ +static void vp8_memcpy( + unsigned char *src_base, + int src_offset, + int src_stride, + unsigned char *dst_base, + int dst_offset, + int dst_stride, + int num_bytes, + int num_iter +){ + + int i,r; + unsigned char *src = &src_base[src_offset]; + unsigned char *dst = &dst_base[dst_offset]; + src_offset = dst_offset = 0; /* the offset parameters are reused below as per-byte indices */ + + for (r = 0; r < num_iter; r++){ + for (i = 0; i < num_bytes; i++){ + src_offset = r*src_stride + i; + dst_offset = r*dst_stride + i; + dst[dst_offset] = src[src_offset]; + } + } +} +/* Device-side strided copy between two cl_mem buffers: for each of num_blocks blocks, copies num_iter rows of num_bytes bytes from src_offsets[block] to dst_offsets[block], either through the memcpy kernel (MEM_COPY_KERNEL) or with one clEnqueueCopyBuffer per row. */ +static void vp8_copy_mem_cl( + cl_command_queue cq, + cl_mem src_mem, + int *src_offsets, + int src_stride, + cl_mem dst_mem, + int *dst_offsets, + int dst_stride, + int num_bytes, + int num_iter, + int num_blocks +){ + + int err,block; + +#if MEM_COPY_KERNEL + size_t global[3] = {num_bytes, num_iter, num_blocks}; + + size_t local[3]; + local[0] = global[0]; + local[1] = global[1]; + local[2] = global[2]; + + err = 
clSetKernelArg(cl_data.vp8_memcpy_kernel, 0, sizeof (cl_mem), &src_mem); /* arguments 1 and 4 (the per-block offsets) are set inside the loop below */ + err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 2, sizeof (int), &src_stride); + err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 3, sizeof (cl_mem), &dst_mem); + err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 5, sizeof (int), &dst_stride); + err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 6, sizeof (int), &num_bytes); + err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 7, sizeof (int), &num_iter); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to set kernel arguments!\n", + return, + ); + + for (block = 0; block < num_blocks; block++){ + + /* Set kernel arguments */ + err = clSetKernelArg(cl_data.vp8_memcpy_kernel, 1, sizeof (int), &src_offsets[block]); + err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 4, sizeof (int), &dst_offsets[block]); + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to set kernel arguments!\n", + return, + ); + + /* Execute the kernel. work_dim is 2, so only the first two entries of global/local are used; the explicit local size (equal to the global size) is passed only when num_bytes * num_iter does not exceed vp8_memcpy_kernel_size. */ + if (num_bytes * num_iter > cl_data.vp8_memcpy_kernel_size){ + err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, NULL , 0, NULL, NULL); + } else { + err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, local , 0, NULL, NULL); + } + + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, + "Error: Failed to execute kernel!\n", + return, + ); + } +#else + int iter; + for (block=0; block < num_blocks; block++){ + for (iter = 0; iter < num_iter; iter++){ /* one buffer-to-buffer copy per row of the block */ + err = clEnqueueCopyBuffer(cq, src_mem, dst_mem, + src_offsets[block]+iter*src_stride, + dst_offsets[block]+iter*dst_stride, + num_bytes, 0, NULL, NULL + ); + VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, "Error copying between buffers\n", + , + ); + } + } +#endif +} +/* Builds the 4x4 predictor for block d: sub-pixel prediction when either motion vector component has a fractional part (low 3 bits), otherwise a 4x4 device copy. */ +static void vp8_build_inter_predictors_b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch) +{ + unsigned char *ptr_base = *(d->base_pre); + int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + +
vp8_subpix_cl_fn_t sppf; + + int pre_dist = *d->base_pre - x->pre.buffer_alloc; /* distance of this block's base pointer from the start of the reference frame allocation */ + cl_mem pre_mem = x->pre.buffer_mem; + int pre_off = pre_dist+ptr_offset; /* offset of the reference pixels inside pre_mem */ + + if (d->sixtap_filter == CL_TRUE) + sppf = vp8_sixtap_predict4x4_cl; + else + sppf = vp8_bilinear_predict4x4_cl; + + //ptr_base a.k.a. d->base_pre is the start of the + //Macroblock's y_buffer, u_buffer, or v_buffer + + if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7) + { + sppf(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch); + } + else + { + vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride,d->cl_predictor_mem, &d->predictor_offset,pitch,4,4,1); + } +} + +/* Builds the 8x8 predictor starting at block d: 8x8 sub-pixel prediction (six-tap or bilinear, chosen by d->sixtap_filter) or an 8x8 device copy. */ +static void vp8_build_inter_predictors4b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch) +{ + unsigned char *ptr_base = *(d->base_pre); + int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + + int pre_dist = *d->base_pre - x->pre.buffer_alloc; + cl_mem pre_mem = x->pre.buffer_mem; + int pre_off = pre_dist + ptr_offset; + + //If either motion vector component has a fractional part (low 3 bits non-zero), need to do subpixel prediction + if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7) + { + if (d->sixtap_filter == CL_TRUE) + vp8_sixtap_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch); + else + vp8_bilinear_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch); + } + //Otherwise copy memory directly from src to dest + else + { + vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride, d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 8, 1); + } + + +} +/* Builds the 8-wide by 4-high predictor covering block d and its right-hand neighbour. */ +static void vp8_build_inter_predictors2b_cl(MACROBLOCKD *x, 
BLOCKD *d, int pitch) +{ + unsigned char *ptr_base = *(d->base_pre); + + int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); /* whole-pixel part of the motion vector */ + + int pre_dist = *d->base_pre - x->pre.buffer_alloc; + cl_mem pre_mem = x->pre.buffer_mem; + int pre_off = pre_dist+ptr_offset; + + if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7) /* fractional part present: filter instead of copying */ + { + if (d->sixtap_filter == CL_TRUE) + vp8_sixtap_predict8x4_cl(d->cl_commands,ptr_base,pre_mem,pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch); + else + vp8_bilinear_predict8x4_cl(d->cl_commands,ptr_base,pre_mem,pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch); + } + else + { + vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride, d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 4, 1); /* 8 bytes per row, 4 rows, 1 block */ + } +} + +/* Builds the U and V inter predictors of macroblock x in x->cl_predictor_mem: one 8x8 prediction per plane when the macroblock is inter-coded and not SPLITMV, otherwise pairs of 4x4 blocks 16..23. */ +void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x) +{ + int i; + + vp8_cl_mb_prep(x, PREDICTOR|PRE_BUF); + +#if !ONE_CQ_PER_MB + VP8_CL_FINISH(x->cl_commands); +#endif + + if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && + x->mode_info_context->mbmi.mode != SPLITMV) + { + + unsigned char *pred_base = x->predictor; + int upred_offset = 256; /* U predictor starts after 256 bytes (16 * 16) of the predictor buffer */ + int vpred_offset = 320; /* V predictor starts 64 bytes (8 * 8) after the U predictor */ + + int mv_row = x->block[16].bmi.mv.as_mv.row; + int mv_col = x->block[16].bmi.mv.as_mv.col; + int offset; + + unsigned char *pre_base = x->pre.buffer_alloc; + cl_mem pre_mem = x->pre.buffer_mem; + int upre_off = x->pre.u_buffer - pre_base; + int vpre_off = x->pre.v_buffer - pre_base; + int pre_stride = x->block[16].pre_stride; + + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){ + vp8_sixtap_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 
upred_offset, 8); + vp8_sixtap_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8); + } + else{ + vp8_bilinear_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8); + vp8_bilinear_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8); + } + } + else + { + int pre_offsets[2] = {upre_off+offset, vpre_off+offset}; + int pred_offsets[2] = {upred_offset,vpred_offset}; + vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, x->cl_predictor_mem, pred_offsets, 8, 8, 8, 2); + } + } + else + { + // Can probably batch these operations as well, but not tested in decoder + // (or at least the test videos I've been using. + for (i = 16; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + vp8_build_inter_predictors2b_cl(x, d0, 8); + else + { + vp8_build_inter_predictors_b_cl(x, d0, 8); + vp8_build_inter_predictors_b_cl(x, d1, 8); + } + } + } + +#if !ONE_CQ_PER_MB + VP8_CL_FINISH(x->block[0].cl_commands); + VP8_CL_FINISH(x->block[16].cl_commands); + VP8_CL_FINISH(x->block[20].cl_commands); +#endif + + vp8_cl_mb_finish(x, PREDICTOR); +} + +void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x) +{ + //If CL is running in encoder, need to call following before proceeding. 
+ //vp8_cl_mb_prep(x, PRE_BUF); + +#if !ONE_CQ_PER_MB + VP8_CL_FINISH(x->cl_commands); +#endif + + if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && + x->mode_info_context->mbmi.mode != SPLITMV) + { + int offset; + unsigned char *pred_base = x->predictor; + int upred_offset = 256; + int vpred_offset = 320; + + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int pre_stride = x->block[0].pre_stride; + + unsigned char *pre_base = x->pre.buffer_alloc; + cl_mem pre_mem = x->pre.buffer_mem; + int ypre_off = x->pre.y_buffer - pre_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + int upre_off = x->pre.u_buffer - pre_base; + int vpre_off = x->pre.v_buffer - pre_base; + + if ((mv_row | mv_col) & 7) + { + if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){ + vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16); + } + else + vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16); + } + else + { + //16x16 copy + int pred_off = 0; + vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &ypre_off, pre_stride, x->cl_predictor_mem, &pred_off, 16, 16, 16, 1); + } + + + mv_row = x->block[16].bmi.mv.as_mv.row; + mv_col = x->block[16].bmi.mv.as_mv.col; + pre_stride >>= 1; + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + + if ((mv_row | mv_col) & 7) + { + if (x->sixtap_filter == CL_TRUE){ + vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8); + vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8); + } + else { + 
vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8); + vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8); + } + } + else + { + int pre_off = upre_off + offset; + vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &upred_offset, 8, 8, 8, 1); + pre_off = vpre_off + offset; + vp8_copy_mem_cl(x->block[20].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &vpred_offset, 8, 8, 8, 1); + } + } + else + { + int i; + + if (x->mode_info_context->mbmi.partitioning < 3) + { + for (i = 0; i < 4; i++) + { + BLOCKD *d = &x->block[bbb[i]]; + vp8_build_inter_predictors4b_cl(x, d, 16); + } + } + else + { + /* This loop can be done in any order... No dependencies.*/ + /* Also, d0/d1 can be decoded simultaneously */ + for (i = 0; i < 16; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + vp8_build_inter_predictors2b_cl(x, d0, 16); + else + { + vp8_build_inter_predictors_b_cl(x, d0, 16); + vp8_build_inter_predictors_b_cl(x, d1, 16); + } + } + } + + /* Another case of re-orderable/batchable loop */ + for (i = 16; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + vp8_build_inter_predictors2b_cl(x, d0, 8); + else + { + vp8_build_inter_predictors_b_cl(x, d0, 8); + vp8_build_inter_predictors_b_cl(x, d1, 8); + } + } + } + +#if !ONE_CQ_PER_MB + VP8_CL_FINISH(x->block[0].cl_commands); + VP8_CL_FINISH(x->block[16].cl_commands); + VP8_CL_FINISH(x->block[20].cl_commands); +#endif + + vp8_cl_mb_finish(x, PREDICTOR); +} + + +/* The following functions are written for skip_recon_mb() to call. 
Since there is no recon in this
+ * situation, we can write the result directly to dst buffer instead of writing it to predictor
+ * buffer and then copying it to dst buffer.
+ */
+
+/*
+ * Predict one 4x4 block straight into the destination frame buffer.
+ *
+ * x          - macroblock descriptor; supplies the pre/dst cl_mem buffers.
+ * d          - block descriptor; supplies the motion vector, strides and
+ *              the command queue used for the enqueue.
+ * dst_offset - offset into x->dst.buffer_mem where the result is written.
+ *
+ * A motion vector with any of its low 3 bits set goes through the sixtap or
+ * bilinear 4x4 kernel; otherwise the block is copied.
+ */
+static void vp8_build_inter_predictors_b_s_cl(MACROBLOCKD *x, BLOCKD *d, int dst_offset)
+{
+    unsigned char *ptr_base = *(d->base_pre);
+    int dst_stride = d->dst_stride;
+    int pre_stride = d->pre_stride;
+    int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+    vp8_subpix_cl_fn_t sppf;
+
+    /* Distance of this plane from the start of the whole reference allocation */
+    int pre_dist = *d->base_pre - x->pre.buffer_alloc;
+    cl_mem pre_mem = x->pre.buffer_mem;
+    cl_mem dst_mem = x->dst.buffer_mem;
+
+    if (d->sixtap_filter == CL_TRUE){
+        sppf = vp8_sixtap_predict4x4_cl;
+    } else
+        sppf = vp8_bilinear_predict4x4_cl;
+
+    if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+    {
+        sppf(d->cl_commands, ptr_base, pre_mem, pre_dist+ptr_offset, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, NULL, dst_mem, dst_offset, dst_stride);
+    }
+    else
+    {
+        int pre_off = pre_dist+ptr_offset;
+        vp8_copy_mem_cl(d->cl_commands, pre_mem,&pre_off,pre_stride, dst_mem, &dst_offset,dst_stride,4,4,1);
+    }
+}
+
+
+/*
+ * Build the inter prediction for a whole macroblock directly in the
+ * destination frame buffer (x->dst.buffer_mem).
+ *
+ * Non-SPLITMV macroblocks use one 16x16 luma operation and two 8x8 chroma
+ * operations. SPLITMV macroblocks are handled per partition / block pair.
+ */
+void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x)
+{
+    cl_mem dst_mem = NULL;
+    cl_mem pre_mem = x->pre.buffer_mem;
+
+    unsigned char *dst_base = x->dst.buffer_alloc;
+    int ydst_off = x->dst.y_buffer - dst_base;
+    int udst_off = x->dst.u_buffer - dst_base;
+    int vdst_off = x->dst.v_buffer - dst_base;
+
+    dst_mem = x->dst.buffer_mem;
+    vp8_cl_mb_prep(x, DST_BUF);
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->cl_commands);
+#endif
+
+    if (x->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        int offset;
+        unsigned char *pre_base = x->pre.buffer_alloc;
+        int ypre_off = x->pre.y_buffer - pre_base;
+        int upre_off = x->pre.u_buffer - pre_base;
+        int vpre_off = x->pre.v_buffer - pre_base;
+
+        int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+        int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+        int pre_stride = x->dst.y_stride;
+
+        int ptr_offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+        /* Luma: 16x16 */
+        if ((mv_row | mv_col) & 7)
+        {
+            if (x->sixtap_filter == CL_TRUE){
+                vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+            }
+            else
+                vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+        }
+        else
+        {
+            int pre_off = ypre_off+ptr_offset;
+            vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 16, 16, 1);
+        }
+
+        /* Chroma: 8x8 U and V, using block 16's motion vector and half the stride */
+        mv_row = x->block[16].bmi.mv.as_mv.row;
+        mv_col = x->block[16].bmi.mv.as_mv.col;
+        pre_stride >>= 1;
+        offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+        if ((mv_row | mv_col) & 7)
+        {
+            if (x->sixtap_filter == CL_TRUE){
+                vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
+                vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
+            } else {
+                vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
+                vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
+            }
+        }
+        else
+        {
+            /* Both chroma planes are copied with a single batched call */
+            int pre_offsets[2] = {upre_off+offset, vpre_off+offset};
+            int dst_offsets[2] = {udst_off,vdst_off};
+            vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, dst_mem, dst_offsets, x->dst.uv_stride, 8, 8, 2);
+        }
+
+    }
+    else
+    {
+        /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
+         * if sth is wrong, go back to what it is in build_inter_predictors_mb.
+         *
+         * ACW: Not sure who the above comment belongs to, but it is
+         * accurate for the decoder. Verified by reverse trace of source
+         *
+         * NOTE(review): every write in this branch targets ydst_off (the
+         * macroblock's luma origin), including the chroma loop -- presumably
+         * per-block destination offsets are still missing; confirm before
+         * relying on this path.
+         */
+        int i;
+
+        if (x->mode_info_context->mbmi.partitioning < 3)
+        {
+            for (i = 0; i < 4; i++)
+            {
+                BLOCKD *d = &x->block[bbb[i]];
+
+                {
+                    unsigned char *ptr_base = *(d->base_pre);
+                    int pre_off = ptr_base - x->pre.buffer_alloc;
+
+                    int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+                    pre_off += ptr_offset;
+
+                    if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+                    {
+                        if (x->sixtap_filter == CL_TRUE)
+                            vp8_sixtap_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                        else
+                            vp8_bilinear_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                    }
+                    else
+                    {
+                        vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 8, 1);
+                    }
+                }
+            }
+        }
+        else
+        {
+            for (i = 0; i < 16; i += 2)
+            {
+                BLOCKD *d0 = &x->block[i];
+                BLOCKD *d1 = &x->block[i+1];
+
+                if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+                {
+                    /*vp8_build_inter_predictors2b(x, d0, 16);*/
+                    unsigned char *ptr_base = *(d0->base_pre);
+
+                    int pre_off = ptr_base - x->pre.buffer_alloc;
+
+                    int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+                    pre_off += ptr_offset;
+
+                    if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
+                    {
+                        if (d0->sixtap_filter == CL_TRUE)
+                            vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                        else
+                            vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem,pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+                    }
+                    else
+                    {
+                        vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d0->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 4, 1);
+                    }
+                }
+                else
+                {
+                    vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
+                    vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
+                }
+            }
+        }
+
+        for (i = 16; i < 24; i += 2)
+        {
+            BLOCKD *d0 = &x->block[i];
+            BLOCKD *d1 = &x->block[i+1];
+
+            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            {
+                /*vp8_build_inter_predictors2b(x, d0, 8);*/
+                unsigned char *ptr_base = *(d0->base_pre);
+                int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+                int pre_off = ptr_base - x->pre.buffer_alloc + ptr_offset;
+
+                if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
+                {
+                    /* Was "sixtap_filter || CL_TRUE", which is always true and
+                     * made the bilinear branch unreachable; compare against
+                     * CL_TRUE as the luma loop above does. */
+                    if (d0->sixtap_filter == CL_TRUE)
+                        vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
+                            d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
+                            dst_base, dst_mem, ydst_off, x->dst.uv_stride);
+                    else
+                        vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
+                            d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
+                            dst_base, dst_mem, ydst_off, x->dst.uv_stride);
+                }
+                else
+                {
+                    vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off,
+                        d0->pre_stride, dst_mem, &ydst_off, x->dst.uv_stride, 8, 4, 1);
+                }
+            }
+            else
+            {
+                vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
+                vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
+            }
+        } //end for
+    }
+
+#if !ONE_CQ_PER_MB
+    VP8_CL_FINISH(x->block[0].cl_commands);
+    VP8_CL_FINISH(x->block[16].cl_commands);
+    VP8_CL_FINISH(x->block[20].cl_commands);
+#endif
+
+    vp8_cl_mb_finish(x, DST_BUF);
+}
diff --git a/vp8/common/opencl/reconinter_cl.h b/vp8/common/opencl/reconinter_cl.h
new file mode 100644
index 000000000..b37584270
--- /dev/null
+++ b/vp8/common/opencl/reconinter_cl.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 
2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef __INC_RECONINTER_CL_H +#define __INC_RECONINTER_CL_H + +#include "blockd_cl.h" +#include "subpixel_cl.h" +#include "filter_cl.h" + +extern void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x); +extern void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x); + +extern void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x); +//extern void vp8_build_inter_predictors_b_cl(BLOCKD *d, int pitch); + +#endif diff --git a/vp8/common/opencl/subpixel_cl.h b/vp8/common/opencl/subpixel_cl.h new file mode 100644 index 000000000..2d80b54da --- /dev/null +++ b/vp8/common/opencl/subpixel_cl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef SUBPIXEL_CL_H +#define SUBPIXEL_CL_H + +#include "../blockd.h" + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#define prototype_subpixel_predict_cl(sym) \ + void sym(cl_command_queue cq, unsigned char *src_base, cl_mem src_mem, int src_offset, \ + int src_pitch, int xofst, int yofst, \ + unsigned char *dst_base, cl_mem dst_mem, int dst_offset, int dst_pitch) + +extern prototype_subpixel_predict_cl(vp8_sixtap_predict16x16_cl); +extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x8_cl); +extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x4_cl); +extern prototype_subpixel_predict_cl(vp8_sixtap_predict4x4_cl); +extern prototype_subpixel_predict_cl(vp8_bilinear_predict16x16_cl); +extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x8_cl); +extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x4_cl); +extern prototype_subpixel_predict_cl(vp8_bilinear_predict4x4_cl); + +typedef prototype_subpixel_predict_cl((*vp8_subpix_cl_fn_t)); + +//typedef enum +//{ +// SIXTAP = 0, +// BILINEAR = 1 +//} SUBPIX_TYPE; + +#endif diff --git a/vp8/common/opencl/vp8_opencl.c b/vp8/common/opencl/vp8_opencl.c new file mode 100644 index 000000000..6ebcc6a7d --- /dev/null +++ b/vp8/common/opencl/vp8_opencl.c @@ -0,0 +1,342 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include "vp8_opencl.h" + +int cl_initialized = VP8_CL_NOT_INITIALIZED; +VP8_COMMON_CL cl_data; + +//Initialization functions for various CL programs. 
+extern int cl_init_filter();
+extern int cl_init_idct();
+extern int cl_init_loop_filter();
+
+//Common CL destructors
+extern void cl_destroy_loop_filter();
+extern void cl_destroy_filter();
+extern void cl_destroy_idct();
+
+//Destructors for encoder/decoder-specific bits
+extern void cl_decode_destroy();
+extern void cl_encode_destroy();
+
+/**
+ * Tear down the shared OpenCL state and record a new status.
+ *
+ * Does nothing unless cl_initialized is currently CL_SUCCESS. Otherwise it
+ * drains and releases cq (when non-NULL), runs the per-module destructors
+ * that are compiled in, releases the shared context and finally stores
+ * new_status in cl_initialized.
+ *
+ * @param cq          command queue to finish and release; may be NULL
+ * @param new_status  value written to cl_initialized afterwards
+ */
+void cl_destroy(cl_command_queue cq, int new_status) {
+
+    if (cl_initialized != CL_SUCCESS)
+        return;
+
+    //Wait on any pending operations to complete... frees up all of our pointers
+    if (cq != NULL)
+        clFinish(cq);
+
+#if ENABLE_CL_SUBPIXEL
+    //Release the objects that we've allocated on the GPU
+    cl_destroy_filter();
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+    cl_destroy_idct();
+
+#if CONFIG_VP8_DECODER
+    if (cl_data.cl_decode_initialized == CL_SUCCESS)
+        cl_decode_destroy();
+#endif
+
+#endif
+#if ENABLE_CL_LOOPFILTER
+    cl_destroy_loop_filter();
+#endif
+
+
+#if CONFIG_VP8_ENCODER
+    //placeholder for if/when encoder CL gets implemented
+#endif
+
+    if (cq){
+        clReleaseCommandQueue(cq);
+    }
+
+    if (cl_data.context){
+        clReleaseContext(cl_data.context);
+        cl_data.context = NULL;
+    }
+
+    cl_initialized = new_status;
+
+    return;
+}
+
+/**
+ * Query the CL_DEVICE_TYPE of a device.
+ *
+ * @param dev  device to query
+ * @return the device type bitfield, or CL_INVALID_DEVICE converted to
+ *         cl_device_type when the query fails (callers in this file only
+ *         compare the result against CL_DEVICE_TYPE_GPU)
+ */
+cl_device_type device_type(cl_device_id dev){
+    cl_device_type type;
+    int err;
+
+    err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(type),&type,NULL);
+    if (err != CL_SUCCESS)
+        return CL_INVALID_DEVICE;
+    return type;
+}
+
+/**
+ * Pick an OpenCL device, create the shared context and build the enabled
+ * CL programs.
+ *
+ * A device qualifies when its extension string contains
+ * cl_khr_byte_addressable_store. A qualifying GPU ends the search; otherwise
+ * the last qualifying device seen on any platform is used.
+ *
+ * @return CL_SUCCESS on success, the current cl_initialized value when
+ *         initialization was already attempted, VP8_CL_TRIED_BUT_FAILED when
+ *         no platform/device/context is available, or the error returned by
+ *         one of the cl_init_* functions
+ */
+int cl_common_init() {
+    int err,i,dev;
+    cl_platform_id platform_ids[MAX_NUM_PLATFORMS];
+    cl_uint num_found = 0, num_devices = 0;
+    cl_device_id devices[MAX_NUM_DEVICES];
+
+    //Don't allow multiple CL contexts.. 
+    if (cl_initialized != VP8_CL_NOT_INITIALIZED)
+        return cl_initialized;
+
+    // Connect to a compute device
+    err = clGetPlatformIDs(MAX_NUM_PLATFORMS, platform_ids, &num_found);
+
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "Couldn't query platform IDs\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    if (num_found == 0) {
+        fprintf(stderr, "No platforms found\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    //num_found is the number of platforms available, which can be larger
+    //than the number of entries actually written to platform_ids.
+    if (num_found > MAX_NUM_PLATFORMS)
+        num_found = MAX_NUM_PLATFORMS;
+
+    //Reset once, before the search, so that a usable device found on an
+    //earlier platform is not thrown away by a later platform without one.
+    cl_data.device_id = NULL;
+
+    //printf("Enumerating %d platform(s)\n", num_found);
+    //Enumerate the platforms found
+    for (i = 0; i < (int)num_found; i++){
+        char buf[2048];
+        size_t len;
+
+        err = clGetPlatformInfo( platform_ids[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, &len);
+        if (err != CL_SUCCESS){
+            fprintf(stderr, "Error retrieving platform vendor for platform %d",i);
+            continue;
+        }
+        //printf("Platform %d: %s\n",i,buf);
+
+        //If you need to force a platform (e.g. CPU-only testing), uncomment this
+        //if (strstr(buf,"NVIDIA"))
+        //    continue;
+
+        //Try to find a valid compute device
+        //Favor the GPU, but fall back to any other available device if necessary
+        num_devices = 0;
+#ifdef __APPLE__
+        printf("Apple system. Running CL as CPU-only for now...\n");
+        err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, MAX_NUM_DEVICES, devices, &num_devices);
+#else
+        err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, MAX_NUM_DEVICES, devices, &num_devices);
+#endif //__APPLE__
+        //A platform without matching devices reports an error; skip it.
+        if (err != CL_SUCCESS)
+            continue;
+
+        //As with platforms, the reported count may exceed the array size.
+        if (num_devices > MAX_NUM_DEVICES)
+            num_devices = MAX_NUM_DEVICES;
+
+        //printf("found %d devices\n", num_devices);
+        for( dev = 0; dev < (int)num_devices; dev++ ){
+            char ext[2048];
+            //Get info for this device.
+            err = clGetDeviceInfo(devices[dev], CL_DEVICE_EXTENSIONS,
+                    sizeof(ext),ext,NULL);
+            VP8_CL_CHECK_SUCCESS(NULL,err != CL_SUCCESS,
+                    "Error retrieving device extension list",continue, 0);
+            //printf("Device %d supports: %s\n",dev,ext);
+
+            //The kernels in VP8 require byte-addressable stores, which is an
+            //extension. It's required in OpenCL 1.1, but not all devices
+            //support it.
+            if (strstr(ext,"cl_khr_byte_addressable_store")){
+                //We found a valid device, so use it. But if we find a GPU
+                //(maybe this is one), prefer that.
+                cl_data.device_id = devices[dev];
+
+                if ( device_type(devices[dev]) == CL_DEVICE_TYPE_GPU ){
+                    //printf("Device %d is a GPU\n",dev);
+                    break;
+                }
+            }
+        }
+
+        //If we've found a usable GPU, stop looking.
+        if (cl_data.device_id != NULL && device_type(cl_data.device_id) == CL_DEVICE_TYPE_GPU )
+            break;
+
+    }
+
+    if (cl_data.device_id == NULL){
+        printf("Error: Failed to find a valid OpenCL device. Using CPU paths\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    // Create the compute context
+    cl_data.context = clCreateContext(0, 1, &cl_data.device_id, NULL, NULL, &err);
+    if (!cl_data.context) {
+        printf("Error: Failed to create a compute context!\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    //Initialize programs to null value
+    //Enables detection of if they've been initialized as well.
+    cl_data.filter_program = NULL;
+    cl_data.idct_program = NULL;
+    cl_data.loop_filter_program = NULL;
+
+#if ENABLE_CL_SUBPIXEL
+    err = cl_init_filter();
+    if (err != CL_SUCCESS)
+        return err;
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+    err = cl_init_idct();
+    if (err != CL_SUCCESS)
+        return err;
+#endif
+
+#if ENABLE_CL_LOOPFILTER
+
+    err = cl_init_loop_filter();
+    if (err != CL_SUCCESS)
+        return err;
+#endif
+
+    return CL_SUCCESS;
+}
+
+/**
+ * Read a whole file into a newly allocated, NUL-terminated buffer.
+ *
+ * The name is tried as given first and then relative to
+ * vpx_codec_lib_dir().
+ *
+ * @param file_name  path of the file to load
+ * @return malloc'd buffer owned by the caller (release with free()), or NULL
+ *         when the file cannot be found, is empty, cannot be read, or memory
+ *         is exhausted
+ */
+char *cl_read_file(const char* file_name) {
+    long pos;
+    char *bytes;
+    size_t amt_read;
+    FILE *f;
+
+    f = fopen(file_name, "rb");
+
+    if (f == NULL) {
+        char *fullpath;
+        //printf("Couldn't find %s\n", file_name);
+
+        //Generate a file path for the CL sources using the library install dir
+        fullpath = malloc(strlen(vpx_codec_lib_dir()) + strlen(file_name) + 2);
+        if (fullpath == NULL) {
+            return NULL;
+        }
+        strcpy(fullpath, vpx_codec_lib_dir());
+        strcat(fullpath, "/"); //Will need to be changed for MSVS
+        strcat(fullpath, file_name);
+
+        //printf("Looking in %s\n", fullpath);
+
+        f = fopen(fullpath, "rb");
+        if (f == NULL) {
+            fprintf(stderr,"Couldn't find CL source at %s or %s\n", file_name, fullpath);
+            free(fullpath);
+            return NULL;
+        }
+
+        //printf("Found cl source at %s\n", fullpath);
+        free(fullpath);
+    } else {
+        //printf("Found cl source at %s\n", file_name);
+    }
+
+    //Determine the file size. A seek/tell failure or an empty file yields
+    //NULL (an empty file already did, because fread() of a zero-sized
+    //element returns 0).
+    if (fseek(f, 0, SEEK_END) != 0) {
+        fclose(f);
+        return NULL;
+    }
+    pos = ftell(f);
+    if (pos <= 0 || fseek(f, 0, SEEK_SET) != 0) {
+        fclose(f);
+        return NULL;
+    }
+
+    bytes = malloc((size_t)pos + 1);
+
+    if (bytes == NULL) {
+        fclose(f);
+        return NULL;
+    }
+
+    amt_read = fread(bytes, (size_t)pos, 1, f);
+    if (amt_read != 1) {
+        free(bytes);
+        fclose(f);
+        return NULL;
+    }
+
+    bytes[pos] = '\0'; //null terminate the source string
+    fclose(f);
+
+
+    return bytes;
+}
+
+/**
+ * Print the build log of a program for cl_data.device_id.
+ *
+ * @param prog_ref  pointer to the program whose log is printed
+ */
+void show_build_log(cl_program *prog_ref){
+    size_t len = 0;
+    char *buffer;
+    int err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
+
+    if (err != CL_SUCCESS){
+        printf("Error: Could not get length of CL build log\n");
+        return; //len is not valid, nothing to allocate or print
+    }
+
+    if (len == 0)
+        return; //no log available
+
+    buffer = (char*) malloc(len);
+    if (buffer == NULL) {
+        printf("Error: Couldn't allocate compile output buffer memory\n");
+        return; //never hand a NULL buffer to the query or to printf
+    }
+
+    err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
+    if (err != CL_SUCCESS) {
+        printf("Error: Could not get CL build log\n");
+
+    } else {
+        printf("Compile output: %s\n", buffer);
+    }
+    free(buffer);
+}
+
+/**
+ * Load an OpenCL source file and build it for the shared context.
+ *
+ * @param prog_ref   receives the created program; set to NULL when the
+ *                   source cannot be read or the program cannot be created
+ * @param file_name  source file, resolved by cl_read_file()
+ * @param opts       build options passed to clBuildProgram()
+ * @return CL_SUCCESS, or VP8_CL_TRIED_BUT_FAILED on any failure
+ */
+int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts) {
+
+    int err;
+    char *kernel_src = cl_read_file(file_name);
+
+    *prog_ref = NULL;
+    if (kernel_src != NULL) {
+        *prog_ref = clCreateProgramWithSource(cl_data.context, 1, (const char**)&kernel_src, NULL, &err);
+        free(kernel_src);
+    } else {
+        cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+        printf("Couldn't find OpenCL source files. \nUsing software path.\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    if (*prog_ref == NULL) {
+        printf("Error: Couldn't create program\n");
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    if (err != CL_SUCCESS) {
+        printf("Error creating program: %d\n", err);
+    }
+
+    /* Build the program executable */
+    err = clBuildProgram(*prog_ref, 0, NULL, opts, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        printf("Error: Failed to build program executable for %s!\n", file_name);
+
+        show_build_log(prog_ref);
+
+        return VP8_CL_TRIED_BUT_FAILED;
+    }
+
+    return CL_SUCCESS;
+}
diff --git a/vp8/common/opencl/vp8_opencl.h b/vp8/common/opencl/vp8_opencl.h
new file mode 100644
index 000000000..fba6ab5d8
--- /dev/null
+++ b/vp8/common/opencl/vp8_opencl.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_OPENCL_H
+#define VP8_OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../../../vpx_config.h"
+
+#ifdef __APPLE__
+#include
+#else
+#include
+#endif
+
+#if HAVE_DLOPEN
+#include "dynamic_cl.h"
+#endif
+
+#define ENABLE_CL_IDCT_DEQUANT 0
+#define ENABLE_CL_SUBPIXEL 1
+#define TWO_PASS_SIXTAP 0
+#define MEM_COPY_KERNEL 1
+#define ONE_CQ_PER_MB 1 //Value of 0 is racy... still experimental. 
+#define ENABLE_CL_LOOPFILTER 0 + +extern char *cl_read_file(const char* file_name); +extern int cl_common_init(); +extern void cl_destroy(cl_command_queue cq, int new_status); +extern int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts); + +#define MAX_NUM_PLATFORMS 4 +#define MAX_NUM_DEVICES 10 + +#define VP8_CL_TRIED_BUT_FAILED 1 +#define VP8_CL_NOT_INITIALIZED -1 +extern int cl_initialized; + +extern const char *vpx_codec_lib_dir(void); + +#define VP8_CL_FINISH(cq) \ + if (cl_initialized == CL_SUCCESS){ \ + /* Wait for kernels to finish. */ \ + clFinish(cq); \ + } + +#define VP8_CL_BARRIER(cq) \ + if (cl_initialized == CL_SUCCESS){ \ + /* Insert a barrier into the command queue. */ \ + clEnqueueBarrier(cq); \ + } + +#define VP8_CL_CHECK_SUCCESS(cq,cond,msg,alt,retCode) \ + if ( cond ){ \ + fprintf(stderr, msg); \ + cl_destroy(cq, VP8_CL_TRIED_BUT_FAILED); \ + alt; \ + return retCode; \ + } + +#define VP8_CL_CALC_LOCAL_SIZE(kernel, kernel_size) \ + err = clGetKernelWorkGroupInfo( cl_data.kernel, \ + cl_data.device_id, \ + CL_KERNEL_WORK_GROUP_SIZE, \ + sizeof(size_t), \ + &cl_data.kernel_size, \ + NULL);\ + VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS, \ + "Error: Failed to calculate local size of kernel!\n", \ + ,\ + VP8_CL_TRIED_BUT_FAILED \ + ); \ + +#define VP8_CL_CREATE_KERNEL(data,program,name,str_name) \ + data.name = clCreateKernel(data.program, str_name , &err); \ + VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS || !data.name, \ + "Error: Failed to create compute kernel "#str_name"!\n", \ + ,\ + VP8_CL_TRIED_BUT_FAILED \ + ); + +#define VP8_CL_READ_BUF(cq, bufRef, bufSize, dstPtr) \ + err = clEnqueueReadBuffer(cq, bufRef, CL_FALSE, 0, bufSize , dstPtr, 0, NULL, NULL); \ + VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, \ + "Error: Failed to read from GPU!\n",, err \ + ); \ + +#define VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode) \ + { \ + err = clEnqueueWriteBuffer(cq, bufRef, CL_FALSE, 0, \ + bufSize, dataPtr, 
0, NULL, NULL); \ + \ + VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, \ + "Error: Failed to write to buffer!\n", \ + altPath, retCode\ + ); \ + } \ + +#define VP8_CL_CREATE_BUF(cq, bufRef, bufType, bufSize, dataPtr, altPath, retCode) \ + bufRef = clCreateBuffer(cl_data.context, CL_MEM_READ_WRITE, bufSize, NULL, NULL); \ + if (dataPtr != NULL && bufRef != NULL){ \ + VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode)\ + } \ + VP8_CL_CHECK_SUCCESS(cq, !bufRef, \ + "Error: Failed to allocate buffer. Using CPU path!\n", \ + altPath, retCode\ + ); \ + +#define VP8_CL_RELEASE_KERNEL(kernel) \ + if (kernel) \ + clReleaseKernel(kernel); \ + kernel = NULL; + +typedef struct VP8_COMMON_CL { + cl_device_id device_id; // compute device id + cl_context context; // compute context + //cl_command_queue commands; // compute command queue + + cl_program filter_program; // compute program for subpixel/bilinear filters + cl_kernel vp8_sixtap_predict_kernel; + size_t vp8_sixtap_predict_kernel_size; + cl_kernel vp8_sixtap_predict8x4_kernel; + size_t vp8_sixtap_predict8x4_kernel_size; + cl_kernel vp8_sixtap_predict8x8_kernel; + size_t vp8_sixtap_predict8x8_kernel_size; + cl_kernel vp8_sixtap_predict16x16_kernel; + size_t vp8_sixtap_predict16x16_kernel_size; + + cl_kernel vp8_bilinear_predict4x4_kernel; + cl_kernel vp8_bilinear_predict8x4_kernel; + cl_kernel vp8_bilinear_predict8x8_kernel; + cl_kernel vp8_bilinear_predict16x16_kernel; + + cl_kernel vp8_filter_block2d_first_pass_kernel; + size_t vp8_filter_block2d_first_pass_kernel_size; + cl_kernel vp8_filter_block2d_second_pass_kernel; + size_t vp8_filter_block2d_second_pass_kernel_size; + + cl_kernel vp8_filter_block2d_bil_first_pass_kernel; + size_t vp8_filter_block2d_bil_first_pass_kernel_size; + cl_kernel vp8_filter_block2d_bil_second_pass_kernel; + size_t vp8_filter_block2d_bil_second_pass_kernel_size; + + cl_kernel vp8_memcpy_kernel; + size_t vp8_memcpy_kernel_size; + cl_kernel vp8_memset_short_kernel; + + cl_program 
idct_program; + cl_kernel vp8_short_inv_walsh4x4_1_kernel; + cl_kernel vp8_short_inv_walsh4x4_1st_pass_kernel; + cl_kernel vp8_short_inv_walsh4x4_2nd_pass_kernel; + cl_kernel vp8_dc_only_idct_add_kernel; + //Note that the following 2 kernels are encoder-only. Not used in decoder. + cl_kernel vp8_short_idct4x4llm_1_kernel; + cl_kernel vp8_short_idct4x4llm_kernel; + + cl_program loop_filter_program; + cl_kernel vp8_loop_filter_horizontal_edge_kernel; + cl_kernel vp8_loop_filter_vertical_edge_kernel; + cl_kernel vp8_mbloop_filter_horizontal_edge_kernel; + cl_kernel vp8_mbloop_filter_vertical_edge_kernel; + cl_kernel vp8_loop_filter_simple_horizontal_edge_kernel; + cl_kernel vp8_loop_filter_simple_vertical_edge_kernel; + + cl_program dequant_program; + cl_kernel vp8_dequant_dc_idct_add_kernel; + cl_kernel vp8_dequant_idct_add_kernel; + cl_kernel vp8_dequantize_b_kernel; + + cl_int cl_decode_initialized; + cl_int cl_encode_initialized; + +} VP8_COMMON_CL; + +extern VP8_COMMON_CL cl_data; + +#ifdef __cplusplus +} +#endif + +#endif /* VP8_OPENCL_H */ + diff --git a/vp8/common/quant_common.c b/vp8/common/quant_common.c index e9833fe33..9ea57571b 100644 --- a/vp8/common/quant_common.c +++ b/vp8/common/quant_common.c @@ -66,6 +66,7 @@ int vp8_dc2quant(int QIndex, int Delta) return retval; } + int vp8_dc_uv_quant(int QIndex, int Delta) { int retval; @@ -116,6 +117,7 @@ int vp8_ac2quant(int QIndex, int Delta) return retval; } + int vp8_ac_uv_quant(int QIndex, int Delta) { int retval; diff --git a/vp8/common/recon.c b/vp8/common/recon.c index d72d6e410..a80ef8a26 100644 --- a/vp8/common/recon.c +++ b/vp8/common/recon.c @@ -110,19 +110,19 @@ void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) { #if ARCH_ARM BLOCKD *b = &x->block[0]; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, 
b->dst_stride); /*b = &x->block[4];*/ b += 4; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); /*b = &x->block[8];*/ b += 4; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); /*b = &x->block[12];*/ b += 4; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); #else int i; @@ -130,7 +130,7 @@ void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) { BLOCKD *b = &x->block[i]; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); } #endif } @@ -140,27 +140,27 @@ void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) #if ARCH_ARM BLOCKD *b = &x->block[0]; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); b += 4; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); b += 4; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, 
&b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); b += 4; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); b += 4; /*b = &x->block[16];*/ - RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); b++; b++; - RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); b++; b++; - RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); b++; b++; - RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); #else int i; @@ -168,14 +168,14 @@ void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) { BLOCKD *b = &x->block[i]; - RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); } for (i = 16; i < 24; i += 2) { BLOCKD *b = &x->block[i]; - RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); } #endif } diff 
--git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index 7cfab4140..08c5a9158 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "vpx_ports/config.h" #include "recon.h" #include "subpixel.h" @@ -18,6 +17,12 @@ #include "onyxc_int.h" #endif +#if CONFIG_OPENCL +#include "opencl/vp8_opencl.h" +#include "opencl/filter_cl.h" +#include "opencl/reconinter_cl.h" +#endif + /* use this define on systems where unaligned int reads and writes are * not allowed, i.e. ARM architectures */ @@ -27,7 +32,7 @@ static const int bbb[4] = {0, 2, 8, 10}; - +//Copy 16 x 16-bytes from src to dst. void vp8_copy_mem16x16_c( unsigned char *src, int src_stride, @@ -37,6 +42,9 @@ void vp8_copy_mem16x16_c( int r; + //Set this up as a 2D kernel. Each loop iteration is X, each byte/int within + //is the Y address. + for (r = 0; r < 16; r++) { #ifdef MUST_BE_ALIGNED @@ -71,6 +79,7 @@ void vp8_copy_mem16x16_c( } +//Copy 8 x 8-bytes void vp8_copy_mem8x8_c( unsigned char *src, int src_stride, @@ -136,34 +145,32 @@ void vp8_copy_mem8x4_c( void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf) { int r; - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; - ptr_base = *(d->base_pre); + //d->base_pre is the start of the previous frame's y_buffer, u_buffer, or v_buffer + unsigned char *ptr_base = *(d->base_pre); + int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); + + unsigned char *pred_ptr = d->predictor_base + d->predictor_offset; if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); - sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch); + sppf(ptr_base+ptr_offset, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, 
pitch); } else { - ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); - ptr = ptr_base; for (r = 0; r < 4; r++) { #ifdef MUST_BE_ALIGNED - pred_ptr[0] = ptr[0]; - pred_ptr[1] = ptr[1]; - pred_ptr[2] = ptr[2]; - pred_ptr[3] = ptr[3]; + pred_ptr[0] = ptr_base[ptr_offset]; + pred_ptr[1] = ptr_base[ptr_offset+1]; + pred_ptr[2] = ptr_base[ptr_offset+2]; + pred_ptr[3] = ptr_base[ptr_offset+3]; #else - *(int *)pred_ptr = *(int *)ptr ; + *(int *)pred_ptr = *(int *)(ptr_base+ptr_offset) ; #endif pred_ptr += pitch; - ptr += d->pre_stride; + ptr_offset += d->pre_stride; } } } @@ -172,7 +179,7 @@ static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch) { unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; + unsigned char *pred_ptr = d->predictor_base + d->predictor_offset; ptr_base = *(d->base_pre); ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); @@ -191,7 +198,7 @@ static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch) { unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; + unsigned char *pred_ptr = d->predictor_base + d->predictor_offset; ptr_base = *(d->base_pre); ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); @@ -206,11 +213,22 @@ static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch) } } - +/* Encoder only */ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x) { int i; +#if CONFIG_OPENCL + if ( 0 && cl_initialized == CL_SUCCESS ){ + vp8_build_inter_predictors_mbuv_cl(x); + VP8_CL_FINISH(x->cl_commands); + VP8_CL_FINISH(x->block[0].cl_commands); + VP8_CL_FINISH(x->block[16].cl_commands); + VP8_CL_FINISH(x->block[20].cl_commands); + return; + } +#endif + if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && x->mode_info_context->mbmi.mode != SPLITMV) { @@ -229,8 +247,8 @@ void 
vp8_build_inter_predictors_mbuv(MACROBLOCKD *x) if ((mv_row | mv_col) & 7) { - x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); - x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); } else { @@ -260,8 +278,8 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x) void vp8_build_inter_predictors_mby(MACROBLOCKD *x) { - if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && - x->mode_info_context->mbmi.mode != SPLITMV) + if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && + x->mode_info_context->mbmi.mode != SPLITMV) { unsigned char *ptr_base; unsigned char *ptr; @@ -275,7 +293,7 @@ void vp8_build_inter_predictors_mby(MACROBLOCKD *x) if ((mv_row | mv_col) & 7) { - x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); } else { @@ -354,8 +372,8 @@ void vp8_build_inter_predictors_mb(MACROBLOCKD *x) if ((mv_row | mv_col) & 7) { - x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); - x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); } else { @@ -492,7 +510,7 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel) } -/* The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this +/* The following functions are written for skip_recon_mb() to call. Since there is no recon in this * situation, we can write the result directly to dst buffer instead of writing it to predictor * buffer and then copying it to dst buffer. 
*/ @@ -501,22 +519,20 @@ static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp int r; unsigned char *ptr_base; unsigned char *ptr; - /*unsigned char *pred_ptr = d->predictor;*/ + /*unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;*/ int dst_stride = d->dst_stride; int pre_stride = d->pre_stride; + int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); ptr_base = *(d->base_pre); + ptr = ptr_base + ptr_offset; if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) { - ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride); } else { - ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); - ptr = ptr_base; - for (r = 0; r < 4; r++) { #ifdef MUST_BE_ALIGNED @@ -534,14 +550,17 @@ static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp } - void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) { - /*unsigned char *pred_ptr = x->block[0].predictor; - unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;*/ - unsigned char *pred_ptr = x->predictor; unsigned char *dst_ptr = x->dst.y_buffer; +#if CONFIG_OPENCL && ENABLE_CL_SUBPIXEL + if (cl_initialized == CL_SUCCESS){ + vp8_build_inter_predictors_mb_s_cl(x); + return; + } +#endif + if (x->mode_info_context->mbmi.mode != SPLITMV) { int offset; @@ -563,7 +582,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) if ((mv_row | mv_col) & 7) { - x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/ + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/ } else { @@ -579,8 +598,8 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) if ((mv_row | mv_col) & 7) { - x->subpixel_predict8x8(uptr, 
pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride); - x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride); + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride); } else { @@ -592,6 +611,8 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) { /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later, * if sth is wrong, go back to what it is in build_inter_predictors_mb. + * + * ACW: note: Not sure who the above comment belongs to. */ int i; @@ -605,7 +626,6 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) { unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; ptr_base = *(d->base_pre); ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); @@ -621,7 +641,7 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) } } } - else + else { for (i = 0; i < 16; i += 2) { @@ -633,7 +653,6 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) /*build_inter_predictors2b(x, d0, 16);*/ unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = d0->predictor; ptr_base = *(d0->base_pre); ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3); @@ -665,7 +684,6 @@ void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) /*build_inter_predictors2b(x, d0, 8);*/ unsigned char *ptr_base; unsigned char *ptr; - unsigned char *pred_ptr = d0->predictor; ptr_base = *(d0->base_pre); ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3); diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index 9cf5f6a88..a69212b9f 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -24,7 +24,7 @@ void vp8_recon_intra_mbuv(const 
vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x) for (i = 16; i < 24; i += 2) { BLOCKD *b = &x->block[i]; - RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); } } diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c index cd70dca73..ce3c0021c 100644 --- a/vp8/common/reconintra4x4.c +++ b/vp8/common/reconintra4x4.c @@ -124,6 +124,18 @@ void vp8_predict_intra4x4(BLOCKD *x, case B_LD_PRED: { unsigned char *ptr = Above; + +#if 0 + //More readable version of the unrolled loop + int stride = 16, r=0, c=0; + for (r=0; r < 4; r++){ + for (c=0; c < 4; c++){ + int off = r+c; + int off2 = off > 5 ? 5: off; //Clamp off2 to 5 so [3,3] reads at most ptr[off2+2] == ptr[7] + predictor[r*stride+c] = (ptr[off] + ptr[off+1]*2 + ptr[off2+2] + 2)>>2; + } + } +#else predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2; predictor[0 * 16 + 1] = predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2; @@ -140,7 +152,8 @@ void vp8_predict_intra4x4(BLOCKD *x, predictor[2 * 16 + 3] = predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2; predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2; - +#endif + } break; case B_RD_PRED: @@ -311,5 +324,3 @@ void vp8_intra_prediction_down_copy(MACROBLOCKD *x) *dst_ptr1 = *src_ptr; *dst_ptr2 = *src_ptr; } - - diff --git a/vp8/common/swapyv12buffer.c b/vp8/common/swapyv12buffer.c index 73656b3d7..634e432ce 100644 --- a/vp8/common/swapyv12buffer.c +++ b/vp8/common/swapyv12buffer.c @@ -11,10 +11,16 @@ #include "swapyv12buffer.h" + + void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame) { unsigned char *temp; - +#if CONFIG_OPENCL + cl_mem temp_mem; +#endif + int temp_size; + temp = last_frame->buffer_alloc; last_frame->buffer_alloc = new_frame->buffer_alloc; new_frame->buffer_alloc = temp; @@ -31,4 +37,14 @@ void
vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *las last_frame->v_buffer = new_frame->v_buffer; new_frame->v_buffer = temp; + temp_size = last_frame->buffer_size; + last_frame->buffer_size = new_frame->buffer_size; + new_frame->buffer_size = temp_size; + +#if CONFIG_OPENCL + temp_mem = last_frame->buffer_mem; + last_frame->buffer_mem = new_frame->buffer_mem; + new_frame->buffer_mem = temp_mem; +#endif + } diff --git a/vp8/decoder/arm/dequantize_arm.c b/vp8/decoder/arm/dequantize_arm.c index d88adb729..152d8756c 100644 --- a/vp8/decoder/arm/dequantize_arm.c +++ b/vp8/decoder/arm/dequantize_arm.c @@ -27,8 +27,8 @@ extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); void vp8_dequantize_b_neon(BLOCKD *d) { int i; - short *DQ = d->dqcoeff; - short *Q = d->qcoeff; + short *DQ = d->dqcoeff_base + d->dqcoeff_offset; + short *Q = d->qcoeff_base + d->qcoeff_offset; short *DQC = d->dequant; vp8_dequantize_b_loop_neon(Q, DQC, DQ); @@ -39,8 +39,8 @@ void vp8_dequantize_b_neon(BLOCKD *d) void vp8_dequantize_b_v6(BLOCKD *d) { int i; - short *DQ = d->dqcoeff; - short *Q = d->qcoeff; + short *DQ = d->dqcoeff_base + d->dqcoeff_offset; + short *Q = d->qcoeff_base + d->qcoeff_offset; short *DQC = d->dequant; vp8_dequantize_b_loop_v6(Q, DQC, DQ); diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 82841e8b8..11e9b5f73 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -17,7 +17,6 @@ #include "vp8/common/reconinter.h" #include "dequantize.h" #include "detokenize.h" -#include "vp8/common/invtrans.h" #include "vp8/common/alloccommon.h" #include "vp8/common/entropymode.h" #include "vp8/common/quant_common.h" @@ -33,10 +32,21 @@ #include "vp8/common/threading.h" #include "decoderthreading.h" #include "dboolhuff.h" +#include "vp8/common/blockd.h" #include #include +#include "vpx_config.h" +#if CONFIG_OPENCL +#include "vp8/common/opencl/vp8_opencl.h" +#include "vp8/common/opencl/blockd_cl.h" +#include 
"opencl/dequantize_cl.h" +#include "opencl/decodframe_cl.h" +#endif + +#define PROFILE_OUTPUT 0 + void vp8cx_init_de_quantizer(VP8D_COMP *pbi) { int i; @@ -98,6 +108,10 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) xd->block[24].dequant = pc->Y2dequant[QIndex]; +#if CONFIG_OPENCL && ENABLE_CL_IDCT_DEQUANT + mb_init_dequantizer_cl(xd); +#endif + } #if CONFIG_RUNTIME_CPU_DETECT @@ -121,6 +135,14 @@ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) else { vp8_build_inter_predictors_mb_s(xd); +#if CONFIG_OPENCL + VP8_CL_FINISH(xd->cl_commands); +#if !ONE_CQ_PER_MB + VP8_CL_FINISH(xd->block[0].cl_commands); + VP8_CL_FINISH(xd->block[16].cl_commands); + VP8_CL_FINISH(xd->block[20].cl_commands); +#endif +#endif } } @@ -177,6 +199,7 @@ void clamp_mvs(MACROBLOCKD *xd) static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) { + int eobtotal = 0; int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs; @@ -197,6 +220,27 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) xd->mode_info_context->mbmi.dc_diff = 1; +#if PROFILE_OUTPUT + if (xd->frame_type == KEY_FRAME) + printf("Intra-Coded MB\n"); + else{ + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME){ + printf("Intra-Coded Inter-Frame MB\n"); + } else { + printf("Inter-Coded MB\n"); + } + } +#endif + +#if CONFIG_OPENCL + //If OpenCL is enabled and initialized, use CL-specific decoder for remains + //of MB decoding. 
+ if (cl_initialized == CL_SUCCESS){ + vp8_decode_macroblock_cl(pbi, xd, eobtotal); + return; + } +#endif + if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0) { xd->mode_info_context->mbmi.dc_diff = 0; @@ -229,68 +273,68 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV) { BLOCKD *b = &xd->block[24]; + short *qcoeff = b->qcoeff_base + b->qcoeff_offset; + vp8_second_order_fn_t second_order; + DEQUANT_INVOKE(&pbi->dequant, block)(b); /* do 2nd order transform on the dc block */ - if (xd->eobs[24] > 1) - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; - } - else - { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; + if (xd->eobs[24] > 1){ + second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16); + ((int *)qcoeff)[0] = 0; + ((int *)qcoeff)[1] = 0; + ((int *)qcoeff)[2] = 0; + ((int *)qcoeff)[3] = 0; + ((int *)qcoeff)[4] = 0; + ((int *)qcoeff)[5] = 0; + ((int *)qcoeff)[6] = 0; + ((int *)qcoeff)[7] = 0; + } else { + second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1); + ((int *)qcoeff)[0] = 0; } + second_order(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]); DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->predictor, xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); + (xd->qcoeff, xd->block[0].dequant, + xd->predictor, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]); } else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && 
xd->mode_info_context->mbmi.mode == B_PRED) { for (i = 0; i < 16; i++) { - BLOCKD *b = &xd->block[i]; - vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + short *qcoeff = b->qcoeff_base + b->qcoeff_offset; + vp8_predict_intra4x4(b, b->bmi.mode, b->predictor_base + b->predictor_offset); if (xd->eobs[i] > 1) { DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, b->predictor, + (qcoeff, b->dequant, b->predictor_base + b->predictor_offset, *(b->base_dst) + b->dst, 16, b->dst_stride); } else { IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], b->predictor, + (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset, *(b->base_dst) + b->dst, 16, b->dst_stride); - ((int *)b->qcoeff)[0] = 0; + ((int *)qcoeff)[0] = 0; } } - } else { DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block) - (xd->qcoeff, xd->block[0].dequant, - xd->predictor, xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs); + (xd->qcoeff, xd->block[0].dequant, + xd->predictor, xd->dst.y_buffer, + xd->dst.y_stride, xd->eobs); } DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block) - (xd->qcoeff+16*16, xd->block[16].dequant, - xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer, - xd->dst.uv_stride, xd->eobs+16); + (xd->qcoeff+16*16, xd->block[16].dequant, + xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.uv_stride, xd->eobs+16); } @@ -343,6 +387,13 @@ decode_mb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mb_row, MACROBLOCKD *xd) xd->mb_to_top_edge = -((mb_row * 16)) << 3; xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; + + xd->dst.buffer_alloc = pc->yv12_fb[dst_fb_idx].buffer_alloc; + xd->dst.buffer_size = pc->yv12_fb[dst_fb_idx].buffer_size; +#if CONFIG_OPENCL + xd->dst.buffer_mem = pc->yv12_fb[dst_fb_idx].buffer_mem; +#endif + for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) { @@ -378,6 +429,11 @@ decode_mb_row(VP8D_COMP *pbi, VP8_COMMON *pc, int mb_row, MACROBLOCKD *xd) xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + 
recon_yoffset; xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset; xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset; + xd->pre.buffer_alloc = pc->yv12_fb[ref_fb_idx].buffer_alloc; + xd->pre.buffer_size = pc->yv12_fb[ref_fb_idx].buffer_size; +#if CONFIG_OPENCL + xd->pre.buffer_mem = pc->yv12_fb[ref_fb_idx].buffer_mem; +#endif if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) { @@ -517,7 +573,7 @@ static void init_frame(VP8D_COMP *pbi) vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); xd->mb_segement_abs_delta = SEGMENT_DELTADATA; - /* reset the mode ref deltasa for loop filter */ + /* reset the mode ref deltas for loop filter */ vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas)); @@ -535,14 +591,13 @@ static void init_frame(VP8D_COMP *pbi) } else { - if (!pc->use_bilinear_mc_filter) - pc->mcomp_filter_type = SIXTAP; - else - pc->mcomp_filter_type = BILINEAR; - /* To enable choice of different interploation filters */ + /* To enable choice of different interpolation filters */ if (pc->mcomp_filter_type == SIXTAP) { +#if CONFIG_OPENCL + xd->sixtap_filter = CL_TRUE; +#endif xd->subpixel_predict = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap4x4); xd->subpixel_predict8x4 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x4); xd->subpixel_predict8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x8); @@ -550,6 +605,9 @@ static void init_frame(VP8D_COMP *pbi) } else { +#if CONFIG_OPENCL + xd->sixtap_filter = CL_FALSE; +#endif xd->subpixel_predict = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear4x4); xd->subpixel_predict8x4 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x4); xd->subpixel_predict8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x8); @@ -565,6 +623,7 @@ static void init_frame(VP8D_COMP *pbi) xd->corrupted = 0; /* init without corruption */ } + int vp8_decode_frame(VP8D_COMP *pbi) { vp8_reader *const bc = & pbi->bc; @@ -614,9 +673,12 
@@ int vp8_decode_frame(VP8D_COMP *pbi) pc->vert_scale = data[6] >> 6; data += 7; + //Allow resolution changes on key frames. if (Width != pc->Width || Height != pc->Height) { +#if CONFIG_MULTITHREAD int prev_mb_rows = pc->mb_rows; +#endif if (pc->Width <= 0) { @@ -807,19 +869,17 @@ int vp8_decode_frame(VP8D_COMP *pbi) pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc); - if (0) - { - FILE *z = fopen("decodestats.stt", "a"); - fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", - pc->current_video_frame, - pc->frame_type, - pc->refresh_golden_frame, - pc->refresh_alt_ref_frame, - pc->refresh_last_frame, - pc->base_qindex); - fclose(z); - } - +#if 0 + FILE *z = fopen("decodestats.stt", "a"); + fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", + pc->current_video_frame, + pc->frame_type, + pc->refresh_golden_frame, + pc->refresh_alt_ref_frame, + pc->refresh_last_frame, + pc->base_qindex); + fclose(z); +#endif { /* read coef probability tree */ @@ -840,6 +900,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) } } + //Set up the macroblock's previous/destination buffers vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG)); vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG)); @@ -849,13 +910,13 @@ int vp8_decode_frame(VP8D_COMP *pbi) #endif vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]); + /* clear out the coeff buffer */ + vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); + vp8_setup_block_dptrs(xd); vp8_build_block_doffsets(xd); - /* clear out the coeff buffer */ - vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff)); - /* Read the mb_no_coeff_skip flag */ pc->mb_no_coeff_skip = (int)vp8_read_bit(bc); @@ -866,6 +927,13 @@ int vp8_decode_frame(VP8D_COMP *pbi) vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO)); +#if PROFILE_OUTPUT + if (pc->frame_type == KEY_FRAME) + printf("Key Frame\n"); + else + printf("Inter-Frame\n"); +#endif + #if CONFIG_MULTITHREAD if (pbi->b_multithreaded_rd && 
pc->multi_token_partition != ONE_PARTITION) { @@ -886,7 +954,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) int ibc = 0; int num_part = 1 << pc->multi_token_partition; - /* Decode the individual macro block */ + /* Decode the individual macro blocks */ for (mb_row = 0; mb_row < pc->mb_rows; mb_row++) { @@ -903,7 +971,10 @@ int vp8_decode_frame(VP8D_COMP *pbi) } } - +#if CONFIG_OPENCL + vp8_decode_frame_cl_finish(pbi); +#endif + stop_token_decoder(pbi); /* Collect information about decoder corruption. */ diff --git a/vp8/decoder/dequantize.c b/vp8/decoder/dequantize.c index dd0c13b7d..74f6c6b4d 100644 --- a/vp8/decoder/dequantize.c +++ b/vp8/decoder/dequantize.c @@ -21,8 +21,8 @@ extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch); void vp8_dequantize_b_c(BLOCKD *d) { int i; - short *DQ = d->dqcoeff; - short *Q = d->qcoeff; + short *DQ = d->dqcoeff_base + d->dqcoeff_offset; + short *Q = d->qcoeff_base + d->qcoeff_offset; short *DQC = d->dequant; for (i = 0; i < 16; i++) diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index c22e0f28c..0a818a6f0 100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c @@ -155,8 +155,8 @@ DECLARE_ALIGNED(16, extern const unsigned char, vp8dx_bitreader_norm[256]); Prob = coef_probs + (ENTROPY_NODES*2); \ if(c < 15){\ qcoeff_ptr [ scan[c] ] = (INT16) v; \ - ++c; \ - goto DO_WHILE; }\ + continue; \ + }\ qcoeff_ptr [ scan[15] ] = (INT16) v; \ goto BLOCK_FINISHED; @@ -249,7 +249,8 @@ BLOCK_LOOP: Prob = coef_probs; Prob += v * ENTROPY_NODES; -DO_WHILE: +do{ + Prob += coef_bands_x[c]; DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED); @@ -328,9 +329,8 @@ ONE_CONTEXT_NODE_0_: if (c < 15) { qcoeff_ptr [ scan[c] ] = (INT16) v; - ++c; - goto DO_WHILE; } +} while (c++ < 15); qcoeff_ptr [ scan[15] ] = (INT16) v; BLOCK_FINISHED: diff --git a/vp8/decoder/generic/dsystemdependent.c b/vp8/decoder/generic/dsystemdependent.c index 2406deaaf..70dbf9c55 100644 --- 
a/vp8/decoder/generic/dsystemdependent.c +++ b/vp8/decoder/generic/dsystemdependent.c @@ -15,6 +15,7 @@ extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi); extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi); +extern void vp8_arch_opencl_decode_init(VP8D_COMP *pbi); void vp8_dmachine_specific_config(VP8D_COMP *pbi) { @@ -36,4 +37,8 @@ void vp8_dmachine_specific_config(VP8D_COMP *pbi) #if ARCH_ARM vp8_arch_arm_decode_init(pbi); #endif + +#if CONFIG_OPENCL && (ENABLE_CL_IDCT_DEQUANT) + vp8_arch_opencl_decode_init(pbi); +#endif } diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index 1e83ab542..dd2e08f01 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -34,9 +34,22 @@ #include "vpx_ports/arm.h" #endif +#include "vpx_config.h" +#if CONFIG_OPENCL +#include "vp8/common/opencl/blockd_cl.h" +#include "vp8/common/opencl/vp8_opencl.h" +#endif + extern void vp8_init_loop_filter(VP8_COMMON *cm); extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi); +#define PROFILE_OUTPUT 0 +#if PROFILE_OUTPUT +struct vpx_usec_timer frame_timer; +struct vpx_usec_timer loop_filter_timer; +unsigned int total_mb = 0; +unsigned int total_loop_filter = 0; +#endif void vp8dx_initialize() { @@ -113,6 +126,7 @@ void vp8dx_remove_decompressor(VP8D_PTR ptr) vp8_decoder_remove_threads(pbi); #endif vp8_remove_common(&pbi->common); + vpx_free(pbi); } @@ -319,8 +333,83 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->Source = source; pbi->source_sz = size; +#if CONFIG_OPENCL + pbi->mb.cl_commands = NULL; + if (cl_initialized == CL_SUCCESS){ + int err; + //Create command queue for macroblock. 
+ pbi->mb.cl_commands = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err); + if (!pbi->mb.cl_commands || err != CL_SUCCESS) { + printf("Error: Failed to create a command queue!\n"); + cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED); + } + + pbi->mb.cl_diff_mem = NULL; + pbi->mb.cl_predictor_mem = NULL; + pbi->mb.cl_qcoeff_mem = NULL; + pbi->mb.cl_dqcoeff_mem = NULL; + pbi->mb.cl_eobs_mem = NULL; + +#define SET_ON_ALLOC 0 +#if SET_ON_ALLOC + +#if ENABLE_CL_SUBPIXEL || ENABLE_CL_IDCT_DEQUANT + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_predictor_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, + sizeof(cl_uchar)*384, pbi->mb.predictor, goto BUF_DONE, -1); +#endif + +#if ENABLE_CL_IDCT_DEQUANT + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_diff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, + sizeof(cl_short)*400, pbi->mb.diff, goto BUF_DONE, -1); + + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_qcoeff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, + sizeof(cl_short)*400, pbi->mb.qcoeff, goto BUF_DONE,-1); + + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_dqcoeff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, + sizeof(cl_short)*400, pbi->mb.dqcoeff, goto BUF_DONE,-1); + + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_eobs_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR, + sizeof(cl_char)*25, pbi->mb.eobs, goto BUF_DONE,-1); +#endif +#else +#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_predictor_mem, CL_MEM_READ_WRITE, + sizeof(cl_uchar)*384, NULL, goto BUF_DONE,-1); +#endif + +#if ENABLE_CL_IDCT_DEQUANT + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_diff_mem, CL_MEM_READ_WRITE, + sizeof(cl_short)*400, NULL, goto BUF_DONE,-1); + + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_qcoeff_mem, CL_MEM_READ_WRITE, + sizeof(cl_short)*400, NULL, goto BUF_DONE,-1); + + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_dqcoeff_mem, CL_MEM_READ_WRITE, + sizeof(cl_short)*400, NULL, goto 
BUF_DONE,-1); + + VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_eobs_mem, CL_MEM_READ_WRITE, + sizeof(cl_char) * 25, NULL, goto BUF_DONE,-1); +#endif +#endif + } +#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL + BUF_DONE: +#endif +#endif + +#if PROFILE_OUTPUT + printf("Frame size = %d * %d\n", cm->Height, cm->Width); + printf("Macroblocks = %d * %d\n", cm->mb_rows, cm->mb_cols); + + vpx_usec_timer_start(&frame_timer); +#endif retcode = vp8_decode_frame(pbi); +#if PROFILE_OUTPUT + vpx_usec_timer_mark(&frame_timer); + total_mb += vpx_usec_timer_elapsed(&frame_timer); +#endif + if (retcode < 0) { #if HAVE_ARMV7 @@ -375,16 +464,53 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign if(pbi->common.filter_level) { + +#if PROFILE_OUTPUT + struct vpx_usec_timer lpftimer; + vpx_usec_timer_start(&lpftimer); +#endif + /* Apply the loop filter if appropriate. */ vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level); +#if PROFILE_OUTPUT + vpx_usec_timer_mark(&lpftimer); + pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer); + + printf("Loop Filter\n"); + total_loop_filter += vpx_usec_timer_elapsed(&lpftimer); +#if 0 + if (pbi->common.filter_type == NORMAL_LOOPFILTER){ + printf("Normal LF Time (us): %d\n", vpx_usec_timer_elapsed(&lpftimer)); + } else { + printf("Simple LF Time (us): %d\n", vpx_usec_timer_elapsed(&lpftimer)); + } +#endif +#endif + cm->last_frame_type = cm->frame_type; cm->last_filter_type = cm->filter_type; cm->last_sharpness_level = cm->sharpness_level; } +#if PROFILE_OUTPUT + else { + printf("No Loop Filter\n"); + } +#endif vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show); } +#if CONFIG_OPENCL + if (cl_initialized == CL_SUCCESS){ + //Copy buffer_alloc to buffer_mem so YV12_BUFFER_CONFIG can be used as + //a reference frame (e.g. YV12..buffer_mem contains same as buffer_alloc). 
+ vp8_cl_mb_prep(&pbi->mb, DST_BUF); + + if (pbi->mb.cl_commands != NULL) + clReleaseCommandQueue(pbi->mb.cl_commands); + pbi->mb.cl_commands = NULL; + } +#endif vp8_clear_system_state(); @@ -439,8 +565,18 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign } #endif pbi->common.error.setjmp = 0; + + +#if PROFILE_OUTPUT + //Dump the total MB/Loop Filter processing times. + //This is cumulative between frames, so only use the last output value. + printf("MB Time (us): %d, LF Time (us): %d\n", total_mb, total_loop_filter); +#endif + + return retcode; } + int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags) { int ret = -1; diff --git a/vp8/decoder/opencl/decodframe_cl.c b/vp8/decoder/opencl/decodframe_cl.c new file mode 100644 index 000000000..dfcc0a7b1 --- /dev/null +++ b/vp8/decoder/opencl/decodframe_cl.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+
+#include "../onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/reconinter.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "vp8/common/opencl/vp8_opencl.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/reconinter_cl.h"
+#include "dequantize_cl.h"
+#endif
+
+#define PROFILE_OUTPUT 0
+
+//Implemented in ../decodframe.c
+extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+/* Create a 16-entry cl_short dequant buffer for each of the 25 blocks of xd
+ * (only when OpenCL initialised). NOTE(review): the flags argument passed to
+ * VP8_CL_CREATE_BUF is empty here - confirm against the macro definition. */
+void mb_init_dequantizer_cl(MACROBLOCKD *xd){
+    int i, err;
+    //Set up per-block dequant CL memory. Eventually, might be able to set up
+    //one large buffer containing the entire large dequant buffer.
+    if (cl_initialized == CL_SUCCESS){
+        for (i=0; i < 25; i++){
+
+#if 1 //Initialize CL memory on allocation?
+            VP8_CL_CREATE_BUF(xd->cl_commands, xd->block[i].cl_dequant_mem,
+                ,
+                16*sizeof(cl_short),
+                xd->block[i].dequant,,
+            );
+#else
+            VP8_CL_CREATE_BUF(xd->cl_commands, xd->block[i].cl_dequant_mem,
+                ,
+                16*sizeof(cl_short),
+                NULL,,
+            );
+#endif
+        }
+    }
+}
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
+
+/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
+ * to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
+ */
+static void skip_recon_mb_cl(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+
+        vp8_build_intra_predictors_mbuv_s(xd);
+        RECON_INVOKE(&pbi->common.rtcd.recon,
+                     build_intra_predictors_mby_s)(xd);
+
+    }
+    else
+    {
+#if ENABLE_CL_SUBPIXEL
+        if (cl_initialized == CL_SUCCESS)
+        {
+            vp8_build_inter_predictors_mb_s_cl(xd);
+        } else
+#endif
+        {
+            vp8_build_inter_predictors_mb_s(xd);
+        }
+        VP8_CL_FINISH(xd->cl_commands);
+#if !ONE_CQ_PER_MB
+        VP8_CL_FINISH(xd->block[0].cl_commands);
+        VP8_CL_FINISH(xd->block[16].cl_commands);
+        VP8_CL_FINISH(xd->block[20].cl_commands);
+#endif
+    }
+}
+
+/* Decode one macroblock: build the intra/inter prediction, then dequantize,
+ * inverse-transform and add the residual for the Y, U and V blocks.
+ * Each stage uses the OpenCL path when it is compiled in and
+ * cl_initialized == CL_SUCCESS, and the CPU implementation otherwise.
+ */
+void vp8_decode_macroblock_cl(VP8D_COMP *pbi, MACROBLOCKD *xd, int eobtotal)
+{
+    int i;
+
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+    {
+        xd->mode_info_context->mbmi.dc_diff = 0;
+        skip_recon_mb_cl(pbi, xd);
+        return;
+    }
+
+    if (xd->segmentation_enabled)
+        mb_init_dequantizer(pbi, xd);
+
+    /* do prediction */
+    if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+    {
+        vp8_build_intra_predictors_mbuv(xd);
+
+        if (xd->mode_info_context->mbmi.mode != B_PRED)
+        {
+            RECON_INVOKE(&pbi->common.rtcd.recon,
+                         build_intra_predictors_mby)(xd);
+        } else {
+            vp8_intra_prediction_down_copy(xd);
+        }
+    }
+    else
+    {
+#if ENABLE_CL_SUBPIXEL
+        if (cl_initialized == CL_SUCCESS)
+            vp8_build_inter_predictors_mb_cl(xd);
+        else
+#endif
+            vp8_build_inter_predictors_mb(xd);
+
+#if !ENABLE_CL_IDCT_DEQUANT
+        //Wait for inter-predict if dequant/IDCT is being done on the CPU
+        VP8_CL_FINISH(xd->cl_commands);
+#endif
+    }
+
+    /* dequantization and idct */
+    if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
+    {
+        BLOCKD *b = &xd->block[24];
+        short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+        vp8_second_order_fn_t second_order;
+
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS){
+            vp8_cl_block_prep(b, DEQUANT|QCOEFF);
+            vp8_dequantize_b_cl(b);
+            vp8_cl_block_finish(b, DQCOEFF);
+            VP8_CL_FINISH(b->cl_commands); //Keep until qcoeff memset below is CL
+        }
+        else
+#endif
+        {
+            DEQUANT_INVOKE(&pbi->dequant, block)(b);
+        }
+
+
+        /* do 2nd order transform on the dc block */
+        if (xd->eobs[24] > 1){
+            second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16);
+            ((int *)qcoeff)[0] = 0;
+            ((int *)qcoeff)[1] = 0;
+            ((int *)qcoeff)[2] = 0;
+            ((int *)qcoeff)[3] = 0;
+            ((int *)qcoeff)[4] = 0;
+            ((int *)qcoeff)[5] = 0;
+            ((int *)qcoeff)[6] = 0;
+            ((int *)qcoeff)[7] = 0;
+        } else {
+            second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1);
+            ((int *)qcoeff)[0] = 0;
+        }
+
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS){
+            int y_off = xd->dst.y_buffer - xd->dst.buffer_alloc;
+            vp8_cl_block_prep(b, DQCOEFF|DIFF);
+
+            if (xd->eobs[24] > 1)
+            {
+                vp8_short_inv_walsh4x4_cl(b);
+            } else {
+                vp8_short_inv_walsh4x4_1_cl(b);
+            }
+            vp8_cl_block_finish(b, DIFF);
+
+            vp8_dequant_dc_idct_add_y_block_cl(&xd->block[0],
+                    xd->dst.buffer_alloc, xd->dst.buffer_mem, y_off, xd->dst.y_stride, xd->eobs,
+                    xd->block[24].diff_offset);
+        }
+        else
+#endif
+        {
+            second_order(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
+            DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+                            (xd->qcoeff, xd->block[0].dequant,
+                             xd->predictor, xd->dst.y_buffer,
+                             xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]);
+        }
+    }
+    else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
+    {
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS)
+            vp8_cl_mb_prep(xd, DST_BUF);
+#endif
+        for (i = 0; i < 16; i++)
+        {
+            BLOCKD *b = &xd->block[i];
+            short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+#if ENABLE_CL_IDCT_DEQUANT
+            VP8_CL_FINISH(b->cl_commands);
+#endif
+            vp8_predict_intra4x4(b, b->bmi.mode, b->predictor_base + b->predictor_offset);
+
+#if ENABLE_CL_IDCT_DEQUANT
+            if (cl_initialized == CL_SUCCESS){
+                size_t dst_size = (4*b->dst_stride + b->dst + 4);
+                cl_mem dst_mem = xd->dst.buffer_mem;
+
+                int dst_off = *(b->base_dst) - xd->dst.buffer_alloc;
+
+                if (xd->eobs[i] > 1)
+                {
+                    vp8_cl_block_prep(b, QCOEFF|DEQUANT|PREDICTOR);
+                    vp8_dequant_idct_add_cl(b, *(b->base_dst), dst_mem, dst_off+b->dst, dst_size, b->qcoeff_offset, b->predictor_offset, 16, b->dst_stride, DEQUANT_INVOKE(&pbi->dequant, idct_add));
+                    vp8_cl_block_finish(b, QCOEFF);
+                }
+                else
+                {
+                    vp8_cl_block_prep(b, PREDICTOR|DIFF|QCOEFF|DEQUANT);
+                    vp8_dc_only_idct_add_cl(b, CL_FALSE, 0, b->qcoeff_offset, b->predictor_offset,
+                            *(b->base_dst), dst_mem, dst_off+b->dst, dst_size, 16, b->dst_stride);
+                    VP8_CL_FINISH(b->cl_commands);
+                    ((int *)(b->qcoeff_base + b->qcoeff_offset))[0] = 0; //Move into follow-up kernel?
+                }
+                vp8_cl_mb_finish(xd,DST_BUF);
+            }
+            else
+#endif
+            {
+                if (xd->eobs[i] > 1)
+                {
+                    DEQUANT_INVOKE(&pbi->dequant, idct_add)
+                        (qcoeff, b->dequant, b->predictor_base + b->predictor_offset,
+                        *(b->base_dst) + b->dst, 16, b->dst_stride);
+                }
+                else
+                {
+                    IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+                        (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset,
+                        *(b->base_dst) + b->dst, 16, b->dst_stride);
+                    ((int *)qcoeff)[0] = 0;
+                }
+            }
+
+        }
+    }
+    else
+    {
+#if ENABLE_CL_IDCT_DEQUANT
+        if (cl_initialized == CL_SUCCESS){
+            vp8_cl_mb_prep(xd,DST_BUF);
+            vp8_dequant_idct_add_y_block_cl(pbi, xd);
+            vp8_cl_mb_finish(xd,DST_BUF);
+        }
+        else
+#endif
+        {
+            DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+                            (xd->qcoeff, xd->block[0].dequant,
+                             xd->predictor, xd->dst.y_buffer,
+                             xd->dst.y_stride, xd->eobs);
+        }
+    }
+
+#if ENABLE_CL_IDCT_DEQUANT
+    if (cl_initialized == CL_SUCCESS){
+        vp8_cl_mb_prep(xd,DST_BUF);
+        vp8_dequant_idct_add_uv_block_cl(pbi, xd, DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block));
+        vp8_cl_mb_finish(xd,DST_BUF);
+        VP8_CL_FINISH(xd->cl_commands);
+    } else
+#endif
+    {
+        DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+                        (xd->qcoeff+16*16, xd->block[16].dequant,
+                         xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+                         xd->dst.uv_stride, xd->eobs+16);
+    }
+}
+
+/* End-of-frame cleanup: wait for outstanding OpenCL work, then release the
+ * per-macroblock and per-block CL buffers. Released handles are reset to
+ * NULL so they cannot be released twice.
+ */
+void vp8_decode_frame_cl_finish(VP8D_COMP *pbi){
+
+    //If using OpenCL, free all of the GPU buffers we've allocated.
+    if (cl_initialized == CL_SUCCESS){
+#if ENABLE_CL_IDCT_DEQUANT
+        int i;
+#endif
+
+        //Wait for stuff to finish, just in case
+        clFinish(pbi->mb.cl_commands);
+
+#if !ONE_CQ_PER_MB
+        clFinish(pbi->mb.block[0].cl_commands);
+        clFinish(pbi->mb.block[16].cl_commands);
+        clFinish(pbi->mb.block[20].cl_commands);
+        clReleaseCommandQueue(pbi->mb.block[0].cl_commands);
+        clReleaseCommandQueue(pbi->mb.block[16].cl_commands);
+        clReleaseCommandQueue(pbi->mb.block[20].cl_commands);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL
+        //Free Predictor CL buffer
+        if (pbi->mb.cl_predictor_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_predictor_mem);
+        pbi->mb.cl_predictor_mem = NULL;
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+        //Free other CL Block/MBlock buffers
+        if (pbi->mb.cl_diff_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_diff_mem);
+        pbi->mb.cl_diff_mem = NULL;
+        if (pbi->mb.cl_qcoeff_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_qcoeff_mem);
+        pbi->mb.cl_qcoeff_mem = NULL;
+        if (pbi->mb.cl_dqcoeff_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_dqcoeff_mem);
+        pbi->mb.cl_dqcoeff_mem = NULL;
+        if (pbi->mb.cl_eobs_mem != NULL)
+            clReleaseMemObject(pbi->mb.cl_eobs_mem);
+        pbi->mb.cl_eobs_mem = NULL;
+
+        for (i = 0; i < 25; i++){
+            if (pbi->mb.block[i].cl_dequant_mem != NULL)
+                clReleaseMemObject(pbi->mb.block[i].cl_dequant_mem);
+            pbi->mb.block[i].cl_dequant_mem = NULL;
+        }
+#endif
+    }
+}
diff --git a/vp8/decoder/opencl/decodframe_cl.h b/vp8/decoder/opencl/decodframe_cl.h
new file mode 100644
index 000000000..3ed3dc5e1
--- /dev/null
+++ b/vp8/decoder/opencl/decodframe_cl.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODFRAME_CL_H
+#define VP8_DECODFRAME_CL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../onyxd_int.h"
+#include "vp8/common/blockd.h"
+
+//Implemented in decodframe_cl.c
+extern void mb_init_dequantizer_cl(MACROBLOCKD *xd);
+extern void vp8_decode_frame_cl_finish(VP8D_COMP *pbi);
+extern void vp8_decode_macroblock_cl(VP8D_COMP *pbi, MACROBLOCKD *xd, int eobtotal);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VP8_DECODFRAME_CL_H */
diff --git a/vp8/decoder/opencl/dequantize_cl.c b/vp8/decoder/opencl/dequantize_cl.c
new file mode 100644
index 000000000..13dcbeae2
--- /dev/null
+++ b/vp8/decoder/opencl/dequantize_cl.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+//ACW: Remove me after debugging.
+#include <stdio.h>
+#include <string.h>
+
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/idct_cl.h"
+#include "dequantize_cl.h"
+
+const char *dequantCompileOptions = "";
+const char *dequant_cl_file_name = "vp8/decoder/opencl/dequantize_cl.cl";
+
+//Host-side helper: stores c into n bytes' worth of shorts (n / sizeof(short)
+//elements) starting at s.
+void cl_memset_short(short *s, int c, size_t n) {
+    for (n /= sizeof(short); n > 0; --n)
+        *s++ = (short)c;
+}
+
+//Not implemented yet: intended as a device-side memset. Currently a no-op.
+void vp8_memset_short_cl(cl_mem mem, int offset, short val){
+    (void)mem;
+    (void)offset;
+    (void)val;
+}
+
+//Releases the dequant kernels and program. Always returns CL_SUCCESS.
+int cl_destroy_dequant(void){
+    printf("Freeing dequant decoder resources\n");
+
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequant_dc_idct_add_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequant_idct_add_kernel);
+    VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequantize_b_kernel);
+
+    if (cl_data.dequant_program)
+        clReleaseProgram(cl_data.dequant_program);
+    cl_data.dequant_program = NULL;
+
+    return CL_SUCCESS;
+}
+
+//Loads the dequant program and creates its three kernels.
+//Returns VP8_CL_TRIED_BUT_FAILED if the program cannot be loaded.
+int cl_init_dequant(void) {
+    int err; //presumably referenced by VP8_CL_CREATE_KERNEL - TODO confirm
+
+    // Create the compute program from the file-defined source code
+    if (cl_load_program(&cl_data.dequant_program, dequant_cl_file_name,
+            dequantCompileOptions) != CL_SUCCESS)
+        return VP8_CL_TRIED_BUT_FAILED;
+
+    // Create the compute kernels in the program we wish to run
+    VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequant_dc_idct_add_kernel,"vp8_dequant_dc_idct_add_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequant_idct_add_kernel,"vp8_dequant_idct_add_kernel");
+    VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequantize_b_kernel,"vp8_dequantize_b_kernel");
+
+    return CL_SUCCESS;
+}
+
+//Dequantizes the 16 coefficients of block d on the device
+//(dqcoeff = qcoeff * dequant). vp8_dequantize_b_c(d) is handed to
+//VP8_CL_CHECK_SUCCESS as the failure path.
+void vp8_dequantize_b_cl(BLOCKD *d)
+{
+    int err;
+    size_t global = 16;
+
+    /* Set kernel arguments */
+    err = clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 0, sizeof (cl_mem), &d->cl_dqcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 1, sizeof (cl_int), &d->dqcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 2, sizeof (cl_mem), &d->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 3, sizeof (cl_int), &d->qcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 4, sizeof (cl_mem), &d->cl_dequant_mem);
+    VP8_CL_CHECK_SUCCESS( d->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        vp8_dequantize_b_c(d),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( d->cl_commands, cl_data.vp8_dequantize_b_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( d->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);\
+        vp8_dequantize_b_c(d),
+    );
+
+}
+
+//Dequantize + IDCT + predictor add for one 4x4 block on the device.
+//When dest_mem is NULL a temporary buffer of dst_size bytes is created from
+//dest_base and read back (non-blocking) after the kernel; idct_add is handed
+//to VP8_CL_CHECK_SUCCESS as the failure path.
+void vp8_dequant_idct_add_cl(BLOCKD *b, unsigned char *dest_base, cl_mem dest_mem, int dest_offset, size_t dst_size, int q_offset, int pred_offset, int pitch, int stride, vp8_dequant_idct_add_fn_t idct_add)
+{
+    int err;
+    size_t global = 1;
+    int free_mem = 0;
+
+    if (dest_mem == NULL){
+        //Initialize destination memory
+        VP8_CL_CREATE_BUF(b->cl_commands, dest_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+            dst_size, dest_base,,
+        );
+        free_mem = 1;
+    }
+
+    /* Set kernel arguments */
+    err = clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 1, sizeof (int), &q_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 2, sizeof (cl_mem), &b->cl_dequant_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 3, sizeof (cl_mem), &b->cl_predictor_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 4, sizeof (int), &pred_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 5, sizeof (cl_mem), &dest_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 6, sizeof (int), &dest_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 7, sizeof (int), &pitch);
+    err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 8, sizeof (int), &stride);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",
+        idct_add(b->qcoeff_base+q_offset, b->dequant, b->predictor_base + pred_offset,
+            dest_base + dest_offset, pitch, stride),
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( b->cl_commands, cl_data.vp8_dequant_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);\
+        idct_add(b->qcoeff_base+q_offset, b->dequant, b->predictor_base + pred_offset,
+            dest_base + dest_offset, pitch, stride),
+    );
+
+    if (free_mem == 1){
+        /* Read back the result data from the device */
+        err = clEnqueueReadBuffer(b->cl_commands, dest_mem, CL_FALSE, 0, dst_size, dest_base, 0, NULL, NULL);
+        VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+            "Error: Failed to read output array!\n",
+            idct_add(b->qcoeff_base+q_offset, b->dequant, b->predictor_base + pred_offset,
+                dest_base + dest_offset, pitch, stride),
+        );
+
+        //CL Spec says this can be freed without clFinish first
+        clReleaseMemObject(dest_mem);
+    }
+
+    return;
+}
+
+//Can modify arguments. Only called from vp8_dequant_dc_idct_add_y_block_cl.
+//Like vp8_dequant_idct_add_cl, but the DC coefficient is read from the diff
+//buffer at Dc_offset, and the 4x4 destination block at dest_base + dest_off
+//is always staged through a temporary CL buffer.
+void vp8_dequant_dc_idct_add_cl(
+    BLOCKD *b,
+    int qcoeff_offset,
+    int pred_offset,
+    unsigned char *dest_base,
+    int dest_off,
+    int pitch,
+    int stride,
+    int Dc_offset)
+{
+    int err;
+    int dq_offset = 0;
+    unsigned char *dest = dest_base + dest_off;
+
+    cl_mem dest_mem = NULL;
+    size_t dest_size;
+    size_t global = 1;
+
+    //Initialize dest_mem. The kernel writes 4 rows of 4 bytes, `stride` apart,
+    //so the last byte touched is at offset 3*stride + 3.
+    dest_size = sizeof(cl_uchar)*(3*stride + 4);
+    VP8_CL_CREATE_BUF(b->cl_commands, dest_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+        dest_size, dest,,
+    );
+
+    //Assuming that all input cl_mem has been initialized outside of this Fn.
+
+    /* Set kernel arguments */
+    err = clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_qcoeff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 1, sizeof (int), &qcoeff_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 2, sizeof (cl_mem), &b->cl_dequant_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 3, sizeof(int), &dq_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 4, sizeof (cl_mem), &b->cl_predictor_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 5, sizeof (int), &pred_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 6, sizeof (cl_mem), &b->cl_diff_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 7, sizeof (int), &Dc_offset);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 8, sizeof (cl_mem), &dest_mem);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 9, sizeof (int), &pitch);
+    err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 10, sizeof (int), &stride);
+
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to set kernel arguments!\n",,
+    );
+
+    /* Execute the kernel */
+    err = clEnqueueNDRangeKernel( b->cl_commands, cl_data.vp8_dequant_dc_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to execute kernel!\n",
+        printf("err = %d\n",err);,
+    );
+
+    /* Read back the result data from the device */
+    err = clEnqueueReadBuffer(b->cl_commands, dest_mem, CL_FALSE, 0, dest_size, dest, 0, NULL, NULL);
+    VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+        "Error: Failed to read output array!\n",,
+    );
+
+    //CL Spec says this can be freed without clFinish first
+    clReleaseMemObject(dest_mem);
+    dest_mem = NULL;
+
+    return;
+
+}
diff --git a/vp8/decoder/opencl/dequantize_cl.cl b/vp8/decoder/opencl/dequantize_cl.cl
new file mode 100644
index 000000000..42ad74d53
--- /dev/null
+++ b/vp8/decoder/opencl/dequantize_cl.cl
@@ -0,0 +1,285 @@
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+__constant int cospi8sqrt2minus1 = 20091;
+__constant int sinpi8sqrt2 = 35468;
+__constant int rounding = 0;
+
+void vp8_short_idct4x4llm(__global short*, short*, int);
+void cl_memset_short(__global short*, int, size_t);
+
+#define USE_VECTORS 0
+
+//Dequantize 16 coefficients: DQ[i] = Q[i] * dequant[i].
+//In the scalar path each work-item with id < 16 handles one coefficient.
+__kernel void vp8_dequantize_b_kernel(
+    __global short *dqcoeff_base,
+    int dqcoeff_offset,
+    __global short *qcoeff_base,
+    int qcoeff_offset,
+    __global short *dequant
+)
+{
+    __global short *DQ = dqcoeff_base + dqcoeff_offset;
+    __global short *Q = qcoeff_base + qcoeff_offset;
+
+#if USE_VECTORS
+    vstore16(vload16(0,Q) * vload16(0,dequant), 0, DQ);
+#else
+    int tid = get_global_id(0);
+    if (tid < 16)
+    {
+        DQ[tid] = Q[tid] * dequant[tid];
+    }
+
+#endif
+}
+
+//Dequantize a 4x4 block in place, inverse-transform it, add the predictor and
+//store the result clamped to 0..255 into dest. The 16 input coefficients are
+//zeroed afterwards.
+__kernel void vp8_dequant_idct_add_kernel(
+    __global short *input_base,
+    int input_offset,
+    __global short *dq,
+    __global unsigned char *pred_base,
+    int pred_offset,
+    __global unsigned char *dest_base,
+    int dest_offset,
+    int pitch,
+    int stride
+)
+{
+    short output[16];
+    short *diff_ptr = output;
+    int r, c;
+    int i;
+    __global unsigned char *dest = dest_base + dest_offset;
+    __global short *input = input_base + input_offset;
+    __global unsigned char *pred = pred_base + pred_offset;
+
+#if USE_VECTORS
+    vstore16( (short16)vload16(0,dq) * (short16)vload16(0,input) , 0, input);
+#else
+    for (i = 0; i < 16; i++)
+    {
+        input[i] = dq[i] * input[i];
+    }
+#endif
+
+    /* the idct halves ( >> 1) the pitch */
+    vp8_short_idct4x4llm(input, output, 4 << 1);
+
+    //Note, remember to copy back the input buffer (qcoeff) to system memory.
+    cl_memset_short(input, 0, 32);
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = diff_ptr[c] + pred[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dest[c] = (unsigned char) a;
+        }
+
+        dest += stride;
+        diff_ptr += 4;
+        pred += pitch;
+    }
+}
+
+
+//Same as vp8_dequant_idct_add_kernel, except that coefficient 0 (the DC term)
+//is taken unscaled from diff_base[diff_offset] instead of being dequantized,
+//and dest has no separate offset argument.
+__kernel void vp8_dequant_dc_idct_add_kernel(
+    __global short *qcoeff_base,
+    int qcoeff_offset,
+
+    __global short *dequant_base,
+    int dequant_offset,
+
+    __global unsigned char *pred_base,
+    int pred_offset,
+
+    __global short *diff_base,
+    int diff_offset,
+
+    __global unsigned char *dest,
+
+    int pitch,
+    int stride
+)
+{
+    int i;
+    short output[16];
+    short *diff_ptr = output;
+    int r, c;
+
+    global short *input = &qcoeff_base[qcoeff_offset];
+    global short *dq = &dequant_base[dequant_offset];
+    global unsigned char *pred = pred_base + pred_offset;
+
+#if USE_VECTORS
+    vstore16( (short16)vload16(0,dq) * (short16)vload16(0,input) , 0, input);
+#else
+    for (i = 1; i < 16; i++)
+    {
+        input[i] = dq[i] * input[i];
+    }
+#endif
+
+    //The DC term is stored after the multiply so that the vector path cannot
+    //rescale it by dq[0] (the scalar loop already skips index 0).
+    //A modified input buffer... copy back to System memory when done!
+    input[0] = diff_base[diff_offset];
+
+    /* the idct halves ( >> 1) the pitch */
+    vp8_short_idct4x4llm(input, output, 4 << 1);
+
+    cl_memset_short(input, 0, 32);
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int a = diff_ptr[c] + pred[c];
+
+            if (a < 0)
+                a = 0;
+
+            if (a > 255)
+                a = 255;
+
+            dest[c] = (unsigned char) a;
+        }
+
+        dest += stride;
+        diff_ptr += 4;
+        pred += pitch;
+    }
+}
+
+
+
+
+//Note that this kernel has been copied from common/opencl/idctllm_cl.cl
+void vp8_short_idct4x4llm(
+    __global short *input,
+    short *output,
+    int pitch
+)
+{
+    int i;
+    int a1, b1, c1, d1;
+
+    __global short *ip = input;
+    short *op = output;
+    int temp1, temp2;
+    int shortpitch = pitch >> 1;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[8];
+        b1 = ip[0] - ip[8];
+
+        temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+        temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+        temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+        d1 = temp1 + temp2;
+
+        op[shortpitch*0] = a1 + d1;
+        op[shortpitch*3] = a1 - d1;
+
+        op[shortpitch*1] = b1 + c1;
+        op[shortpitch*2] = b1 - c1;
+
+        ip++;
+        op++;
+    }
+
+    op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = op[0] + op[2];
+        b1 = op[0] - op[2];
+
+        temp1 = (op[1] * sinpi8sqrt2 + rounding) >> 16;
+        temp2 = op[3] + ((op[3] * cospi8sqrt2minus1 + rounding) >> 16);
+        c1 = temp1 - temp2;
+
+        temp1 = op[1] + ((op[1] * cospi8sqrt2minus1 + rounding) >> 16);
+        temp2 = (op[3] * sinpi8sqrt2 + rounding) >> 16;
+        d1 = temp1 + temp2;
+
+
+        op[0] = (a1 + d1 + 4) >> 3;
+        op[3] = (a1 - d1 + 4) >> 3;
+
+        op[1] = (b1 + c1 + 4) >> 3;
+        op[2] = (b1 - c1 + 4) >> 3;
+
+        op += shortpitch;
+    }
+
+}
+
+//DC-only reconstruction helper: adds (input_dc + 4) >> 3 to one predictor
+//pixel per work-item (ids 0..15 map to a 4x4 block) and clamps to 0..255.
+void vp8_dc_only_idct_add_kernel(
+    short input_dc,
+    __global unsigned char *pred_ptr,
+    __global unsigned char *dst_ptr,
+    int pitch,
+    int stride
+)
+{
+    int a1 = ((input_dc + 4) >> 3);
+    int r, c;
+    int pred_offset,dst_offset;
+
+    int tid = get_global_id(0);
+    if (tid < 16){
+        r = tid / 4;
+        c = tid % 4;
+
+        pred_offset = r * pitch;
+        dst_offset = r * stride;
+        int a = a1 + pred_ptr[pred_offset + c] ;
+
+        if (a < 0)
+            a = 0;
+        else if (a > 255)
+            a = 255;
+
+        dst_ptr[dst_offset + c] = (unsigned char) a ;
+    }
+}
+
+//Store c into n bytes' worth of shorts (n/2 elements) starting at s.
+void cl_memset_short(__global short *s, int c, size_t n) {
+    size_t i;
+    for (i = 0; i < n/2; i++)
+        *s++ = c;
+}
diff --git a/vp8/decoder/opencl/dequantize_cl.h b/vp8/decoder/opencl/dequantize_cl.h
new file mode 100644
index 000000000..0605ebff9
--- /dev/null
+++ b/vp8/decoder/opencl/dequantize_cl.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */ + + +#ifndef DEQUANTIZE_CL_H +#define DEQUANTIZE_CL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "vp8/decoder/onyxd_int.h" +#include "vp8/decoder/dequantize.h" +#include "vp8/common/opencl/vp8_opencl.h" + +#define prototype_dequant_block_cl(sym) \ + void sym(BLOCKD *x) + +#define prototype_dequant_idct_add_cl(sym) \ + void sym(BLOCKD *b, unsigned char *dest_base,cl_mem dest_mem, int dest_offset, size_t dest_size, int q_offset, \ + int pred_offset, int pitch, int stride, \ + vp8_dequant_idct_add_fn_t idct_add) + +#define prototype_dequant_dc_idct_add_cl(sym) \ + void sym(BLOCKD* b, int qcoeff_offset, \ + int pred_offset, unsigned char *dest_base, int dst_offset, \ + int pitch, int stride, \ + int dc) + +#define prototype_dequant_dc_idct_add_y_block_cl(sym) \ + void sym(BLOCKD *b, \ + unsigned char *dst_base, cl_mem dst_mem, int dst_off,\ + int stride, char *eobs, int dc_offset) + +#define prototype_dequant_idct_add_y_block_cl(sym) \ + void sym(VP8D_COMP *pbi, MACROBLOCKD *xd) + +#define prototype_dequant_idct_add_uv_block_cl(sym) \ + void sym(VP8D_COMP *pbi, MACROBLOCKD *xd, \ + vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block) + + + +extern prototype_dequant_block_cl(vp8_dequantize_b_cl); + +//CL functions +extern prototype_dequant_idct_add_cl(vp8_dequant_idct_add_cl); + +//C functions +extern prototype_dequant_dc_idct_add_cl(vp8_dequant_dc_idct_add_cl); + + +//Might be CL... check implementation. 
+extern prototype_dequant_dc_idct_add_y_block_cl(vp8_dequant_dc_idct_add_y_block_cl);
+extern prototype_dequant_idct_add_y_block_cl(vp8_dequant_idct_add_y_block_cl);
+extern prototype_dequant_idct_add_uv_block_cl(vp8_dequant_idct_add_uv_block_cl);
+
+
+
+extern const char *dequantCompileOptions;
+extern const char *dequant_cl_file_name;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/vp8/decoder/opencl/idct_blk_cl.c b/vp8/decoder/opencl/idct_blk_cl.c
new file mode 100644
index 000000000..c1774e5a3
--- /dev/null
+++ b/vp8/decoder/opencl/idct_blk_cl.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/decoder/onyxd_int.h"
+#include "vpx_ports/config.h"
+#include "../../common/idct.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "dequantize_cl.h"
+
+//change q/dq/pre/eobs/dc to offsets
+//Dequant + IDCT + add for the 16 luma blocks of a macroblock whose DC terms
+//come from the second-order (Y2) block. dst_mem is currently unused: each
+//4x4 block is staged through a temporary buffer created from dst.
+void vp8_dequant_dc_idct_add_y_block_cl(
+    BLOCKD *b,
+    unsigned char *dst_base, //xd->dst.buffer_alloc
+    cl_mem dst_mem,
+    int dst_off,
+    int stride, //xd->dst.y_stride
+    char *eobs, //xd->eobs
+    int dc_offset //xd->block[24].diff_offset
+)
+{
+    int i, j;
+    int q_offset = 0;
+    int pre_offset = 0;
+    int dst_offset = 0;
+    unsigned char *dst = dst_base+dst_off;
+    size_t dst_size = 16*(stride+1);
+
+    vp8_cl_block_prep(b, QCOEFF|DEQUANT|DIFF|PREDICTOR);
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_dequant_dc_idct_add_cl (b, q_offset, pre_offset, dst, dst_offset, 16, stride, dc_offset);
+            }
+            else{
+                vp8_dc_only_idct_add_cl(b, CL_TRUE, dc_offset, 0, pre_offset, dst, NULL, dst_offset, dst_size, 16, stride);
+            }
+
+            q_offset += 16;
+            pre_offset += 4;
+            dst_offset += 4;
+            dc_offset++;
+        }
+
+        pre_offset += 64 - 16;
+        dst_offset += 4*stride - 16;
+    }
+
+    vp8_cl_block_finish(b, QCOEFF);
+
+}
+
+//Dequant + IDCT + add for the 16 luma blocks of a macroblock without a Y2
+//block, writing into xd->dst.buffer_mem. Blocks with eob <= 1 take the
+//DC-only path and have their first qcoeff pair cleared on the host.
+void vp8_dequant_idct_add_y_block_cl (VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+    int i, j;
+
+    short *q = xd->qcoeff;
+    int q_offset = 0;
+    int pre_offset = 0;
+    cl_mem dst_mem = xd->dst.buffer_mem;
+    unsigned char *dst = xd->dst.buffer_alloc;
+    int dst_offset = xd->dst.y_buffer - dst;
+    int stride = xd->dst.y_stride;
+    char *eobs = xd->eobs;
+    int dst_size = 16 * (stride + 1);
+
+
+    vp8_cl_mb_prep(xd,PREDICTOR|DIFF|QCOEFF);
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_cl_block_prep(&xd->block[0], DEQUANT);
+                vp8_dequant_idct_add_cl(&xd->block[0], dst, dst_mem, dst_offset, dst_size+dst_offset, q_offset, pre_offset, 16, stride, pbi->dequant.idct_add);
+                vp8_cl_block_finish(&xd->block[0], QCOEFF);
+            }
+            else
+            {
+                vp8_cl_block_prep(&xd->block[0], DEQUANT);
+                vp8_dc_only_idct_add_cl(&xd->block[0], CL_FALSE, 0, q_offset, pre_offset, dst, dst_mem, dst_offset, dst_size+dst_offset, 16, stride);
+                VP8_CL_FINISH(xd->cl_commands);
+                ((int *)(q+q_offset))[0] = 0;
+                vp8_cl_mb_prep(xd,QCOEFF);
+            }
+
+            q_offset += 16;
+            pre_offset += 4;
+            dst_offset += 4;
+        }
+
+        pre_offset += 64 - 16;
+        dst_offset += 4*stride - 16;
+    }
+
+}
+
+//Dequant + IDCT + add for the four U and then the four V blocks of a
+//macroblock. Block 16 supplies the chroma dequant buffer for all eight
+//blocks; qcoeff/predictor offsets run on from U into V.
+//idct_add_uv_block is not used by this implementation.
+void vp8_dequant_idct_add_uv_block_cl(VP8D_COMP *pbi, MACROBLOCKD *xd,
+        vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block
+)
+{
+    int i, j;
+
+    int block_num = 16;
+    BLOCKD b = xd->block[block_num];
+
+    short *q = xd->qcoeff;
+
+    cl_mem dst_mem = xd->dst.buffer_mem;
+    unsigned char *dst = xd->dst.buffer_alloc;
+    int u_off = xd->dst.u_buffer - dst;
+    int v_off = xd->dst.v_buffer - dst;
+
+    int stride = xd->dst.uv_stride;
+    size_t dst_size = 8*(stride+1);
+    char *eobs = xd->eobs+16;
+
+    int pre_offset = block_num*16;
+    int q_offset = block_num*16;
+    int dst_offset = 0;
+
+    vp8_cl_mb_prep(xd, DIFF|QCOEFF|PREDICTOR);
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_cl_block_prep(&xd->block[block_num], DEQUANT);
+                vp8_dequant_idct_add_cl(&b, dst, dst_mem, u_off+dst_offset, u_off+dst_size, q_offset, pre_offset, 8, stride, DEQUANT_INVOKE (&pbi->dequant, idct_add));
+            }
+            else
+            {
+                vp8_cl_block_prep(&xd->block[block_num], DEQUANT);
+                vp8_dc_only_idct_add_cl (&b, CL_FALSE, 0, q_offset, pre_offset, dst, dst_mem, u_off+dst_offset, u_off+dst_size, 8, stride);
+
+                //Need round trip + finish until qcoeff set in CL
+                vp8_cl_mb_finish(xd,QCOEFF);
+                VP8_CL_FINISH(xd->cl_commands);
+                ((int *)(q+q_offset))[0] = 0;
+                vp8_cl_mb_prep(xd,QCOEFF);
+            }
+
+            q_offset += 16;
+            pre_offset += 4;
+            dst_offset += 4;
+        }
+
+        pre_offset += 32 - 8;
+        dst_offset += 4*stride - 8;
+    }
+
+    //Swap dstu out of cl_mem and dstv into it
+
+    dst_offset = 0;
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            if (*eobs++ > 1){
+                vp8_cl_block_prep(&b, DEQUANT);
+                vp8_dequant_idct_add_cl (&b, dst, dst_mem, v_off+dst_offset, v_off+dst_size, q_offset,
+                        pre_offset, 8, stride, DEQUANT_INVOKE (&pbi->dequant, idct_add));
+            }
+            else
+            {
+                vp8_cl_block_prep(&b, DEQUANT);
+                vp8_dc_only_idct_add_cl (&b, CL_FALSE, 0, q_offset, pre_offset,
+                        dst, dst_mem, v_off+dst_offset, v_off+dst_size, 8, stride);
+
+                //Eventually replace with memset kernel call to prevent round trip
+                vp8_cl_mb_finish(xd,QCOEFF);
+                VP8_CL_FINISH(xd->cl_commands);
+                ((int *)(q+q_offset))[0] = 0;
+                vp8_cl_mb_prep(xd,QCOEFF);
+            }
+
+            q_offset += 16;
+            pre_offset += 4;
+            dst_offset += 4;
+        }
+
+        pre_offset += 32 - 8;
+        dst_offset += 4*stride - 8;
+    }
+
+    vp8_cl_mb_finish(xd,QCOEFF);
+
+}
diff --git a/vp8/decoder/opencl/opencl_systemdependent.c b/vp8/decoder/opencl/opencl_systemdependent.c
new file mode 100644
index 000000000..023b80614
--- /dev/null
+++ b/vp8/decoder/opencl/opencl_systemdependent.c
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_ports/config.h" +#include "vp8/decoder/onyxd_int.h" + +#include "vp8/common/opencl/vp8_opencl.h" +#include "vp8_decode_cl.h" + +void vp8_arch_opencl_decode_init(VP8D_COMP *pbi) +{ + + if (cl_initialized == CL_SUCCESS){ + cl_decode_init(); + } + +} diff --git a/vp8/decoder/opencl/vp8_decode_cl.c b/vp8/decoder/opencl/vp8_decode_cl.c new file mode 100644 index 000000000..4dab1fce3 --- /dev/null +++ b/vp8/decoder/opencl/vp8_decode_cl.c @@ -0,0 +1,38 @@ +#include "vpx_ports/config.h" + +#include "../../common/opencl/vp8_opencl.h" +#include "vp8_decode_cl.h" + +#include + +extern int cl_init_dequant(); +extern int cl_destroy_dequant(); + +int cl_decode_destroy(){ /* Calls cl_destroy_dequant() when ENABLE_CL_IDCT_DEQUANT is set; returns its error code on failure, otherwise CL_SUCCESS. */ + +#if ENABLE_CL_IDCT_DEQUANT + int err = cl_destroy_dequant(); + if (err != CL_SUCCESS) return err; /* report the teardown failure instead of discarding it */ +#endif + + return CL_SUCCESS; +} + +int cl_decode_init() +{ +#if ENABLE_CL_IDCT_DEQUANT + int err; +#endif + + //Initialize programs to null value + //Enables detection of if they've been initialized as well. + cl_data.dequant_program = NULL; + +#if ENABLE_CL_IDCT_DEQUANT + err = cl_init_dequant(); + if (err != CL_SUCCESS) + return err; +#endif + + return CL_SUCCESS; +} diff --git a/vp8/decoder/opencl/vp8_decode_cl.h b/vp8/decoder/opencl/vp8_decode_cl.h new file mode 100644 index 000000000..6839051d0 --- /dev/null +++ b/vp8/decoder/opencl/vp8_decode_cl.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_OPENCL_DECODE_H +#define VP8_OPENCL_DECODE_H + +#ifdef __cplusplus +extern "C" { +#endif + +int cl_decode_init(void); /* returns CL_SUCCESS, or the error code reported by cl_init_dequant() */ + +#ifdef __cplusplus +} +#endif + +#endif /* VP8_OPENCL_DECODE_H */ diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 3d9d428ef..6ae7926ac 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -151,51 +151,53 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV) { BLOCKD *b = &xd->block[24]; + short *qcoeff = b->qcoeff_base + b->qcoeff_offset; DEQUANT_INVOKE(&pbi->dequant, block)(b); /* do 2nd order transform on the dc block */ if (xd->eobs[24] > 1) { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; - ((int *)b->qcoeff)[1] = 0; - ((int *)b->qcoeff)[2] = 0; - ((int *)b->qcoeff)[3] = 0; - ((int *)b->qcoeff)[4] = 0; - ((int *)b->qcoeff)[5] = 0; - ((int *)b->qcoeff)[6] = 0; - ((int *)b->qcoeff)[7] = 0; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]); + ((int *)qcoeff)[0] = 0; + ((int *)qcoeff)[1] = 0; + ((int *)qcoeff)[2] = 0; + ((int *)qcoeff)[3] = 0; + ((int *)qcoeff)[4] = 0; + ((int *)qcoeff)[5] = 0; + ((int *)qcoeff)[6] = 0; + ((int *)qcoeff)[7] = 0; } else { - IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff); - ((int *)b->qcoeff)[0] = 0; + IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]); + ((int *)qcoeff)[0] = 0; } DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block) (xd->qcoeff, xd->block[0].dequant, xd->predictor, xd->dst.y_buffer, - xd->dst.y_stride, xd->eobs, xd->block[24].diff); + xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]); } else if ((xd->frame_type ==
KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED) { for (i = 0; i < 16; i++) { BLOCKD *b = &xd->block[i]; - vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor, mb_row, mb_col, i); + short *qcoeff = b->qcoeff_base + b->qcoeff_offset; + vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor_base + b->predictor_offset, mb_row, mb_col, i); if (xd->eobs[i] > 1) { DEQUANT_INVOKE(&pbi->dequant, idct_add) - (b->qcoeff, b->dequant, b->predictor, + (qcoeff, b->dequant, b->predictor_base + b->predictor_offset, *(b->base_dst) + b->dst, 16, b->dst_stride); } else { IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add) - (b->qcoeff[0] * b->dequant[0], b->predictor, + (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset, *(b->base_dst) + b->dst, 16, b->dst_stride); - ((int *)b->qcoeff)[0] = 0; + ((int *)qcoeff)[0] = 0; } } } diff --git a/vp8/decoder/x86/x86_dsystemdependent.c b/vp8/decoder/x86/x86_dsystemdependent.c index 5c1684230..c656c6df8 100644 --- a/vp8/decoder/x86/x86_dsystemdependent.c +++ b/vp8/decoder/x86/x86_dsystemdependent.c @@ -19,8 +19,8 @@ void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q); static void dequantize_b_mmx(BLOCKD *d) { - short *sq = (short *) d->qcoeff; - short *dq = (short *) d->dqcoeff; + short *sq = (short *) d->qcoeff_base + d->qcoeff_offset; + short *dq = (short *) d->dqcoeff_base + d->dqcoeff_offset; short *q = (short *) d->dequant; vp8_dequantize_b_impl_mmx(sq, dq, q); } diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c index 0e3334ac7..c81c8e947 100644 --- a/vp8/encoder/arm/quantize_arm.c +++ b/vp8/encoder/arm/quantize_arm.c @@ -28,7 +28,7 @@ extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, shor void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { - d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, 
b->quant_fast); + d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff_base + d->qcoeff_offset, d->dqcoeff_base + d->dqcoeff_offset, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast); } /* @@ -42,8 +42,8 @@ void vp8_fast_quantize_b_neon(BLOCK *b,BLOCKD *d) short *zbin_ptr = &b->Zbin[0][0]; short *round_ptr = &b->Round[0][0]; short *quant_ptr = &b->Quant[0][0]; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr= d->dqcoeff; + short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + short *dqcoeff_ptr= d->dqcoeff_base + d->dqcoeff_offset; short *dequant_ptr= &d->Dequant[0][0]; eob = 0; diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c index 9c81c8d0a..817a79314 100644 --- a/vp8/encoder/asm_enc_offsets.c +++ b/vp8/encoder/asm_enc_offsets.c @@ -43,9 +43,11 @@ DEFINE(vp8_block_zbin_extra, offsetof(BLOCK, zbin_extra)); DEFINE(vp8_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost)); DEFINE(vp8_block_quant_shift, offsetof(BLOCK, quant_shift)); -DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff)); +DEFINE(vp8_blockd_qcoeff_base, offsetof(BLOCKD, qcoeff_base)); +DEFINE(vp8_blockd_qcoeff_offset, offsetof(BLOCKD, qcoeff_offset)); DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant)); -DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff)); +DEFINE(vp8_blockd_dqcoeff_base, offsetof(BLOCKD, dqcoeff_base)); +DEFINE(vp8_blockd_dqcoeff_offset, offsetof(BLOCKD, dqcoeff_offset)); DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob)); // subtract @@ -54,7 +56,8 @@ DEFINE(vp8_block_src, offsetof(BLOCK, src)); DEFINE(vp8_block_src_diff, offsetof(BLOCK, src_diff)); DEFINE(vp8_block_src_stride, offsetof(BLOCK, src_stride)); -DEFINE(vp8_blockd_predictor, offsetof(BLOCKD, predictor)); +DEFINE(vp8_blockd_predictor_base, offsetof(BLOCKD, predictor_base)); +DEFINE(vp8_blockd_predictor_offset, offsetof(BLOCKD, predictor_offset)); //pack tokens DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue)); diff --git 
a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 7d835a1cc..f0d242475 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -15,7 +15,7 @@ #include "vp8/common/reconintra.h" #include "vp8/common/reconintra4x4.h" #include "encodemb.h" -#include "vp8/common/invtrans.h" +#include "invtrans.h" #include "vp8/common/recon.h" #include "dct.h" #include "vp8/common/g_common.h" @@ -30,9 +30,10 @@ #else #define IF_RTCD(x) NULL #endif + void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode) { - vp8_predict_intra4x4(b, best_mode, b->predictor); + vp8_predict_intra4x4(b, best_mode, b->predictor_base + b->predictor_offset); ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); @@ -42,7 +43,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32); - RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride); } void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb) diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 798aa683a..98bfd8278 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -14,7 +14,7 @@ #include "vp8/common/reconinter.h" #include "quantize.h" #include "tokenize.h" -#include "vp8/common/invtrans.h" +#include "invtrans.h" #include "vp8/common/recon.h" #include "vp8/common/reconintra.h" #include "dct.h" @@ -30,7 +30,7 @@ void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) { unsigned char *src_ptr = (*(be->base_src) + be->src); short *diff_ptr = be->src_diff; - unsigned char *pred_ptr = bd->predictor; + unsigned char *pred_ptr = bd->predictor_base + bd->predictor_offset; int src_stride = be->src_stride; int r, c; @@ -203,7 +203,7 @@ void 
vp8_stuff_inter16x16(MACROBLOCK *x) // recon = copy from predictors to destination { BLOCKD *b = &x->e_mbd.block[0]; - unsigned char *pred_ptr = b->predictor; + unsigned char *pred_ptr = b->predictor_base + b->predictor_offset; unsigned char *dst_ptr = *(b->base_dst) + b->dst; int stride = b->dst_stride; @@ -212,7 +212,7 @@ void vp8_stuff_inter16x16(MACROBLOCK *x) vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16); b = &x->e_mbd.block[16]; - pred_ptr = b->predictor; + pred_ptr = b->predictor_base + b->predictor_offset; dst_ptr = *(b->base_dst) + b->dst; stride = b->dst_stride; @@ -220,7 +220,7 @@ void vp8_stuff_inter16x16(MACROBLOCK *x) vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8); b = &x->e_mbd.block[20]; - pred_ptr = b->predictor; + pred_ptr = b->predictor_base + b->predictor_offset; dst_ptr = *(b->base_dst) + b->dst; stride = b->dst_stride; @@ -302,8 +302,8 @@ static void optimize_b(MACROBLOCK *mb, int ib, int type, dequant_ptr = d->dequant; coeff_ptr = b->coeff; - qcoeff_ptr = d->qcoeff; - dqcoeff_ptr = d->dqcoeff; + qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; /* pair dqcoeff_base with its own offset field, not qcoeff_offset */ i0 = !type; eob = d->eob; diff --git a/vp8/common/invtrans.c b/vp8/encoder/invtrans.c similarity index 71% rename from vp8/common/invtrans.c rename to vp8/encoder/invtrans.c index 81a3f2d89..7d6045bf0 100644 --- a/vp8/common/invtrans.c +++ b/vp8/encoder/invtrans.c @@ -11,8 +11,6 @@ #include "invtrans.h" - - static void recon_dcblock(MACROBLOCKD *x) { BLOCKD *b = &x->block[24]; @@ -20,7 +18,7 @@ static void recon_dcblock(MACROBLOCKD *x) for (i = 0; i < 16; i++) { - x->block[i].dqcoeff[0] = b->diff[i]; + *(x->block[i].dqcoeff_base+x->block[i].dqcoeff_offset) = b->diff_base[b->diff_offset+i]; } } @@ -28,18 +26,18 @@ static void recon_dcblock(MACROBLOCKD *x) void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch) { if (b->eob > 1) - IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch); + IDCT_INVOKE(rtcd, idct16)(b->dqcoeff_base
+ b->dqcoeff_offset, &b->diff_base[b->diff_offset], pitch); else - IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch); + IDCT_INVOKE(rtcd, idct1)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset], pitch); } - +/* Only used in the encoder */ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) { int i; /* do 2nd order transform on the dc block */ - IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff); + IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff_base + x->block[24].dqcoeff_offset, &x->block[24].diff_base[x->block[24].diff_offset]); /* offset must come from block 24, the same block as the base pointer */ recon_dcblock(x); @@ -49,6 +47,8 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD * } } + +/* Only used in encoder */ void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x) { int i; @@ -57,7 +57,6 @@ void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD { vp8_inverse_transform_b(rtcd, &x->block[i], 16); } - } @@ -69,8 +68,10 @@ void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x x->mode_info_context->mbmi.mode != SPLITMV) { /* do 2nd order transform on the dc block */ + BLOCKD b = x->block[24]; + + IDCT_INVOKE(rtcd, iwalsh16)(b.dqcoeff_base+b.dqcoeff_offset, &b.diff_base[b.diff_offset]); - IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff); recon_dcblock(x); } diff --git a/vp8/common/invtrans.h b/vp8/encoder/invtrans.h similarity index 93% rename from vp8/common/invtrans.h rename to vp8/encoder/invtrans.h index b3ffb7073..a88564b58 100644 --- a/vp8/common/invtrans.h +++ b/vp8/encoder/invtrans.h @@ -13,8 +13,8 @@ #define __INC_INVTRANS_H #include "vpx_ports/config.h" -#include "idct.h" -#include "blockd.h" +#include "vp8/common/idct.h" +#include "vp8/common/blockd.h" extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch); extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd,
MACROBLOCKD *x); extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 92e0cbb19..0726846a2 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1686,11 +1686,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; - if (!cm->use_bilinear_mc_filter) - cm->mcomp_filter_type = SIXTAP; - else - cm->mcomp_filter_type = BILINEAR; - cpi->target_bandwidth = cpi->oxcf.target_bandwidth; cm->Width = cpi->oxcf.Width ; diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 111cd74ba..6d61b6f5f 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -155,7 +155,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b, const vp8_variance_rtcd_vt unsigned char *sptr; unsigned char *dptr; sptr = (*(be->base_src) + be->src); - dptr = b->predictor; + dptr = b->predictor_base + b->predictor_offset; return VARIANCE_INVOKE(rtcd, get4x4sse_cs)(sptr, be->src_stride, dptr, 16, 0x7fffffff); @@ -193,7 +193,7 @@ static int pick_intra4x4block( int this_rd; rate = mode_costs[mode]; - vp8_predict_intra4x4(b, mode, b->predictor); + vp8_predict_intra4x4(b, mode, b->predictor_base + b->predictor_offset); distortion = get_prediction_error(be, b, &rtcd->variance); this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index 86ed267fb..5ea036085 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -28,8 +28,8 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) short *round_ptr = b->round; short *quant_ptr = b->quant_fast; unsigned char *quant_shift_ptr = b->quant_shift; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; + short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; short *dequant_ptr = d->dequant; vpx_memset(qcoeff_ptr, 0, 32); @@ -73,8 
+73,8 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) short *coeff_ptr = b->coeff; short *round_ptr = b->round; short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; + short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; short *dequant_ptr = d->dequant; eob = -1; @@ -113,8 +113,8 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) short *round_ptr = b->round; short *quant_ptr = b->quant; unsigned char *quant_shift_ptr = b->quant_shift; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; + short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; short *dequant_ptr = d->dequant; short zbin_oq_value = b->zbin_extra; @@ -174,8 +174,8 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d) coeff_ptr = b->coeff; quant_ptr = b->quant; quant_shift_ptr = b->quant_shift; - qcoeff_ptr = d->qcoeff; - dqcoeff_ptr = d->dqcoeff; + qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; dequant_ptr = d->dequant; eob = - 1; vpx_memset(qcoeff_ptr, 0, 32); @@ -224,8 +224,8 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) short *zbin_ptr = b->zbin; short *round_ptr = b->round; short *quant_ptr = b->quant; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; + short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; short *dequant_ptr = d->dequant; short zbin_oq_value = b->zbin_extra; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index c97feede2..e1e76df99 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -300,7 +300,7 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue) void vp8_auto_select_speed(VP8_COMP *cpi) { - int used = cpi->oxcf.cpu_used; + //int used = cpi->oxcf.cpu_used; int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate); @@ -405,7 
+405,8 @@ int vp8_mbblock_error_c(MACROBLOCK *mb, int dc) for (j = dc; j < 16; j++) { - int this_diff = be->coeff[j] - bd->dqcoeff[j]; + short *dqcoeff = bd->dqcoeff_base + bd->dqcoeff_offset; + int this_diff = be->coeff[j] - dqcoeff[j]; berror += this_diff * this_diff; } @@ -427,10 +428,13 @@ int vp8_mbuverror_c(MACROBLOCK *mb) for (i = 16; i < 24; i++) { + short *dqcoeff; + be = &mb->block[i]; bd = &mb->e_mbd.block[i]; - error += vp8_block_error_c(be->coeff, bd->dqcoeff); + dqcoeff = bd->dqcoeff_base + bd->dqcoeff_offset; + error += vp8_block_error_c(be->coeff, dqcoeff); } return error; @@ -481,7 +485,7 @@ static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, int eob = b->eob; int pt ; /* surrounding block/prev coef predictor */ int cost = 0; - short *qcoeff_ptr = b->qcoeff; + short *qcoeff_ptr = b->qcoeff_base + b->qcoeff_offset; VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); @@ -570,7 +574,7 @@ static void macro_block_yrd( MACROBLOCK *mb, // Distortion d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 1) << 2; - d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff); + d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff_base + x_y2->dqcoeff_offset); *Distortion = (d >> 4); @@ -623,7 +627,7 @@ static int rd_pick_intra4x4block( rate = bmode_costs[mode]; - vp8_predict_intra4x4(b, mode, b->predictor); + vp8_predict_intra4x4(b, mode, b->predictor_base + b->predictor_offset); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16); x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b(be, b); @@ -633,7 +637,7 @@ static int rd_pick_intra4x4block( ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ); rate += ratey; - distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff) >> 2; + distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff_base + b->dqcoeff_offset) >> 2; this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); @@ -646,15 +650,15 @@ static int
rd_pick_intra4x4block( *best_mode = mode; *a = tempa; *l = templ; - copy_predictor(best_predictor, b->predictor); - vpx_memcpy(best_dqcoeff, b->dqcoeff, 32); + copy_predictor(best_predictor, b->predictor_base + b->predictor_offset); + vpx_memcpy(best_dqcoeff, b->dqcoeff_base + b->dqcoeff_offset, 32); } } b->bmi.mode = (B_PREDICTION_MODE)(*best_mode); - IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32); - RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride); + IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff_base + b->diff_offset, 32); + RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff_base + b->diff_offset, *(b->base_dst) + b->dst, b->dst_stride); return best_rd; } @@ -984,7 +988,7 @@ static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels //be->coeff[0] = 0; x->quantize_b(be, bd); - distortion += ENCODEMB_INVOKE(rtcd, berr)(be->coeff, bd->dqcoeff); + distortion += ENCODEMB_INVOKE(rtcd, berr)(be->coeff, bd->dqcoeff_base + bd->dqcoeff_offset); } } diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c index e3f423f8a..0eacc060f 100644 --- a/vp8/encoder/tokenize.c +++ b/vp8/encoder/tokenize.c @@ -109,7 +109,7 @@ static void tokenize2nd_order_b const int eob = b->eob; /* one beyond last nonzero coeff */ TOKENEXTRA *t = *tp; /* store tokens starting here */ int x; - const short *qcoeff_ptr = b->qcoeff; + const short *qcoeff_ptr = b->qcoeff_base + b->qcoeff_offset; VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); do @@ -160,7 +160,7 @@ static void tokenize1st_order_b const int eob = b->eob; /* one beyond last nonzero coeff */ TOKENEXTRA *t = *tp; /* store tokens starting here */ int x; - const short *qcoeff_ptr = b->qcoeff; + const short *qcoeff_ptr = b->qcoeff_base + b->qcoeff_offset; VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l); do @@ -224,8 +224,8 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD 
*x, TOKENEXTRA **t) int plane_type; int b; - TOKENEXTRA *start = *t; - TOKENEXTRA *tp = *t; + //TOKENEXTRA *start = *t; + //TOKENEXTRA *tp = *t; x->mode_info_context->mbmi.dc_diff = 1; diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 7b7ae706a..0debbc5d4 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -171,7 +171,7 @@ ZIGZAG_LOOP 15 movdqa xmm3, [rsp + qcoeff + 16] mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr - mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr + mov rdi, [rsi + vp8_blockd_dqcoeff_base + vp8_blockd_dqcoeff_offset] ; dqcoeff_ptr -- NOTE(review): both symbols are offsetof(BLOCKD, ...) constants, so this loads from the sum of two field offsets instead of computing dqcoeff_base plus dqcoeff_offset; it needs separate loads of the pointer and the offset (same pattern in the vp8_fast_quantize_b_sse2 and vp8_fast_quantize_b_ssse3 loads) ; y ^ sz pxor xmm2, xmm0 @@ -184,7 +184,7 @@ ZIGZAG_LOOP 15 movdqa xmm0, [rcx] movdqa xmm1, [rcx + 16] - mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr + mov rcx, [rsi + vp8_blockd_qcoeff_base + vp8_blockd_dqcoeff_offset] ; qcoeff_ptr -- NOTE(review): same field-offset summing issue, and this one pairs qcoeff_base with dqcoeff_offset pmullw xmm0, xmm2 pmullw xmm1, xmm3 @@ -296,9 +296,9 @@ sym(vp8_fast_quantize_b_sse2): paddw xmm1, [rcx] paddw xmm5, [rcx + 16] - mov rax, [rsi + vp8_blockd_qcoeff] + mov rax, [rsi + vp8_blockd_qcoeff_base + vp8_blockd_qcoeff_offset] mov rcx, [rsi + vp8_blockd_dequant] - mov rdi, [rsi + vp8_blockd_dqcoeff] + mov rdi, [rsi + vp8_blockd_dqcoeff_base + vp8_blockd_dqcoeff_offset] ; y = x * quant >> 16 pmulhw xmm1, [rdx] diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm index 912007e02..393f33980 100644 --- a/vp8/encoder/x86/quantize_ssse3.asm +++ b/vp8/encoder/x86/quantize_ssse3.asm @@ -76,9 +76,9 @@ sym(vp8_fast_quantize_b_ssse3): pmulhw xmm1, [rdx] pmulhw xmm5, [rdx + 16] - mov rax, [rsi + vp8_blockd_qcoeff] + mov rax, [rsi + vp8_blockd_qcoeff_base + vp8_blockd_qcoeff_offset] mov rdi, [rsi + vp8_blockd_dequant] - mov rcx, [rsi + vp8_blockd_dqcoeff] + mov rcx, [rsi + vp8_blockd_dqcoeff_base + vp8_blockd_dqcoeff_offset] pxor xmm1, xmm0 pxor xmm5, xmm4 diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 47a1788bc..f344b204a 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -33,8 +33,8 @@ static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) short *zbin_ptr = b->zbin; short *round_ptr = b->round; short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; + short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset; + short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset; short *dequant_ptr = d->dequant; d->eob = vp8_fast_quantize_b_impl_mmx( @@ -54,7 +54,7 @@ int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc); static int mbblock_error_mmx(MACROBLOCK *mb, int dc) { short *coeff_ptr = mb->block[0].coeff; - short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff_base + mb->e_mbd.block[0].dqcoeff_offset; return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc); } @@ -74,18 +74,19 @@ static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) unsigned char *z = *(be->base_src) + be->src; unsigned int src_stride = be->src_stride; short *diff = &be->src_diff[0]; - unsigned char *predictor = &bd->predictor[0]; + unsigned char *predictor = bd->predictor_base + bd->predictor_offset; vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); } #endif #if HAVE_SSE2 + int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc); static int mbblock_error_xmm(MACROBLOCK *mb, int dc) { short *coeff_ptr = mb->block[0].coeff; - short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff; + short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff_base + mb->e_mbd.block[0].dqcoeff_offset; return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc); } @@ -105,7 +106,7 @@ static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) unsigned char *z = *(be->base_src) + be->src; unsigned int src_stride = be->src_stride; short *diff = &be->src_diff[0]; - unsigned char *predictor = &bd->predictor[0]; + unsigned char *predictor = bd->predictor_base + 
bd->predictor_offset; vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); } diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 2a2f0cfad..3a8ee9c85 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -40,7 +40,6 @@ VP8_COMMON_SRCS-yes += common/findnearmv.h VP8_COMMON_SRCS-yes += common/g_common.h VP8_COMMON_SRCS-yes += common/header.h VP8_COMMON_SRCS-yes += common/idct.h -VP8_COMMON_SRCS-yes += common/invtrans.h VP8_COMMON_SRCS-yes += common/loopfilter.h VP8_COMMON_SRCS-yes += common/modecont.h VP8_COMMON_SRCS-yes += common/mv.h @@ -56,7 +55,6 @@ VP8_COMMON_SRCS-yes += common/swapyv12buffer.h VP8_COMMON_SRCS-yes += common/systemdependent.h VP8_COMMON_SRCS-yes += common/threading.h VP8_COMMON_SRCS-yes += common/treecoder.h -VP8_COMMON_SRCS-yes += common/invtrans.c VP8_COMMON_SRCS-yes += common/loopfilter.c VP8_COMMON_SRCS-yes += common/loopfilter_filters.c VP8_COMMON_SRCS-yes += common/mbpitch.c @@ -149,3 +147,33 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM) VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c + +#Append OpenCL source files to source listing if needed +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/vp8_opencl.c +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/vp8_opencl.h + +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/blockd_cl.h +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/blockd_cl.c + +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/filter_cl.h +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/filter_cl.c +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/filter_cl.cl +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/subpixel_cl.h + +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/reconinter_cl.h +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/reconinter_cl.c + +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += 
common/opencl/idctllm_cl.h +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/idctllm_cl.c +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/idctllm_cl.cl +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/idct_cl.h + +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter_cl.h +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter_cl.c +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter.cl +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter_filters_cl.c + + +VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/opencl_systemdependent.c +VP8_COMMON_SRCS-$(HAVE_DLOPEN) += common/opencl/dynamic_cl.c +VP8_COMMON_SRCS-$(HAVE_DLOPEN) += common/opencl/dynamic_cl.h diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 99657e0a5..c34e7322f 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -18,6 +18,10 @@ #include "common/onyxd.h" #include "decoder/onyxd_int.h" +#if CONFIG_OPENCL +#include "common/opencl/vp8_opencl.h" +#endif + #define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? 
VPX_CODEC_CAP_POSTPROC : 0) typedef vpx_codec_stream_info_t vp8_stream_info_t; @@ -222,6 +226,15 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) ctx->mmaps[i].dtor(&ctx->mmaps[i]); } +#if CONFIG_OPENCL + if (cl_initialized == CL_SUCCESS){ + cl_destroy(NULL, VP8_CL_NOT_INITIALIZED); +#if HAVE_DLOPEN + close_cl(); +#endif + } +#endif + return VPX_CODEC_OK; } diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index a7b68dba7..bac2bec88 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -39,6 +39,8 @@ VP8_CX_SRCS-yes += encoder/boolhuff.c VP8_CX_SRCS-yes += encoder/dct.c VP8_CX_SRCS-yes += encoder/encodeframe.c VP8_CX_SRCS-yes += encoder/encodeintra.c +VP8_CX_SRCS-yes += encoder/invtrans.h +VP8_CX_SRCS-yes += encoder/invtrans.c VP8_CX_SRCS-yes += encoder/encodemb.c VP8_CX_SRCS-yes += encoder/encodemv.c VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c diff --git a/vp8/vp8dx.mk b/vp8/vp8dx.mk index 564967191..d3bf212c5 100644 --- a/vp8/vp8dx.mk +++ b/vp8/vp8dx.mk @@ -74,3 +74,14 @@ VP8_DX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += decoder/x86/x86_dsystemdependent.c VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c + + +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/vp8_decode_cl.c +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/vp8_decode_cl.h +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/opencl_systemdependent.c +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/dequantize_cl.c +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/dequantize_cl.h +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/dequantize_cl.cl +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/idct_blk_cl.c +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/decodframe_cl.c +VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/decodframe_cl.h diff --git a/vpx_ports/config.h b/vpx_ports/config.h index 1abe70da9..ab7d3378a 100644 --- a/vpx_ports/config.h +++ b/vpx_ports/config.h @@ -7,4 +7,4 @@ * in the 
file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" +#include "../vpx_config.h" diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h index 9ec34fec6..973c6bda6 100644 --- a/vpx_ports/mem.h +++ b/vpx_ports/mem.h @@ -11,7 +11,7 @@ #ifndef VPX_PORTS_MEM_H #define VPX_PORTS_MEM_H -#include "vpx_config.h" +#include "../vpx_config.h" #include "vpx/vpx_integer.h" #if defined(__GNUC__) && __GNUC__ diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index cb0ab9466..690adeaf9 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -26,7 +26,22 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) { duck_free(ybf->buffer_alloc); - ybf->buffer_alloc = 0; + ybf->buffer_alloc = NULL; + ybf->buffer_size = -1; + + ybf->y_buffer = NULL; + ybf->u_buffer = NULL; + ybf->v_buffer = NULL; + +#if CONFIG_OPENCL + if (cl_initialized == CL_SUCCESS){ + if (ybf->buffer_mem){ + clReleaseMemObject(ybf->buffer_mem); + ybf->buffer_mem = NULL; + } + } +#endif + } else { @@ -66,18 +81,31 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int * when we have a large motion vector in V on the last v block. * Note : We never use these pixels anyway so this doesn't hurt. 
*/ + ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0); if (ybf->buffer_alloc == NULL) return -1; + ybf->buffer_size = ybf->frame_size + (ybf->y_stride * 2) + 32; + +#if CONFIG_OPENCL + ybf->buffer_mem = NULL; + if (cl_initialized == CL_SUCCESS){ + ybf->buffer_mem = clCreateBuffer(cl_data.context, CL_MEM_READ_WRITE, ybf->buffer_size, NULL, NULL); + if (ybf->buffer_mem == NULL){ + cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED); + } + } +#endif + ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border; if (yplane_size & 0xf) yplane_size += 16 - (yplane_size & 0xf); ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2; - ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2; + ybf->v_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2 + uvplane_size; ybf->corrupted = 0; /* assume not currupted by errors */ } diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h index e10db3468..f4c99bd22 100644 --- a/vpx_scale/yv12config.h +++ b/vpx_scale/yv12config.h @@ -19,6 +19,12 @@ extern "C" #define VP7BORDERINPIXELS 48 #define VP8BORDERINPIXELS 32 +#include "../vpx_config.h" + +#if CONFIG_OPENCL +#include "../vp8/common/opencl/vp8_opencl.h" +#endif + /************************************* For INT_YUV: @@ -54,6 +60,11 @@ extern "C" unsigned char *v_buffer; unsigned char *buffer_alloc; + int buffer_size; +#if CONFIG_OPENCL + cl_mem buffer_mem; +#endif + int border; int frame_size; YUV_TYPE clrtype; -- 2.50.1