/vpx_config.h
/vpx_version.h
TAGS
+vpxdec
+vpxenc
+.project
+.cproject
+*.csv
+*.oclpj
enabled rvct && check_add_cflags -Otime
enabled small && check_add_cflags -O2 || check_add_cflags -O3
fi
+
+ if enabled opencl; then
+ disable multithread
+ echo " disabling multithread"
+ soft_enable opencl # report OpenCL support in the configure output
+ enable runtime_cpu_detect
+
+ #Use dlopen() to load OpenCL when possible.
+ case ${toolchain} in
+ *darwin10*)
+ check_add_cflags -D__APPLE__
+ add_extralibs -framework OpenCL
+ ;;
+ *-win32-gcc)
+ if check_header dlfcn.h; then
+ add_extralibs -ldl
+ enable dlopen
+ else
+ #This shouldn't be a hard-coded path in the long term
+ add_extralibs -L/cygdrive/c/Windows/System32 -lOpenCL
+ fi
+ ;;
+ *)
+ if check_header dlfcn.h; then
+ add_extralibs -ldl
+ enable dlopen
+ else
+ add_extralibs -lOpenCL
+ fi
+ ;;
+ esac
+ fi
# Position Independent Code (PIC) support, for building relocatable
# shared objects
${toggle_runtime_cpu_detect} runtime cpu detection
${toggle_shared} shared library support
${toggle_small} favor smaller size over speed
+ ${toggle_opencl} support for OpenCL-assisted VP8 decoding (experimental)
${toggle_postproc_visualizer} macro block / block level visualizers
Codecs:
all_platforms="${all_platforms} x86-darwin8-icc"
all_platforms="${all_platforms} x86-darwin9-gcc"
all_platforms="${all_platforms} x86-darwin9-icc"
+all_platforms="${all_platforms} x86-darwin10-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
all_platforms="${all_platforms} x86-solaris-gcc"
alt_tree_layout
pthread_h
sys_mman_h
+ dlopen
"
CONFIG_LIST="
external_build
realtime_only
shared
small
+ opencl
postproc_visualizer
os_support
"
realtime_only
shared
small
+ opencl
postproc_visualizer
"
cat <<EOF > ${BUILD_PFX}vpx_config.c
static const char* const cfg = "$CONFIGURE_ARGS";
const char *vpx_codec_build_config(void) {return cfg;}
+static const char* const libdir = "$libdir";
+const char *vpx_codec_lib_dir(void) {return libdir;}
EOF
else
INSTALL-LIBS-yes += $(LIBSUBDIR)/libvpx.a
INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
+
+# Install the OpenCL kernel source files if OpenCL is enabled.
+ifeq ($(CONFIG_OPENCL),yes)
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/filter_cl.cl
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/idctllm_cl.cl
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/loopfilter.cl
+# Only install the decoder CL files if the VP8 decoder is enabled
+ifeq ($(CONFIG_VP8_DECODER),yes)
+INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/decoder/opencl/dequantize_cl.cl
+endif
+endif #CONFIG_OPENCL=yes
+
endif
CODEC_SRCS=$(call enabled,CODEC_SRCS)
case 0:
cm->no_lpf = 0;
cm->simpler_lpf = 0;
- cm->use_bilinear_mc_filter = 0;
+ cm->mcomp_filter_type = SIXTAP;
cm->full_pixel = 0;
break;
case 1:
cm->no_lpf = 0;
cm->simpler_lpf = 1;
- cm->use_bilinear_mc_filter = 1;
+ cm->mcomp_filter_type = BILINEAR;
cm->full_pixel = 0;
break;
case 2:
cm->no_lpf = 1;
cm->simpler_lpf = 0;
- cm->use_bilinear_mc_filter = 1;
+ cm->mcomp_filter_type = BILINEAR;
cm->full_pixel = 0;
break;
case 3:
cm->no_lpf = 1;
cm->simpler_lpf = 1;
- cm->use_bilinear_mc_filter = 1;
+ cm->mcomp_filter_type = BILINEAR;
cm->full_pixel = 1;
break;
default:
/*4,5,6,7 are reserved for future use*/
cm->no_lpf = 0;
cm->simpler_lpf = 0;
- cm->use_bilinear_mc_filter = 0;
+ cm->mcomp_filter_type = SIXTAP;
cm->full_pixel = 0;
break;
}
oci->mb_no_coeff_skip = 1;
oci->no_lpf = 0;
oci->simpler_lpf = 0;
- oci->use_bilinear_mc_filter = 0;
+ oci->mcomp_filter_type = SIXTAP;
oci->full_pixel = 0;
oci->multi_token_partition = ONE_PARTITION;
oci->clr_type = REG_YUV;
void vpx_log(const char *format, ...);
-#include "vpx_ports/config.h"
-#include "vpx_scale/yv12config.h"
+#include "../../vpx_ports/config.h"
+#include "../../vpx_scale/yv12config.h"
#include "mv.h"
#include "treecoder.h"
#include "subpixel.h"
-#include "vpx_ports/mem.h"
+#include "../../vpx_ports/mem.h"
+
+#include "../../vpx_config.h"
+#if CONFIG_OPENCL
+#include "opencl/vp8_opencl.h"
+#endif
#define TRUE 1
#define FALSE 0
typedef enum
{
- DC_PRED, /* average of above and left pixels */
- V_PRED, /* vertical prediction */
- H_PRED, /* horizontal prediction */
- TM_PRED, /* Truemotion prediction */
- B_PRED, /* block based prediction, each block has its own prediction mode */
-
- NEARESTMV,
- NEARMV,
- ZEROMV,
- NEWMV,
- SPLITMV,
-
- MB_MODE_COUNT
+ DC_PRED = 0, /* average of above and left pixels */
+ V_PRED = 1, /* vertical prediction */
+ H_PRED = 2, /* horizontal prediction */
+ TM_PRED = 3, /* Truemotion prediction */
+ B_PRED = 4, /* block based prediction, each block has its own prediction mode */
+
+ NEARESTMV = 5,
+ NEARMV = 6,
+ ZEROMV = 7,
+ NEWMV = 8,
+ SPLITMV = 9,
+
+ MB_MODE_COUNT = 10
} MB_PREDICTION_MODE;
/* Macroblock level features */
typedef struct
{
- short *qcoeff;
- short *dqcoeff;
- unsigned char *predictor;
- short *diff;
- short *reference;
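+ /* The per-block data pointers are now expressed as a shared base pointer
+ * plus an integer offset, so the CL kernels (which address per-macroblock
+ * cl_mem buffers by offset) and the host code can use the same indexing. */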
+ short *qcoeff_base;
+ int qcoeff_offset;
+
+ short *dqcoeff_base;
+ int dqcoeff_offset;
+
+ unsigned char *predictor_base;
+ int predictor_offset;
+
+ short *diff_base;
+ int diff_offset;
short *dequant;
+#if CONFIG_OPENCL
+ cl_command_queue cl_commands; //pointer to macroblock CL command queue
+
+ cl_mem cl_diff_mem;
+ cl_mem cl_predictor_mem;
+ cl_mem cl_qcoeff_mem;
+ cl_mem cl_dqcoeff_mem;
+ cl_mem cl_eobs_mem;
+
+ cl_mem cl_dequant_mem; //Block-specific, not shared
+
+ cl_bool sixtap_filter; //Subpixel Prediction type (true=sixtap, false=bilinear)
+
+#endif
+
/* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
- unsigned char **base_pre;
+ unsigned char **base_pre; //previous frame, same Macroblock, base pointer
int pre;
int pre_stride;
- unsigned char **base_dst;
+ unsigned char **base_dst; //destination base pointer
int dst;
int dst_stride;
- int eob;
+ int eob; //only used in encoder? Decoder uses MBD.eobs
+
+ char *eobs_base; //beginning of MB.eobs
B_MODE_INFO bmi;
{
DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
-/* not used DECLARE_ALIGNED(16, short, reference[384]); */
DECLARE_ALIGNED(16, short, qcoeff[400]);
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, char, eobs[25]);
+#if CONFIG_OPENCL
+ cl_command_queue cl_commands; //Each macroblock gets its own command queue.
+ cl_mem cl_diff_mem;
+ cl_mem cl_predictor_mem;
+ cl_mem cl_qcoeff_mem;
+ cl_mem cl_dqcoeff_mem;
+ cl_mem cl_eobs_mem;
+
+ cl_bool sixtap_filter;
+#endif
+
/* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
- YV12_BUFFER_CONFIG dst;
+ YV12_BUFFER_CONFIG dst; /* Destination buffer for current frame */
MODE_INFO *mode_info_context;
int mode_info_stride;
unsigned int frames_since_golden;
unsigned int frames_till_alt_ref_frame;
+
vp8_subpix_fn_t subpixel_predict;
vp8_subpix_fn_t subpixel_predict8x4;
vp8_subpix_fn_t subpixel_predict8x8;
#include <stdlib.h>
+#include <stdio.h>
+
+#define REGISTER_FILTER 1
+#define CLAMP(x,min,max) do { if ((x) < (min)) (x) = (min); else if ((x) > (max)) (x) = (max); } while (0)
+
+#if REGISTER_FILTER
+#define FILTER0 filter0
+#define FILTER1 filter1
+#define FILTER2 filter2
+#define FILTER3 filter3
+#define FILTER4 filter4
+#define FILTER5 filter5
+#else
+#define FILTER0 vp8_filter[0]
+#define FILTER1 vp8_filter[1]
+#define FILTER2 vp8_filter[2]
+#define FILTER3 vp8_filter[3]
+#define FILTER4 vp8_filter[4]
+#define FILTER5 vp8_filter[5]
+#endif
+
+#define SRC_INCREMENT src_increment
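+/* With REGISTER_FILTER, the six filter taps are copied into local variables at
+ * the top of each pass so the compiler can keep them in registers instead of
+ * re-reading vp8_filter[i] on every iteration; the FILTERx macros select
+ * between the two forms. */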
+
#include "filter.h"
#include "vpx_ports/mem.h"
DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
{
-
{ 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
{ 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
const short *vp8_filter
)
{
+
unsigned int i, j;
- int Temp;
+ int Temp;
+
+#if REGISTER_FILTER
+ short filter0 = vp8_filter[0];
+ short filter1 = vp8_filter[1];
+ short filter2 = vp8_filter[2];
+ short filter3 = vp8_filter[3];
+ short filter4 = vp8_filter[4];
+ short filter5 = vp8_filter[5];
+#endif
+
+ int ps2 = 2*(int)pixel_step;
+ int ps3 = 3*(int)pixel_step;
+ unsigned int src_increment = src_pixels_per_line - output_width;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
- Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
- ((int)src_ptr[0] * vp8_filter[2]) +
- ((int)src_ptr[pixel_step] * vp8_filter[3]) +
- ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
- ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
- (VP8_FILTER_WEIGHT >> 1); /* Rounding */
+ Temp = ((int)src_ptr[-1*ps2] * FILTER0);
+ Temp += ((int)src_ptr[-1*(int)pixel_step] * FILTER1) +
+ ((int)src_ptr[0] * FILTER2) +
+ ((int)src_ptr[pixel_step] * FILTER3) +
+ ((int)src_ptr[ps2] * FILTER4) +
+ ((int)src_ptr[ps3] * FILTER5) +
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
-
- if (Temp < 0)
- Temp = 0;
- else if (Temp > 255)
- Temp = 255;
+ CLAMP(Temp, 0, 255);
output_ptr[j] = Temp;
src_ptr++;
}
/* Next row... */
- src_ptr += src_pixels_per_line - output_width;
+ src_ptr += SRC_INCREMENT;
output_ptr += output_width;
}
}
const short *vp8_filter
)
{
- unsigned int i, j;
- int Temp;
+ unsigned int i, j;
+ int Temp;
+
+#if REGISTER_FILTER
+ short filter0 = vp8_filter[0];
+ short filter1 = vp8_filter[1];
+ short filter2 = vp8_filter[2];
+ short filter3 = vp8_filter[3];
+ short filter4 = vp8_filter[4];
+ short filter5 = vp8_filter[5];
+#endif
+
+ int ps2 = ((int)pixel_step) << 1;
+ int ps3 = ps2 + (int)pixel_step;
+ unsigned int src_increment = src_pixels_per_line - output_width;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
/* Apply filter */
- Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
- ((int)src_ptr[0] * vp8_filter[2]) +
- ((int)src_ptr[pixel_step] * vp8_filter[3]) +
- ((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
- ((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
+ Temp = ((int)src_ptr[-1*ps2] * FILTER0) +
+ ((int)src_ptr[-1*(int)pixel_step] * FILTER1) +
+ ((int)src_ptr[0] * FILTER2) +
+ ((int)src_ptr[pixel_step] * FILTER3) +
+ ((int)src_ptr[ps2] * FILTER4) +
+ ((int)src_ptr[ps3] * FILTER5) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
-
- if (Temp < 0)
- Temp = 0;
- else if (Temp > 255)
- Temp = 255;
+ CLAMP(Temp, 0, 255);
output_ptr[j] = (unsigned char)Temp;
src_ptr++;
}
/* Start next row */
- src_ptr += src_pixels_per_line - output_width;
+ src_ptr += src_increment;
output_ptr += output_pitch;
}
}
filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
}
+
void vp8_sixtap_predict8x8_c
(
unsigned char *src_ptr,
extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
+extern void vp8_arch_opencl_common_init(VP8_COMMON *ctx);
void vp8_machine_specific_config(VP8_COMMON *ctx)
{
vp8_arch_arm_common_init(ctx);
#endif
+#if CONFIG_OPENCL && (ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL || ENABLE_CL_LOOPFILTER)
+ vp8_arch_opencl_common_init(ctx);
+#endif
+
}
#include "arm/idct_arm.h"
#endif
+#if CONFIG_OPENCL
+#include "opencl/idct_cl.h"
+#endif
+
#ifndef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
#endif
#include "loopfilter.h"
#include "onyxc_int.h"
+#if CONFIG_OPENCL
+#include "opencl/loopfilter_cl.h"
+#endif
+
typedef unsigned char uc;
int i;
unsigned char *y_ptr, *u_ptr, *v_ptr;
+#if CONFIG_OPENCL && ENABLE_CL_LOOPFILTER
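+ /* If the CL runtime initialized successfully, hand the frame to the CL loop
+ * filter and return; otherwise fall through to the C implementation below. */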
+ if ( cl_initialized == CL_SUCCESS ){
+ vp8_loop_filter_frame_cl(cm,mbd,default_filt_lvl);
+ return;
+ }
+#endif
+
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
/* Note the baseline filter values for each segment */
}
+/* Encoder only... */
void vp8_loop_filter_frame_yonly
(
VP8_COMMON *cm,
}
-
+/* Encoder only... */
void vp8_loop_filter_partial_frame
(
VP8_COMMON *cm,
}
static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
-
{
signed char ps0, qs0;
signed char ps1, qs1;
*op1 = u ^ 0x80;
}
+
void vp8_loop_filter_horizontal_edge_c
(
unsigned char *s,
#include "blockd.h"
+#include "stdio.h"
+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "opencl/vp8_opencl.h"
+#endif
+
typedef enum
{
PRED = 0,
static void setup_block
(
BLOCKD *b,
- int mv_stride,
unsigned char **base,
int Stride,
int offset,
int block;
unsigned char **y, **u, **v;
+ unsigned char **buf_base;
+ int y_off, u_off, v_off;
if (bs == DEST)
{
+ buf_base = &x->dst.buffer_alloc;
+ y_off = x->dst.y_buffer - x->dst.buffer_alloc;
+ u_off = x->dst.u_buffer - x->dst.buffer_alloc;
+ v_off = x->dst.v_buffer - x->dst.buffer_alloc;
y = &x->dst.y_buffer;
u = &x->dst.u_buffer;
v = &x->dst.v_buffer;
+ y_off = 0;
+
+ //y = buf_base;
+ //y_off = x->dst.y_buffer - x->dst.buffer_alloc;
+
+ u = buf_base;
+ v = buf_base;
+
+ u_off = x->dst.u_buffer - x->dst.buffer_alloc;
+ v_off = x->dst.v_buffer - x->dst.buffer_alloc;
+
}
else
{
+ buf_base = &x->pre.buffer_alloc;
y = &x->pre.y_buffer;
u = &x->pre.u_buffer;
v = &x->pre.v_buffer;
+ y_off = u_off = v_off = 0;
+
+ //y = buf_base;
+ //y_off = x->pre.y_buffer - x->pre.buffer_alloc;
+ //u = buf_base;
+ //u_off = x->pre.u_buffer - x->pre.buffer_alloc;
+ //v = buf_base;
+ //v_off = x->pre.v_buffer - x->pre.buffer_alloc;
}
for (block = 0; block < 16; block++) /* y blocks */
{
- setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
- (block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
+ setup_block(&x->block[block], y, x->dst.y_stride,
+ y_off + ((block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4), bs);
}
for (block = 16; block < 20; block++) /* U and V blocks */
{
- setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
- ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
+ int block_off = ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
- setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
- ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
+ setup_block(&x->block[block], u, x->dst.uv_stride,
+ u_off + block_off, bs);
+
+ setup_block(&x->block[block+4], v, x->dst.uv_stride,
+ v_off + block_off, bs);
}
}
void vp8_setup_block_dptrs(MACROBLOCKD *x)
{
int r, c;
+ unsigned int offset;
+
+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
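+ /* Without ONE_CQ_PER_MB, each plane (Y, U, V) gets its own command queue,
+ * shared by that plane's blocks in the loops below; with ONE_CQ_PER_MB,
+ * every block instead reuses the macroblock's queue. */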
+ cl_command_queue y_cq, u_cq, v_cq;
+ int err;
+ if (cl_initialized == CL_SUCCESS){
+ //Create command queue for Y/U/V Planes
+ y_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+ if (!y_cq || err != CL_SUCCESS) {
+ printf("Error: Failed to create a command queue!\n");
+ cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+ }
+ u_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+ if (!u_cq || err != CL_SUCCESS) {
+ printf("Error: Failed to create a command queue!\n");
+ cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+ }
+ v_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+ if (!v_cq || err != CL_SUCCESS) {
+ printf("Error: Failed to create a command queue!\n");
+ cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+ }
+ }
+#endif
+ /* 16 Y blocks */
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
- x->block[r*4+c].diff = &x->diff[r * 4 * 16 + c * 4];
- x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
+ offset = r * 4 * 16 + c * 4;
+ x->block[r*4+c].diff_offset = offset;
+ x->block[r*4+c].predictor_offset = offset;
+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
+ if (cl_initialized == CL_SUCCESS)
+ x->block[r*4+c].cl_commands = y_cq;
+#endif
}
}
+ /* 4 U Blocks */
for (r = 0; r < 2; r++)
{
for (c = 0; c < 2; c++)
{
- x->block[16+r*2+c].diff = &x->diff[256 + r * 4 * 8 + c * 4];
- x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
-
+ offset = 256 + r * 4 * 8 + c * 4;
+ x->block[16+r*2+c].diff_offset = offset;
+ x->block[16+r*2+c].predictor_offset = offset;
+
+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
+ if (cl_initialized == CL_SUCCESS)
+ x->block[16+r*2+c].cl_commands = u_cq;
+#endif
}
}
+ /* 4 V Blocks */
for (r = 0; r < 2; r++)
{
for (c = 0; c < 2; c++)
{
- x->block[20+r*2+c].diff = &x->diff[320+ r * 4 * 8 + c * 4];
- x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
-
+ offset = 320+ r * 4 * 8 + c * 4;
+ x->block[20+r*2+c].diff_offset = offset;
+ x->block[20+r*2+c].predictor_offset = offset;
+
+#if CONFIG_OPENCL && !ONE_CQ_PER_MB
+ if (cl_initialized == CL_SUCCESS)
+ x->block[20+r*2+c].cl_commands = v_cq;
+#endif
}
}
- x->block[24].diff = &x->diff[384];
+ x->block[24].diff_offset = 384;
for (r = 0; r < 25; r++)
{
- x->block[r].qcoeff = x->qcoeff + r * 16;
- x->block[r].dqcoeff = x->dqcoeff + r * 16;
+ x->block[r].qcoeff_base = x->qcoeff;
+ x->block[r].qcoeff_offset = r * 16;
+ x->block[r].dqcoeff_base = x->dqcoeff;
+ x->block[r].dqcoeff_offset = r * 16;
+
+ x->block[r].predictor_base = x->predictor;
+ x->block[r].diff_base = x->diff;
+ x->block[r].eobs_base = x->eobs;
+
+#if CONFIG_OPENCL
+ if (cl_initialized == CL_SUCCESS){
+ /* Copy command queue reference from macroblock */
+#if ONE_CQ_PER_MB
+ x->block[r].cl_commands = x->cl_commands;
+#endif
+
+ /* Set up CL memory buffers as appropriate */
+ x->block[r].cl_diff_mem = x->cl_diff_mem;
+ x->block[r].cl_dqcoeff_mem = x->cl_dqcoeff_mem;
+ x->block[r].cl_eobs_mem = x->cl_eobs_mem;
+ x->block[r].cl_predictor_mem = x->cl_predictor_mem;
+ x->block[r].cl_qcoeff_mem = x->cl_qcoeff_mem;
+ }
+
+ //Copy filter type to block.
+ x->block[r].sixtap_filter = x->sixtap_filter;
+#endif
}
+
}
void vp8_build_block_doffsets(MACROBLOCKD *x)
{
-
/* handle the destination pitch features */
setup_macroblock(x, DEST);
setup_macroblock(x, PRED);
int mb_no_coeff_skip;
int no_lpf;
int simpler_lpf;
- int use_bilinear_mc_filter;
int full_pixel;
int base_qindex;
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "../../decoder/onyxd_int.h"
+#include "../../../vpx_ports/config.h"
+#include "../../common/idct.h"
+#include "blockd_cl.h"
+#include "../../decoder/opencl/dequantize_cl.h"
+
+
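+/* Copy the macroblock-level buffers selected by 'flags' (DIFF, PREDICTOR,
+ * QCOEFF, DQCOEFF, EOBS, PRE_BUF, DST_BUF) from host memory into their CL
+ * buffers. Returns CL_SUCCESS, or the cl_initialized status if CL is unusable. */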
+int vp8_cl_mb_prep(MACROBLOCKD *x, int flags){
+ int err;
+
+ if (cl_initialized != CL_SUCCESS){
+ return cl_initialized;
+ }
+
+ //Copy all blockd.cl_*_mem objects
+ if (flags & DIFF)
+ VP8_CL_SET_BUF(x->cl_commands, x->cl_diff_mem, sizeof(cl_short)*400, x->diff,
+ ,err
+ );
+
+ if (flags & PREDICTOR)
+ VP8_CL_SET_BUF(x->cl_commands, x->cl_predictor_mem, sizeof(cl_uchar)*384, x->predictor,
+ ,err
+ );
+
+ if (flags & QCOEFF)
+ VP8_CL_SET_BUF(x->cl_commands, x->cl_qcoeff_mem, sizeof(cl_short)*400, x->qcoeff,
+ ,err
+ );
+
+ if (flags & DQCOEFF)
+ VP8_CL_SET_BUF(x->cl_commands, x->cl_dqcoeff_mem, sizeof(cl_short)*400, x->dqcoeff,
+ ,err
+ );
+
+ if (flags & EOBS)
+ VP8_CL_SET_BUF(x->cl_commands, x->cl_eobs_mem, sizeof(cl_char)*25, x->eobs,
+ ,err
+ );
+
+ if (flags & PRE_BUF){
+ VP8_CL_SET_BUF(x->cl_commands, x->pre.buffer_mem, x->pre.buffer_size, x->pre.buffer_alloc,
+ ,err
+ );
+ }
+
+ if (flags & DST_BUF){
+ VP8_CL_SET_BUF(x->cl_commands, x->dst.buffer_mem, x->dst.buffer_size, x->dst.buffer_alloc,
+ ,err
+ );
+ }
+
+
+ return CL_SUCCESS;
+}
+
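+/* Enqueue (non-blocking) reads of the selected macroblock buffers from the
+ * device back into host memory; the command queue presumably still has to be
+ * synchronized before the host copies are used. */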
+int vp8_cl_mb_finish(MACROBLOCKD *x, int flags){
+ int err;
+
+ if (cl_initialized != CL_SUCCESS){
+ return cl_initialized;
+ }
+
+ if (flags & DIFF){
+ err = clEnqueueReadBuffer(x->cl_commands, x->cl_diff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->diff, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & PREDICTOR){
+ err = clEnqueueReadBuffer(x->cl_commands, x->cl_predictor_mem, CL_FALSE, 0, sizeof(cl_uchar)*384, x->predictor, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & QCOEFF){
+ err = clEnqueueReadBuffer(x->cl_commands, x->cl_qcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->qcoeff, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & DQCOEFF){
+ err = clEnqueueReadBuffer(x->cl_commands, x->cl_dqcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->dqcoeff, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & EOBS){
+ err = clEnqueueReadBuffer(x->cl_commands, x->cl_eobs_mem, CL_FALSE, 0, sizeof(cl_char)*25, x->eobs, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & PRE_BUF){
+ err = clEnqueueReadBuffer(x->cl_commands, x->pre.buffer_mem, CL_FALSE,
+ 0, x->pre.buffer_size, x->pre.buffer_alloc, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & DST_BUF){
+ err = clEnqueueReadBuffer(x->cl_commands, x->dst.buffer_mem, CL_FALSE,
+ 0, x->dst.buffer_size, x->dst.buffer_alloc, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+
+ return CL_SUCCESS;
+}
+
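+/* Per-block counterpart of vp8_cl_mb_prep: uploads the shared buffers through
+ * the block's handles, plus the block-specific dequantization factors when
+ * DEQUANT is set. */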
+int vp8_cl_block_prep(BLOCKD *b, int flags){
+ int err;
+
+ if (cl_initialized != CL_SUCCESS){
+ return cl_initialized;
+ }
+
+ //Copy all blockd.cl_*_mem objects
+ if (flags & DIFF)
+ VP8_CL_SET_BUF(b->cl_commands, b->cl_diff_mem, sizeof(cl_short)*400, b->diff_base,
+ ,err
+ );
+
+ if (flags & PREDICTOR)
+ VP8_CL_SET_BUF(b->cl_commands, b->cl_predictor_mem, sizeof(cl_uchar)*384, b->predictor_base,
+ ,err
+ );
+
+ if (flags & QCOEFF)
+ VP8_CL_SET_BUF(b->cl_commands, b->cl_qcoeff_mem, sizeof(cl_short)*400, b->qcoeff_base,
+ ,err
+ );
+
+ if (flags & DQCOEFF)
+ VP8_CL_SET_BUF(b->cl_commands, b->cl_dqcoeff_mem, sizeof(cl_short)*400, b->dqcoeff_base,
+ ,err
+ );
+
+ if (flags & EOBS)
+ VP8_CL_SET_BUF(b->cl_commands, b->cl_eobs_mem, sizeof(cl_char)*25, b->eobs_base,
+ ,err
+ );
+
+ if (flags & DEQUANT)
+ VP8_CL_SET_BUF(b->cl_commands, b->cl_dequant_mem, sizeof(cl_short)*16 ,b->dequant,
+ ,err
+ );
+
+ return CL_SUCCESS;
+}
+
+int vp8_cl_block_finish(BLOCKD *b, int flags){
+ int err;
+
+ if (cl_initialized != CL_SUCCESS){
+ return cl_initialized;
+ }
+
+ if (flags & DIFF){
+ err = clEnqueueReadBuffer(b->cl_commands, b->cl_diff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->diff_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & PREDICTOR){
+ err = clEnqueueReadBuffer(b->cl_commands, b->cl_predictor_mem, CL_FALSE, 0, sizeof(cl_uchar)*384, b->predictor_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & QCOEFF){
+ err = clEnqueueReadBuffer(b->cl_commands, b->cl_qcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->qcoeff_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & DQCOEFF){
+ err = clEnqueueReadBuffer(b->cl_commands, b->cl_dqcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->dqcoeff_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & EOBS){
+ err = clEnqueueReadBuffer(b->cl_commands, b->cl_eobs_mem, CL_FALSE, 0, sizeof(cl_char)*25, b->eobs_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ if (flags & DEQUANT){
+ err = clEnqueueReadBuffer(b->cl_commands, b->cl_dequant_mem, CL_FALSE, 0, sizeof(cl_short)*16 ,b->dequant, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read from GPU!\n",
+ , err
+ );
+ }
+
+ return CL_SUCCESS;
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef BLOCKD_OPENCL_H
+#define BLOCKD_OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp8_opencl.h"
+#include "../blockd.h"
+
+#define DIFF 0x0001
+#define PREDICTOR 0x0002
+#define QCOEFF 0x0004
+#define DQCOEFF 0x0008
+#define EOBS 0x0010
+#define DEQUANT 0x0020
+#define PRE_BUF 0x0040
+#define DST_BUF 0x0080
+
+#define BLOCK_COPY_ALL 0xffff
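+
+/* These flag bits select which buffers the prep/finish helpers transfer, e.g.
+ * vp8_cl_mb_prep(x, QCOEFF | PREDICTOR) uploads only those two buffers, while
+ * BLOCK_COPY_ALL transfers everything a given helper supports. */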
+
+/*
+#define BLOCK_MEM_SIZE 6
+enum {
+ DIFF_MEM = 0,
+ PRED_MEM = 1,
+ QCOEFF_MEM = 2,
+ DQCOEFF_MEM = 3,
+ EOBS_MEM = 4,
+ DEQUANT_MEM = 5
+} BLOCK_MEM_TYPES;
+
+
+struct cl_block_mem{
+ cl_mem gpu_mem;
+ size_t size;
+ void *host_mem;
+};
+
+typedef struct cl_block_mem block_mem;
+*/
+
+extern int vp8_cl_block_finish(BLOCKD *b, int flags);
+extern int vp8_cl_block_prep(BLOCKD *b, int flags);
+
+extern int vp8_cl_mb_prep(MACROBLOCKD *x, int flags);
+extern int vp8_cl_mb_finish(MACROBLOCKD *x, int flags);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8_opencl.h"
+
+#include <stdio.h>
+
+CL_FUNCTIONS cl;
+void *dll = NULL;
+int cl_loaded = VP8_CL_NOT_INITIALIZED;
+
+int close_cl(){
+ int ret = dlclose(dll);
+
+ if (ret != 0)
+ fprintf(stderr, "Error closing OpenCL library: %s", dlerror());
+
+ return ret;
+}
+
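+/* dlopen() the named OpenCL library and resolve the required entry points into
+ * the 'cl' dispatch table. Returns CL_SUCCESS, VP8_CL_TRIED_BUT_FAILED if the
+ * library cannot be opened, or CL_INVALID_PLATFORM if a symbol is missing
+ * (CL_LOAD_FN closes the library in that case). */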
+int load_cl(char *lib_name){
+
+ //printf("Loading OpenCL library\n");
+ dll = dlopen(lib_name, RTLD_NOW|RTLD_LOCAL);
+ if (dll != NULL){
+ //printf("Found CL library\n");
+ } else {
+ //printf("Didn't find CL library\n");
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ CL_LOAD_FN("clGetPlatformIDs", cl.getPlatformIDs);
+ CL_LOAD_FN("clGetPlatformInfo", cl.getPlatformInfo);
+ CL_LOAD_FN("clGetDeviceIDs", cl.getDeviceIDs);
+ CL_LOAD_FN("clGetDeviceInfo", cl.getDeviceInfo);
+ CL_LOAD_FN("clCreateContext", cl.createContext);
+// CL_LOAD_FN("clCreateContextFromType", cl.createContextFromType);
+// CL_LOAD_FN("clRetainContext", cl.retainContext);
+ CL_LOAD_FN("clReleaseContext", cl.releaseContext);
+// CL_LOAD_FN("clGetContextInfo", cl.getContextInfo);
+ CL_LOAD_FN("clCreateCommandQueue", cl.createCommandQueue);
+// CL_LOAD_FN("clRetainCommandQueue", cl.retainCommandQueue);
+ CL_LOAD_FN("clReleaseCommandQueue", cl.releaseCommandQueue);
+// CL_LOAD_FN("clGetCommandQueueInfo", cl.getCommandQueue);
+ CL_LOAD_FN("clCreateBuffer", cl.createBuffer);
+// CL_LOAD_FN("clCreateImage2D", cl.createImage2D);
+// CL_LOAD_FN("clCreateImage3D", cl.createImage3D);
+// CL_LOAD_FN("clRetainMemObject", cl.retainMemObject);
+ CL_LOAD_FN("clReleaseMemObject", cl.releaseMemObject);
+// CL_LOAD_FN("clGetSupportedImageFormats", cl.getSupportedImageFormats);
+// CL_LOAD_FN("clGetMemObjectInfo", cl.getMemObjectInfo);
+// CL_LOAD_FN("clGetImageInfo", cl.getImageInfo);
+// CL_LOAD_FN("clCreateSampler", cl.createSampler);
+// CL_LOAD_FN("clRetainSampler", cl.retainSampler);
+// CL_LOAD_FN("clReleaseSampler", cl.releaseSampler);
+// CL_LOAD_FN("clGetSamplerInfo", cl.getSamplerInfo);
+ CL_LOAD_FN("clCreateProgramWithSource", cl.createProgramWithSource);
+// CL_LOAD_FN("clCreateProgramWithBinary", cl.createProgramWithBinary);
+// CL_LOAD_FN("clRetainProgram", cl.retainProgram);
+ CL_LOAD_FN("clReleaseProgram", cl.releaseProgram);
+ CL_LOAD_FN("clBuildProgram", cl.buildProgram);
+// CL_LOAD_FN("clUnloadCompiler", cl.unloadCompiler);
+ CL_LOAD_FN("clGetProgramInfo", cl.getProgramInfo);
+ CL_LOAD_FN("clGetProgramBuildInfo", cl.getProgramBuildInfo);
+ CL_LOAD_FN("clCreateKernel", cl.createKernel);
+// CL_LOAD_FN("clCreateKernelsInProgram", cl.createKernelsInProgram);
+// CL_LOAD_FN("clRetainKernel", cl.retainKernel);
+ CL_LOAD_FN("clReleaseKernel", cl.releaseKernel);
+ CL_LOAD_FN("clSetKernelArg", cl.setKernelArg);
+// CL_LOAD_FN("clGetKernelInfo", cl.getKernelInfo);
+ CL_LOAD_FN("clGetKernelWorkGroupInfo", cl.getKernelWorkGroupInfo);
+// CL_LOAD_FN("clWaitForEvents", cl.waitForEvents);
+// CL_LOAD_FN("clGetEventInfo", cl.getEventInfo);
+// CL_LOAD_FN("clRetainEvent", cl.retainEvent);
+// CL_LOAD_FN("clReleaseEvent", cl.releaseEvent);
+// CL_LOAD_FN("clGetEventProfilingInfo", cl.getEventProfilingInfo);
+ CL_LOAD_FN("clFlush", cl.flush);
+ CL_LOAD_FN("clFinish", cl.finish);
+ CL_LOAD_FN("clEnqueueReadBuffer", cl.enqueueReadBuffer);
+ CL_LOAD_FN("clEnqueueWriteBuffer", cl.enqueueWriteBuffer);
+ CL_LOAD_FN("clEnqueueCopyBuffer", cl.enqueueCopyBuffer);
+// CL_LOAD_FN("clEnqueueReadImage", cl.enqueueReadImage);
+// CL_LOAD_FN("clEnqueueWriteImage", cl.enqueueWriteImage);
+// CL_LOAD_FN("clEnqueueCopyImage", cl.enqueueCopyImage);
+// CL_LOAD_FN("clEnqueueCopyImageToBuffer", cl.enqueueCopyImageToBuffer);
+// CL_LOAD_FN("clEnqueueCopyBufferToImage", cl.enqueueCopyBufferToImage);
+// CL_LOAD_FN("clEnqueueMapBuffer", cl.enqueueMapBuffer);
+// CL_LOAD_FN("clEnqueueMapImage", cl.enqueueMapImage);
+// CL_LOAD_FN("clEnqueueUnmapMemObject", cl.enqueueUnmapMemObject);
+ CL_LOAD_FN("clEnqueueNDRangeKernel", cl.enqueueNDRAngeKernel);
+// CL_LOAD_FN("clEnqueueTask", cl.enqueueTask);
+// CL_LOAD_FN("clEnqueueNativeKernel", cl.enqueueNativeKernel);
+// CL_LOAD_FN("clEnqueueMarker", cl.enqueueMarker);
+// CL_LOAD_FN("clEnqueueWaitForEvents", cl.enqueueWaitForEvents);
+ CL_LOAD_FN("clEnqueueBarrier", cl.enqueueBarrier);
+// CL_LOAD_FN("clGetExtensionFunctionAddress", cl.getExtensionFunctionAddress);
+
+ return CL_SUCCESS;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef DYNAMIC_CL_H
+#define DYNAMIC_CL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include <dlfcn.h>
+
+int load_cl(char *lib_name);
+int close_cl();
+
+extern int cl_loaded;
+
+typedef cl_int(*fn_clGetPlatformIDs_t)(cl_uint, cl_platform_id *, cl_uint *);
+typedef cl_int(*fn_clGetPlatformInfo_t)(cl_platform_id, cl_platform_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetDeviceIDs_t)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
+typedef cl_int(*fn_clGetDeviceInfo_t)(cl_device_id, cl_device_info, size_t, void *, size_t *);
+typedef cl_context(*fn_clCreateContext_t)(const cl_context_properties *, cl_uint, const cl_device_id *, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *);
+typedef cl_context(*fn_clCreateContextFromType_t)(const cl_context_properties *, cl_device_type, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *);
+typedef cl_int(*fn_clRetainContext_t)(cl_context);
+typedef cl_int(*fn_clReleaseContext_t)(cl_context);
+typedef cl_int(*fn_clGetContextInfo_t)(cl_context, cl_context_info, size_t, void *, size_t *);
+typedef cl_command_queue(*fn_clCreateCommandQueue_t)(cl_context, cl_device_id, cl_command_queue_properties, cl_int *);
+typedef cl_int(*fn_clRetainCommandQueue_t)(cl_command_queue);
+typedef cl_int(*fn_clReleaseCommandQueue_t)(cl_command_queue);
+typedef cl_int(*fn_clGetCommandQueueInfo_t)(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *);
+typedef cl_mem(*fn_clCreateBuffer_t)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
+typedef cl_mem(*fn_clCreateImage2D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, void *, cl_int *);
+typedef cl_mem(*fn_clCreateImage3D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *);
+typedef cl_int(*fn_clRetainMemObject_t)(cl_mem);
+typedef cl_int(*fn_clReleaseMemObject_t)(cl_mem);
+typedef cl_int(*fn_clGetSupportedImageFormats_t)(cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format *, cl_uint *);
+typedef cl_int(*fn_clGetMemObjectInfo_t)(cl_mem, cl_mem_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetImageInfo_t)(cl_mem, cl_image_info, size_t, void *, size_t *);
+typedef cl_sampler(*fn_clCreateSampler_t)(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *);
+typedef cl_int(*fn_clRetainSampler_t)(cl_sampler);
+typedef cl_int(*fn_clReleaseSampler_t)(cl_sampler);
+typedef cl_int(*fn_clGetSamplerInfo_t)(cl_sampler, cl_sampler_info, size_t, void *, size_t *);
+typedef cl_program(*fn_clCreateProgramWithSource_t)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
+typedef cl_program(*fn_clCreateProgramWithBinary_t)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
+typedef cl_int(*fn_clRetainProgram_t)(cl_program);
+typedef cl_int(*fn_clReleaseProgram_t)(cl_program);
+typedef cl_int(*fn_clBuildProgram_t)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program,void*), void *);
+typedef cl_int(*fn_clUnloadCompiler_t)(void);
+typedef cl_int(*fn_clGetProgramInfo_t)(cl_program, cl_program_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetProgramBuildInfo_t)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
+typedef cl_kernel(*fn_clCreateKernel_t)(cl_program, const char *, cl_int *);
+typedef cl_int(*fn_clCreateKernelsInProgram_t)(cl_program, cl_uint, cl_kernel *, cl_uint *);
+typedef cl_int(*fn_clRetainKernel_t)(cl_kernel);
+typedef cl_int(*fn_clReleaseKernel_t)(cl_kernel);
+typedef cl_int(*fn_clSetKernelArg_t)(cl_kernel, cl_uint, size_t, const void *);
+typedef cl_int(*fn_clGetKernelInfo_t)(cl_kernel, cl_kernel_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clGetKernelWorkGroupInfo_t)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clWaitForEvents_t)(cl_uint, const cl_event *);
+typedef cl_int(*fn_clGetEventInfo_t)(cl_event, cl_event_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clRetainEvent_t)(cl_event);
+typedef cl_int(*fn_clReleaseEvent_t)(cl_event);
+typedef cl_int(*fn_clGetEventProfilingInfo_t)(cl_event, cl_profiling_info, size_t, void *, size_t *);
+typedef cl_int(*fn_clFlush_t)(cl_command_queue);
+typedef cl_int(*fn_clFinish_t)(cl_command_queue);
+typedef cl_int(*fn_clEnqueueReadBuffer_t)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueWriteBuffer_t)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueCopyBuffer_t)(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueReadImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueWriteImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueCopyImage_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueCopyImageToBuffer_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, size_t, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueCopyBufferToImage_t)(cl_command_queue, cl_mem, cl_mem, size_t, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+typedef void*(*fn_clEnqueueMapBuffer_t)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
+typedef void*(*fn_clEnqueueMapImage_t)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, const size_t *, const size_t *, size_t *, size_t *, cl_uint, const cl_event *, cl_event *, cl_int *);
+typedef cl_int(*fn_clEnqueueUnmapMemObject_t)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueNDRangeKernel_t)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueTask_t)(cl_command_queue, cl_kernel, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueNativeKernel_t)(cl_command_queue, void (*user_func)(void *), void *, size_t, cl_uint, const cl_mem *, const void **, cl_uint, const cl_event *, cl_event *);
+typedef cl_int(*fn_clEnqueueMarker_t)(cl_command_queue, cl_event *);
+typedef cl_int(*fn_clEnqueueWaitForEvents_t)(cl_command_queue, cl_uint, const cl_event *);
+typedef cl_int(*fn_clEnqueueBarrier_t)(cl_command_queue);
+typedef void*(*fn_clGetExtensionFunctionAddress_t)(const char *);
+
+typedef struct CL_FUNCTIONS {
+ fn_clGetPlatformIDs_t getPlatformIDs;
+ fn_clGetPlatformInfo_t getPlatformInfo;
+ fn_clGetDeviceIDs_t getDeviceIDs;
+ fn_clGetDeviceInfo_t getDeviceInfo;
+ fn_clCreateContext_t createContext;
+ fn_clCreateContextFromType_t createContextFromType;
+ fn_clRetainContext_t retainContext;
+ fn_clReleaseContext_t releaseContext;
+ fn_clGetContextInfo_t getContextInfo;
+ fn_clCreateCommandQueue_t createCommandQueue;
+ fn_clRetainCommandQueue_t retainCommandQueue;
+ fn_clReleaseCommandQueue_t releaseCommandQueue;
+ fn_clGetCommandQueueInfo_t getCommandQueue;
+ fn_clCreateBuffer_t createBuffer;
+ fn_clCreateImage2D_t createImage2D;
+ fn_clCreateImage3D_t createImage3D;
+ fn_clRetainMemObject_t retainMemObject;
+ fn_clReleaseMemObject_t releaseMemObject;
+ fn_clGetSupportedImageFormats_t getSupportedImageFormats;
+ fn_clGetMemObjectInfo_t getMemObjectInfo;
+ fn_clGetImageInfo_t getImageInfo;
+ fn_clCreateSampler_t createSampler;
+ fn_clRetainSampler_t retainSampler;
+ fn_clReleaseSampler_t releaseSampler;
+ fn_clGetSamplerInfo_t getSamplerInfo;
+ fn_clCreateProgramWithSource_t createProgramWithSource;
+ fn_clCreateProgramWithBinary_t createProgramWithBinary;
+ fn_clRetainProgram_t retainProgram;
+ fn_clReleaseProgram_t releaseProgram;
+ fn_clBuildProgram_t buildProgram;
+ fn_clUnloadCompiler_t unloadCompiler;
+ fn_clGetProgramInfo_t getProgramInfo;
+ fn_clGetProgramBuildInfo_t getProgramBuildInfo;
+ fn_clCreateKernel_t createKernel;
+ fn_clCreateKernelsInProgram_t createKernelsInProgram;
+ fn_clRetainKernel_t retainKernel;
+ fn_clReleaseKernel_t releaseKernel;
+ fn_clSetKernelArg_t setKernelArg;
+ fn_clGetKernelInfo_t getKernelInfo;
+ fn_clGetKernelWorkGroupInfo_t getKernelWorkGroupInfo;
+ fn_clWaitForEvents_t waitForEvents;
+ fn_clGetEventInfo_t getEventInfo;
+ fn_clRetainEvent_t retainEvent;
+ fn_clReleaseEvent_t releaseEvent;
+ fn_clGetEventProfilingInfo_t getEventProfilingInfo;
+ fn_clFlush_t flush;
+ fn_clFinish_t finish;
+ fn_clEnqueueReadBuffer_t enqueueReadBuffer;
+ fn_clEnqueueWriteBuffer_t enqueueWriteBuffer;
+ fn_clEnqueueCopyBuffer_t enqueueCopyBuffer;
+ fn_clEnqueueReadImage_t enqueueReadImage;
+ fn_clEnqueueWriteImage_t enqueueWriteImage;
+ fn_clEnqueueCopyImage_t enqueueCopyImage;
+ fn_clEnqueueCopyImageToBuffer_t enqueueCopyImageToBuffer;
+ fn_clEnqueueCopyBufferToImage_t enqueueCopyBufferToImage;
+ fn_clEnqueueMapBuffer_t enqueueMapBuffer;
+ fn_clEnqueueMapImage_t enqueueMapImage;
+ fn_clEnqueueUnmapMemObject_t enqueueUnmapMemObject;
+ fn_clEnqueueNDRangeKernel_t enqueueNDRangeKernel;
+ fn_clEnqueueTask_t enqueueTask;
+ fn_clEnqueueNativeKernel_t enqueueNativeKernel;
+ fn_clEnqueueMarker_t enqueueMarker;
+ fn_clEnqueueWaitForEvents_t enqueueWaitForEvents;
+ fn_clEnqueueBarrier_t enqueueBarrier;
+ fn_clGetExtensionFunctionAddress_t getExtensionFunctionAddress;
+} CL_FUNCTIONS;
+
+extern CL_FUNCTIONS cl;
+
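+/* Route the standard clXxx names through the dynamically loaded dispatch
+ * table, so code written against the normal OpenCL API compiles unchanged when
+ * the library is loaded at runtime. */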
+#define clGetPlatformIDs cl.getPlatformIDs
+#define clGetPlatformInfo cl.getPlatformInfo
+#define clGetDeviceIDs cl.getDeviceIDs
+#define clGetDeviceInfo cl.getDeviceInfo
+#define clCreateContext cl.createContext
+#define clCreateContextFromType cl.createContextFromType
+#define clRetainContext cl.retainContext
+#define clReleaseContext cl.releaseContext
+#define clGetContextInfo cl.getContextInfo
+#define clCreateCommandQueue cl.createCommandQueue
+#define clRetainCommandQueue cl.retainCommandQueue
+#define clReleaseCommandQueue cl.releaseCommandQueue
+#define clGetCommandQueueInfo cl.getCommandQueue
+#define clCreateBuffer cl.createBuffer
+#define clCreateSubBuffer cl.createSubBuffer
+#define clCreateImage2D cl.createImage2D
+#define clCreateImage3D cl.createImage3D
+#define clRetainMemObject cl.retainMemObject
+#define clReleaseMemObject cl.releaseMemObject
+#define clGetSupportedImageFormats cl.getSupportedImageFormats
+#define clGetMemObjectInfo cl.getMemObjectInfo
+#define clGetImageInfo cl.getImageInfo
+#define clSetMemObjectDestructorCallback cl.setMemObjectDestructorCallback
+#define clCreateSampler cl.createSampler
+#define clRetainSampler cl.retainSampler
+#define clReleaseSampler cl.releaseSampler
+#define clGetSamplerInfo cl.getSamplerInfo
+#define clCreateProgramWithSource cl.createProgramWithSource
+#define clCreateProgramWithBinary cl.createProgramWithBinary
+#define clRetainProgram cl.retainProgram
+#define clReleaseProgram cl.releaseProgram
+#define clBuildProgram cl.buildProgram
+#define clUnloadCompiler cl.unloadCompiler
+#define clGetProgramInfo cl.getProgramInfo
+#define clGetProgramBuildInfo cl.getProgramBuildInfo
+#define clCreateKernel cl.createKernel
+#define clCreateKernelsInProgram cl.createKernelsInProgram
+#define clRetainKernel cl.retainKernel
+#define clReleaseKernel cl.releaseKernel
+#define clSetKernelArg cl.setKernelArg
+#define clGetKernelInfo cl.getKernelInfo
+#define clGetKernelWorkGroupInfo cl.getKernelWorkGroupInfo
+#define clWaitForEvents cl.waitForEvents
+#define clGetEventInfo cl.getEventInfo
+#define clCreateUserEvent cl.createUserEvent
+#define clRetainEvent cl.retainEvent
+#define clReleaseEvent cl.releaseEvent
+#define clSetUserEventStatus cl.setUserEventStatus
+#define clSetEventCallback cl.setEventCallback
+#define clGetEventProfilingInfo cl.getEventProfilingInfo
+#define clFlush cl.flush
+#define clFinish cl.finish
+#define clEnqueueReadBuffer cl.enqueueReadBuffer
+#define clEnqueueReadBufferRect cl.enqueueReadBufferRect
+#define clEnqueueWriteBuffer cl.enqueueWriteBuffer
+#define clEnqueueWriteBufferRect cl.enqueueWriteBufferRect
+#define clEnqueueCopyBuffer cl.enqueueCopyBuffer
+#define clEnqueueCopyBufferRect cl.enqueueCopyBufferRect
+#define clEnqueueReadImage cl.enqueueReadImage
+#define clEnqueueWriteImage cl.enqueueWriteImage
+#define clEnqueueCopyImage cl.enqueueCopyImage
+#define clEnqueueCopyImageToBuffer cl.enqueueCopyImageToBuffer
+#define clEnqueueCopyBufferToImage cl.enqueueCopyBufferToImage
+#define clEnqueueMapBuffer cl.enqueueMapBuffer
+#define clEnqueueMapImage cl.enqueueMapImage
+#define clEnqueueUnmapMemObject cl.enqueueUnmapMemObject
+#define clEnqueueNDRangeKernel cl.enqueueNDRangeKernel
+#define clEnqueueTask cl.enqueueTask
+#define clEnqueueNativeKernel cl.enqueueNativeKernel
+#define clEnqueueMarker cl.enqueueMarker
+#define clEnqueueWaitForEvents cl.enqueueWaitForEvents
+#define clEnqueueBarrier cl.enqueueBarrier
+#define clGetExtensionFunctionAddress cl.getExtensionFunctionAddress
+
+/* Resolve one OpenCL entry point into the dispatch table; on failure, unload
+ * the library and report that no usable platform is available. */
+#define CL_LOAD_FN(name, ref) \
+    do { \
+        ref = dlsym(dll, name); \
+        if (ref == NULL){ \
+            dlclose(dll); \
+            return CL_INVALID_PLATFORM; \
+        } \
+    } while (0)
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DYNAMIC_CL_H */
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+//ACW: Remove me after debugging.
+#include <stdio.h>
+#include <string.h>
+
+#include "vp8_opencl.h"
+#include "filter_cl.h"
+#include "../blockd.h"
+
+#define SIXTAP_FILTER_LEN 6
+
+const char *filterCompileOptions = "-Ivp8/common/opencl -DVP8_FILTER_WEIGHT=128 -DVP8_FILTER_SHIFT=7 -DFILTER_OFFSET";
+const char *filter_cl_file_name = "vp8/common/opencl/filter_cl.cl";
+
+#define STATIC_MEM 1
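+/* With STATIC_MEM, the intermediate buffer shared by the two filter passes is
+ * allocated once in cl_init_filter() and reused, instead of being created and
+ * released on every call. */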
+#if STATIC_MEM
+static cl_mem int_mem = NULL;
+#endif
+
+void cl_destroy_filter(){
+
+ if (cl_data.filter_program)
+ clReleaseProgram(cl_data.filter_program);
+
+ //VP8_CL_RELEASE_KERNEL(cl_data.vp8_block_variation_kernel);
+#if !TWO_PASS_SIXTAP
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x8_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x4_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict16x16_kernel);
+#else
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_first_pass_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_second_pass_kernel);
+#endif
+ //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict4x4_kernel);
+ //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x4_kernel);
+ //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x8_kernel);
+ //VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict16x16_kernel);
+
+#if MEM_COPY_KERNEL
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_memcpy_kernel);
+#endif
+
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_first_pass_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_second_pass_kernel);
+
+#if STATIC_MEM
+ if (int_mem != NULL)
+ clReleaseMemObject(int_mem);
+ int_mem = NULL;
+#endif
+
+ cl_data.filter_program = NULL;
+}
+
+int cl_init_filter() {
+ int err;
+
+
+ // Create the filter compute program from the file-defined source code
+ if ( cl_load_program(&cl_data.filter_program, filter_cl_file_name,
+ filterCompileOptions) != CL_SUCCESS )
+ return VP8_CL_TRIED_BUT_FAILED;
+
+ // Create the compute kernel in the program we wish to run
+#if TWO_PASS_SIXTAP
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_first_pass_kernel,"vp8_filter_block2d_first_pass_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_second_pass_kernel,"vp8_filter_block2d_second_pass_kernel");
+ VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_first_pass_kernel,vp8_filter_block2d_first_pass_kernel_size);
+ VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_second_pass_kernel,vp8_filter_block2d_second_pass_kernel_size);
+#else
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict_kernel,"vp8_sixtap_predict_kernel");
+ VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict_kernel,vp8_sixtap_predict_kernel_size);
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x8_kernel,"vp8_sixtap_predict8x8_kernel");
+ VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x8_kernel,vp8_sixtap_predict8x8_kernel_size);
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x4_kernel,"vp8_sixtap_predict8x4_kernel");
+ VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x4_kernel,vp8_sixtap_predict8x4_kernel_size);
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict16x16_kernel,"vp8_sixtap_predict16x16_kernel");
+ VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict16x16_kernel,vp8_sixtap_predict16x16_kernel_size);
+#endif
+
+ //VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_first_pass_kernel,vp8_filter_block2d_bil_first_pass_kernel_size);
+ //VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_second_pass_kernel,vp8_filter_block2d_bil_second_pass_kernel_size);
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_first_pass_kernel,"vp8_filter_block2d_bil_first_pass_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_second_pass_kernel,"vp8_filter_block2d_bil_second_pass_kernel");
+
+
+ //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict4x4_kernel,"vp8_bilinear_predict4x4_kernel");
+ //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x4_kernel,"vp8_bilinear_predict8x4_kernel");
+ //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x8_kernel,"vp8_bilinear_predict8x8_kernel");
+ //VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict16x16_kernel,"vp8_bilinear_predict16x16_kernel");
+
+#if MEM_COPY_KERNEL
+ VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_memcpy_kernel,"vp8_memcpy_kernel");
+ VP8_CL_CALC_LOCAL_SIZE(vp8_memcpy_kernel,vp8_memcpy_kernel_size);
+#endif
+
+#if STATIC_MEM
+ VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,err);
+#endif
+
+ return CL_SUCCESS;
+}
+
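+/* First half of the two-pass 6-tap path: filter horizontally (sub-pixel
+ * position chosen by xoffset) from src_mem into the intermediate buffer. The
+ * work-group size is clamped so it never exceeds the total work size. */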
+void vp8_filter_block2d_first_pass_cl(
+ cl_command_queue cq,
+ cl_mem src_mem,
+ int src_offset,
+ cl_mem int_mem,
+ unsigned int src_pixels_per_line,
+ unsigned int int_height,
+ unsigned int int_width,
+ int xoffset
+){
+ int err;
+ size_t global = int_width*int_height;
+ size_t local = cl_data.vp8_filter_block2d_first_pass_kernel_size;
+ if (local > global)
+ local = global;
+
+ err = clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 0, sizeof (cl_mem), &src_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 1, sizeof (int), &src_offset);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 2, sizeof (cl_mem), &int_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 3, sizeof (cl_uint), &src_pixels_per_line);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 4, sizeof (cl_uint), &int_height);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 5, sizeof (cl_int), &int_width);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 6, sizeof (int), &xoffset);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ ,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_first_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+}
+
+void vp8_filter_block2d_second_pass_cl(
+ cl_command_queue cq,
+ cl_mem int_mem,
+ int int_offset,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch,
+ unsigned int output_height,
+ unsigned int output_width,
+ int yoffset
+){
+ int err;
+ size_t global = output_width*output_height;
+ size_t local = cl_data.vp8_filter_block2d_second_pass_kernel_size;
+ if (local > global){
+ //printf("Local is now %ld\n",global);
+ local = global;
+ }
+
+ /* Set kernel arguments */
+ err = clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 0, sizeof (cl_mem), &int_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 1, sizeof (int), &int_offset);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 2, sizeof (cl_mem), &dst_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 3, sizeof (int), &dst_offset);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 4, sizeof (int), &dst_pitch);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 5, sizeof (int), &output_width);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 6, sizeof (int), &output_width);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 7, sizeof (int), &output_height);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 8, sizeof (int), &output_width);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 9, sizeof (int), &yoffset);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ ,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_second_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+}
+
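+/* Single-kernel 6-tap path (TWO_PASS_SIXTAP disabled): one kernel performs the
+ * whole prediction. If the caller passes NULL cl_mem handles, temporary device
+ * buffers are created from the host pointers and the destination is read back
+ * (and the temporary buffers released) before returning. */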
+void vp8_sixtap_single_pass(
+ cl_command_queue cq,
+ cl_kernel kernel,
+ size_t local,
+ size_t global,
+ cl_mem src_mem,
+ cl_mem dst_mem,
+ unsigned char *src_base,
+ int src_offset,
+ size_t src_len,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ int dst_offset,
+ int dst_pitch,
+ size_t dst_len
+){
+ int err;
+
+#if !STATIC_MEM
+ cl_mem int_mem;
+#endif
+
+ int free_src = 0, free_dst = 0;
+
+ if (local > global){
+ local = global;
+ }
+
+ /* Make space for kernel input/output data.
+ * Initialize the buffer as well if needed.
+ */
+ if (src_mem == NULL){
+ VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
+ src_offset = 2;
+ free_src = 1;
+ } else {
+ src_offset -= 2*src_pixels_per_line;
+ }
+
+ if (dst_mem == NULL){
+ VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
+ free_dst = 1;
+ }
+
+#if !STATIC_MEM
+ CL_CREATE_BUF( cq, int_mem,, sizeof(cl_int)*FData_height*FData_width, NULL,, );
+#endif
+
+ err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &src_mem);
+ err |= clSetKernelArg(kernel, 1, sizeof (int), &src_offset);
+ err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &src_pixels_per_line);
+ err |= clSetKernelArg(kernel, 3, sizeof (cl_int), &xoffset);
+ err |= clSetKernelArg(kernel, 4, sizeof (cl_int), &yoffset);
+ err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &dst_mem);
+ err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &dst_offset);
+ err |= clSetKernelArg(kernel, 7, sizeof (int), &dst_pitch);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ ,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( cq, kernel, 1, NULL, &global, &local , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+
+ if (free_src == 1)
+ clReleaseMemObject(src_mem);
+
+ if (free_dst == 1){
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",
+ ,
+ );
+ clReleaseMemObject(dst_mem);
+ }
+}
+
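+/* Two-pass driver: upload the source if the caller did not supply a cl_mem,
+ * run the first- and second-pass kernels, then read the destination back when
+ * this function created the buffer itself. */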
+void vp8_sixtap_run_cl(
+ cl_command_queue cq,
+ cl_mem src_mem,
+ cl_mem dst_mem,
+ unsigned char *src_base,
+ int src_offset,
+ size_t src_len,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ int dst_offset,
+ int dst_pitch,
+ size_t dst_len,
+ unsigned int FData_height,
+ unsigned int FData_width,
+ unsigned int output_height,
+ unsigned int output_width,
+ int int_offset
+)
+{
+ int err;
+
+#if !STATIC_MEM
+ cl_mem int_mem;
+#endif
+
+ int free_src = 0, free_dst = 0;
+
+ /* Make space for kernel input/output data.
+ * Initialize the buffer as well if needed.
+ */
+ if (src_mem == NULL){
+ VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
+ src_offset = 2;
+ free_src = 1;
+ } else {
+ src_offset -= 2*src_pixels_per_line;
+ }
+
+ if (dst_mem == NULL){
+ VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
+ free_dst = 1;
+ }
+
+#if !STATIC_MEM
+ VP8_CL_CREATE_BUF( cq, int_mem,, sizeof(cl_int)*FData_height*FData_width, NULL,, );
+#endif
+
+ vp8_filter_block2d_first_pass_cl(
+ cq, src_mem, src_offset, int_mem, src_pixels_per_line,
+ FData_height, FData_width, xoffset
+ );
+
+ vp8_filter_block2d_second_pass_cl(cq,int_mem,int_offset,dst_mem,dst_offset,dst_pitch,
+ output_height,output_width,yoffset);
+
+ if (free_src == 1)
+ clReleaseMemObject(src_mem);
+
+ if (free_dst == 1){
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",
+ ,
+ );
+ clReleaseMemObject(dst_mem);
+ }
+
+#if !STATIC_MEM
+ clReleaseMemObject(int_mem);
+#endif
+}
+
+void vp8_sixtap_predict4x4_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+
+ int output_width=4, output_height=4, FData_height=9, FData_width=4;
+
+    //Sizes of source/destination data to transfer
+ int dst_len = DST_LEN(dst_pitch,output_height,output_width);
+ int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+ int int_offset = 8;
+ unsigned char *src_ptr = src_base + src_offset;
+
+ vp8_sixtap_run_cl(cq, src_mem, dst_mem,
+ (src_ptr-2*src_pixels_per_line),src_offset, src_len,
+ src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
+ dst_pitch,dst_len,FData_height,FData_width,output_height,
+ output_width,int_offset
+ );
+#else
+ vp8_sixtap_single_pass(
+ cq,
+ cl_data.vp8_sixtap_predict_kernel,
+ cl_data.vp8_sixtap_predict_kernel_size,
+ FData_height*FData_width,
+ src_mem,
+ dst_mem,
+ src_base,
+ src_offset,
+ src_len,
+ src_pixels_per_line,
+ xoffset,
+ yoffset,
+ dst_base,
+ dst_offset,
+ dst_pitch,
+ dst_len
+ );
+#endif
+
+
+ return;
+}
+
+void vp8_sixtap_predict8x8_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+ int output_width=8, output_height=8, FData_height=13, FData_width=8;
+
+    //Sizes of source/destination data to transfer
+ int dst_len = DST_LEN(dst_pitch,output_height,output_width);
+ int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+ int int_offset = 16;
+ unsigned char *src_ptr = src_base + src_offset;
+
+ vp8_sixtap_run_cl(cq, src_mem, dst_mem,
+ (src_ptr-2*src_pixels_per_line),src_offset, src_len,
+ src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
+ dst_pitch,dst_len,FData_height,FData_width,output_height,
+ output_width,int_offset
+ );
+#else
+ vp8_sixtap_single_pass(
+ cq,
+ cl_data.vp8_sixtap_predict8x8_kernel,
+ cl_data.vp8_sixtap_predict8x8_kernel_size,
+ FData_height*FData_width,
+ src_mem,
+ dst_mem,
+ src_base,
+ src_offset,
+ src_len,
+ src_pixels_per_line,
+ xoffset,
+ yoffset,
+ dst_base,
+ dst_offset,
+ dst_pitch,
+ dst_len
+ );
+#endif
+
+ return;
+}
+
+void vp8_sixtap_predict8x4_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+
+ int output_width=8, output_height=4, FData_height=9, FData_width=8;
+
+    //Sizes of source/destination data to transfer
+ int dst_len = DST_LEN(dst_pitch,output_height,output_width);
+ int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+ int int_offset = 16;
+ unsigned char *src_ptr = src_base + src_offset;
+
+ vp8_sixtap_run_cl(cq, src_mem, dst_mem,
+ (src_ptr-2*src_pixels_per_line),src_offset, src_len,
+ src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
+ dst_pitch,dst_len,FData_height,FData_width,output_height,
+ output_width,int_offset
+ );
+#else
+ vp8_sixtap_single_pass(
+ cq,
+ cl_data.vp8_sixtap_predict8x4_kernel,
+ cl_data.vp8_sixtap_predict8x4_kernel_size,
+ FData_height*FData_width,
+ src_mem,
+ dst_mem,
+ src_base,
+ src_offset,
+ src_len,
+ src_pixels_per_line,
+ xoffset,
+ yoffset,
+ dst_base,
+ dst_offset,
+ dst_pitch,
+ dst_len
+ );
+#endif
+
+ return;
+}
+
+void vp8_sixtap_predict16x16_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+
+ int output_width=16, output_height=16, FData_height=21, FData_width=16;
+
+    //Sizes of source/destination data to transfer
+ int dst_len = DST_LEN(dst_pitch,output_height,output_width);
+ int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
+
+#if TWO_PASS_SIXTAP
+ int int_offset = 32;
+ unsigned char *src_ptr = src_base + src_offset;
+
+ vp8_sixtap_run_cl(cq, src_mem, dst_mem,
+ (src_ptr-2*src_pixels_per_line),src_offset, src_len,
+ src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
+ dst_pitch,dst_len,FData_height,FData_width,output_height,
+ output_width,int_offset
+ );
+#else
+ vp8_sixtap_single_pass(
+ cq,
+ cl_data.vp8_sixtap_predict16x16_kernel,
+ cl_data.vp8_sixtap_predict16x16_kernel_size,
+ FData_height*FData_width,
+ src_mem,
+ dst_mem,
+ src_base,
+ src_offset,
+ src_len,
+ src_pixels_per_line,
+ xoffset,
+ yoffset,
+ dst_base,
+ dst_offset,
+ dst_pitch,
+ dst_len
+ );
+#endif
+
+ return;
+
+}
+
+
+
+void vp8_filter_block2d_bil_first_pass_cl(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ cl_mem int_mem,
+ int src_pixels_per_line,
+ int height,
+ int width,
+ int xoffset
+)
+{
+ int err;
+ size_t global = width*height;
+ int free_src = 0;
+
+ if (src_mem == NULL){
+ int src_len = BIL_SRC_LEN(width,height,src_pixels_per_line);
+
+ /*Make space for kernel input/output data. Initialize the buffer as well if needed. */
+ VP8_CL_CREATE_BUF(cq, src_mem, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
+ sizeof (unsigned char) * src_len, src_base+src_offset,,
+ );
+ src_offset = 0; //Set to zero as long as src_mem starts at base+offset
+ free_src = 1;
+ }
+
+ err = clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 0, sizeof (cl_mem), &src_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 1, sizeof (int), &src_offset);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 2, sizeof (cl_mem), &int_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 3, sizeof (int), &src_pixels_per_line);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 4, sizeof (int), &height);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 5, sizeof (int), &width);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 6, sizeof (int), &xoffset);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ ,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_bil_first_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+
+ if (free_src == 1)
+ clReleaseMemObject(src_mem);
+}
+
+
+void vp8_filter_block2d_bil_second_pass_cl(
+ cl_command_queue cq,
+ cl_mem int_mem,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch,
+ int height,
+ int width,
+ int yoffset
+)
+{
+ int err;
+ size_t global = width*height;
+
+ //Size of output data
+ int dst_len = DST_LEN(dst_pitch,height,width);
+
+ int free_dst = 0;
+ if (dst_mem == NULL){
+ VP8_CL_CREATE_BUF(cq, dst_mem, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,
+ sizeof (unsigned char) * dst_len + dst_offset, dst_base,,
+ );
+ free_dst = 1;
+ }
+
+ err = clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 0, sizeof (cl_mem), &int_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 1, sizeof (cl_mem), &dst_mem);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 2, sizeof (int), &dst_offset);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 3, sizeof (int), &dst_pitch);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 4, sizeof (int), &height);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 5, sizeof (int), &width);
+ err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 6, sizeof (int), &yoffset);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ ,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_bil_second_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+
+ if (free_dst == 1){
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",
+ ,
+ );
+ clReleaseMemObject(dst_mem);
+ }
+
+}
+
+void vp8_bilinear_predict4x4_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+
+ const int height = 4, width = 4;
+
+#if !STATIC_MEM
+ int err;
+ cl_mem int_mem = NULL;
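+    //Allocate the largest intermediate buffer any predictor needs (21*16 ints, the 16x16 six-tap case)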
+ VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+
+ /* then 1-D vertically... */
+ vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+
+#if !STATIC_MEM
+ clReleaseMemObject(int_mem);
+#endif
+
+}
+
+void vp8_bilinear_predict8x8_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+
+ const int height = 8, width = 8;
+
+#if !STATIC_MEM
+ int err;
+ cl_mem int_mem = NULL;
+ VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+
+ /* then 1-D vertically... */
+ vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+
+#if !STATIC_MEM
+ clReleaseMemObject(int_mem);
+#endif
+
+}
+
+void vp8_bilinear_predict8x4_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+
+ const int height = 4, width = 8;
+
+#if !STATIC_MEM
+ int err;
+ cl_mem int_mem = NULL;
+ VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+
+ /* then 1-D vertically... */
+ vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+
+#if !STATIC_MEM
+ clReleaseMemObject(int_mem);
+#endif
+
+}
+
+void vp8_bilinear_predict16x16_cl
+(
+ cl_command_queue cq,
+ unsigned char *src_base,
+ cl_mem src_mem,
+ int src_offset,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ unsigned char *dst_base,
+ cl_mem dst_mem,
+ int dst_offset,
+ int dst_pitch
+) {
+
+ const int height = 16, width = 16;
+
+#if !STATIC_MEM
+ int err;
+ cl_mem int_mem = NULL;
+ VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
+#endif
+
+ /* First filter 1-D horizontally... */
+ vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
+
+ /* then 1-D vertically... */
+ vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
+
+#if !STATIC_MEM
+ clReleaseMemObject(int_mem);
+#endif
+
+}
--- /dev/null
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\r
+#pragma OPENCL EXTENSION cl_amd_printf : enable\r
+\r
+__constant int bilinear_filters[8][2] = {\r
+ { 128, 0},\r
+ { 112, 16},\r
+ { 96, 32},\r
+ { 80, 48},\r
+ { 64, 64},\r
+ { 48, 80},\r
+ { 32, 96},\r
+ { 16, 112}\r
+};\r
+\r
+__constant short sub_pel_filters[8][8] = {\r
+ //These were originally 8x6, but are padded for vector ops\r
+ { 0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */\r
+ { 0, -6, 123, 12, -1, 0, 0, 0},\r
+ { 2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */\r
+ { 0, -9, 93, 50, -6, 0, 0, 0},\r
+ { 3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */\r
+ { 0, -6, 50, 93, -9, 0, 0, 0},\r
+ { 1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */\r
+ { 0, -1, 12, 123, -6, 0, 0, 0},\r
+};\r
+\r
+\r
+kernel void vp8_filter_block2d_first_pass_kernel(\r
+ __global unsigned char *src_base,\r
+ int src_offset,\r
+ __global int *output_ptr,\r
+ unsigned int src_pixels_per_line,\r
+ unsigned int output_height,\r
+ unsigned int output_width,\r
+ int filter_offset\r
+){\r
+ uint tid = get_global_id(0);\r
+\r
+ global unsigned char *src_ptr = &src_base[src_offset];\r
+    //Note that src_offset is reassigned below, which is why we capture it in src_ptr now\r
+\r
+ int Temp;\r
+\r
+ __constant short *vp8_filter = sub_pel_filters[filter_offset];\r
+\r
+ if (tid < (output_width*output_height)){\r
+ src_offset = tid + (tid/output_width * (src_pixels_per_line - output_width));\r
+\r
+ Temp = (int)(src_ptr[src_offset - 2] * vp8_filter[0]) +\r
+ (int)(src_ptr[src_offset - 1] * vp8_filter[1]) +\r
+ (int)(src_ptr[src_offset] * vp8_filter[2]) +\r
+ (int)(src_ptr[src_offset + 1] * vp8_filter[3]) +\r
+ (int)(src_ptr[src_offset + 2] * vp8_filter[4]) +\r
+ (int)(src_ptr[src_offset + 3] * vp8_filter[5]) +\r
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */\r
+\r
+ /* Normalize back to 0-255 */\r
+ Temp = Temp >> VP8_FILTER_SHIFT;\r
+\r
+ if (Temp < 0)\r
+ Temp = 0;\r
+ else if ( Temp > 255 )\r
+ Temp = 255;\r
+\r
+ output_ptr[tid] = Temp;\r
+ }\r
+\r
+}\r
+\r
+kernel void vp8_filter_block2d_second_pass_kernel\r
+(\r
+ __global int *src_base,\r
+ int src_offset,\r
+ __global unsigned char *output_base,\r
+ int output_offset,\r
+ int output_pitch,\r
+ unsigned int src_pixels_per_line,\r
+ unsigned int pixel_step,\r
+ unsigned int output_height,\r
+ unsigned int output_width,\r
+ int filter_offset\r
+) {\r
+\r
+ uint i = get_global_id(0);\r
+\r
+ global int *src_ptr = &src_base[src_offset];\r
+ global unsigned char *output_ptr = &output_base[output_offset];\r
+\r
+ int out_offset; //Not same as output_offset...\r
+ int Temp;\r
+ int PS2 = 2*(int)pixel_step;\r
+ int PS3 = 3*(int)pixel_step;\r
+\r
+ unsigned int src_increment = src_pixels_per_line - output_width;\r
+\r
+ __constant short *vp8_filter = sub_pel_filters[filter_offset];\r
+\r
+ if (i < (output_width * output_height)){\r
+ out_offset = i/output_width;\r
+ src_offset = out_offset;\r
+\r
+ src_offset = i + (src_offset * src_increment);\r
+ out_offset = i%output_width + (out_offset * output_pitch);\r
+\r
+ /* Apply filter */\r
+ Temp = ((int)src_ptr[src_offset - PS2] * vp8_filter[0]) +\r
+ ((int)src_ptr[src_offset -(int)pixel_step] * vp8_filter[1]) +\r
+ ((int)src_ptr[src_offset] * vp8_filter[2]) +\r
+ ((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) +\r
+ ((int)src_ptr[src_offset + PS2] * vp8_filter[4]) +\r
+ ((int)src_ptr[src_offset + PS3] * vp8_filter[5]) +\r
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */\r
+\r
+ /* Normalize back to 0-255 */\r
+ Temp = Temp >> VP8_FILTER_SHIFT;\r
+ if (Temp < 0)\r
+ Temp = 0;\r
+ else if (Temp > 255)\r
+ Temp = 255;\r
+\r
+ output_ptr[out_offset] = (unsigned char)Temp;\r
+ }\r
+}\r
+\r
+\r
+kernel void vp8_filter_block2d_bil_first_pass_kernel(\r
+ __global unsigned char *src_base,\r
+ int src_offset,\r
+ __global int *output_ptr,\r
+ unsigned int src_pixels_per_line,\r
+ unsigned int output_height,\r
+ unsigned int output_width,\r
+ int filter_offset\r
+)\r
+{\r
+ uint tid = get_global_id(0);\r
+\r
+ if (tid < output_width * output_height){\r
+ global unsigned char *src_ptr = &src_base[src_offset];\r
+\r
+ unsigned int i, j;\r
+ __constant int *vp8_filter = bilinear_filters[filter_offset];\r
+\r
+ unsigned int out_row,out_offset;\r
+ int src_increment = src_pixels_per_line - output_width;\r
+\r
+ i = tid / output_width;\r
+ j = tid % output_width;\r
+\r
+ src_offset = i*(output_width+src_increment) + j;\r
+ out_row = output_width * i;\r
+\r
+ out_offset = out_row + j;\r
+\r
+ /* Apply bilinear filter */\r
+ output_ptr[out_offset] = (((int)src_ptr[src_offset] * vp8_filter[0]) +\r
+ ((int)src_ptr[src_offset+1] * vp8_filter[1]) +\r
+ (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;\r
+ }\r
+}\r
+\r
+kernel void vp8_filter_block2d_bil_second_pass_kernel\r
+(\r
+ __global int *src_ptr,\r
+ __global unsigned char *output_base,\r
+ int output_offset,\r
+ int output_pitch,\r
+ unsigned int output_height,\r
+ unsigned int output_width,\r
+ int filter_offset\r
+)\r
+{\r
+\r
+ uint tid = get_global_id(0);\r
+\r
+ if (tid < output_width * output_height){\r
+ global unsigned char *output_ptr = &output_base[output_offset];\r
+\r
+ unsigned int i, j;\r
+ int Temp;\r
+ __constant int *vp8_filter = bilinear_filters[filter_offset];\r
+\r
+ int out_offset;\r
+ int src_offset;\r
+\r
+ i = tid / output_width;\r
+ j = tid % output_width;\r
+\r
+ src_offset = i*(output_width) + j;\r
+ out_offset = i*output_pitch + j;\r
+\r
+ /* Apply filter */\r
+ Temp = ((int)src_ptr[src_offset] * vp8_filter[0]) +\r
+ ((int)src_ptr[src_offset+output_width] * vp8_filter[1]) +\r
+ (VP8_FILTER_WEIGHT / 2);\r
+\r
+        output_ptr[out_offset] = (unsigned char)(Temp >> VP8_FILTER_SHIFT);\r
+ }\r
+}\r
+\r
+\r
+\r
+\r
+//Called from reconinter_cl.c\r
+kernel void vp8_memcpy_kernel(\r
+ global unsigned char *src_base,\r
+ int src_offset,\r
+ int src_stride,\r
+ global unsigned char *dst_base,\r
+ int dst_offset,\r
+ int dst_stride,\r
+ int num_bytes,\r
+ int num_iter\r
+){\r
+\r
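+    //2-D launch: global dimension 0 indexes the byte within a row, dimension 1 the row\r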
+ int i,r;\r
+ global unsigned char *src = &src_base[src_offset];\r
+ global unsigned char *dst = &dst_base[dst_offset];\r
+ src_offset = dst_offset = 0;\r
+\r
+ r = get_global_id(1);\r
+ if (r < get_global_size(1)){\r
+ i = get_global_id(0);\r
+ if (i < get_global_size(0)){\r
+ src_offset = r*src_stride + i;\r
+ dst_offset = r*dst_stride + i;\r
+ dst[dst_offset] = src[src_offset];\r
+ }\r
+ }\r
+}\r
+\r
+//Not used currently.\r
+void vp8_memset_short(\r
+ global short *mem,\r
+ int offset,\r
+ short newval,\r
+ unsigned int size\r
+)\r
+{\r
+ int tid = get_global_id(0);\r
+\r
+    if (tid < (size/2)){ //size is in bytes; one work-item per short\r
+        mem[offset+tid] = newval;\r
+    }\r
+}\r
+\r
+\r
+\r
+__kernel void vp8_bilinear_predict4x4_kernel\r
+(\r
+ __global unsigned char *src_base,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_base,\r
+ int dst_offset,\r
+ int dst_pitch,\r
+ __global int *int_mem\r
+)\r
+{\r
+ int Height = 4, Width = 4;\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);\r
+\r
+ /* then 1-D vertically... */\r
+ vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);\r
+}\r
+\r
+__kernel void vp8_bilinear_predict8x8_kernel\r
+(\r
+ __global unsigned char *src_base,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_base,\r
+ int dst_offset,\r
+ int dst_pitch,\r
+ __global int *int_mem\r
+)\r
+{\r
+ int Height = 8, Width = 8;\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);\r
+\r
+ /* then 1-D vertically... */\r
+ vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);\r
+\r
+}\r
+\r
+__kernel void vp8_bilinear_predict8x4_kernel\r
+(\r
+ __global unsigned char *src_base,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_base,\r
+ int dst_offset,\r
+ int dst_pitch,\r
+ __global int *int_mem\r
+)\r
+{\r
+ int Height = 4, Width = 8;\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);\r
+\r
+ /* then 1-D vertically... */\r
+ vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);\r
+}\r
+\r
+__kernel void vp8_bilinear_predict16x16_kernel\r
+(\r
+ __global unsigned char *src_base,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_base,\r
+ int dst_offset,\r
+ int dst_pitch,\r
+ __global int *int_mem\r
+)\r
+{\r
+\r
+ int Height = 16, Width = 16;\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);\r
+\r
+ /* then 1-D vertically... */\r
+ vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);\r
+\r
+}\r
+\r
+void vp8_filter_block2d_first_pass(\r
+ global unsigned char *src_base,\r
+ int src_offset,\r
+ local int *output_ptr,\r
+ unsigned int src_pixels_per_line,\r
+ unsigned int pixel_step,\r
+ unsigned int output_height,\r
+ unsigned int output_width,\r
+ int filter_offset\r
+){\r
+ uint tid = get_global_id(0);\r
+ uint i = tid;\r
+\r
+ int nthreads = get_global_size(0);\r
+ int ngroups = nthreads / get_local_size(0);\r
+\r
+ global unsigned char *src_ptr = &src_base[src_offset];\r
+ //Note that src_offset will be reset later, which is why we capture it now\r
+\r
+ int Temp;\r
+\r
+ __constant short *vp8_filter = sub_pel_filters[filter_offset];\r
+\r
+ if (tid < (output_width*output_height)){\r
+ short filter0 = vp8_filter[0];\r
+ short filter1 = vp8_filter[1];\r
+ short filter2 = vp8_filter[2];\r
+ short filter3 = vp8_filter[3];\r
+ short filter4 = vp8_filter[4];\r
+ short filter5 = vp8_filter[5];\r
+\r
+ if (ngroups > 1){\r
+ //This is generally only true on Apple CPU-CL, which gives a group\r
+ //size of 1, regardless of the CPU core count.\r
+ for (i=0; i < output_width*output_height; i++){\r
+ src_offset = i + (i/output_width * (src_pixels_per_line - output_width));\r
+\r
+ Temp = (int)(src_ptr[src_offset - 2] * filter0) +\r
+ (int)(src_ptr[src_offset - 1] * filter1) +\r
+ (int)(src_ptr[src_offset] * filter2) +\r
+ (int)(src_ptr[src_offset + 1] * filter3) +\r
+ (int)(src_ptr[src_offset + 2] * filter4) +\r
+ (int)(src_ptr[src_offset + 3] * filter5) +\r
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */\r
+\r
+ /* Normalize back to 0-255 */\r
+ Temp >>= VP8_FILTER_SHIFT;\r
+\r
+ if (Temp < 0)\r
+ Temp = 0;\r
+ else if ( Temp > 255 )\r
+ Temp = 255;\r
+\r
+ output_ptr[i] = Temp;\r
+ }\r
+ } else {\r
+ src_offset = i + (i/output_width * (src_pixels_per_line - output_width));\r
+\r
+ Temp = (int)(src_ptr[src_offset - 2] * filter0) +\r
+ (int)(src_ptr[src_offset - 1] * filter1) +\r
+ (int)(src_ptr[src_offset] * filter2) +\r
+ (int)(src_ptr[src_offset + 1] * filter3) +\r
+ (int)(src_ptr[src_offset + 2] * filter4) +\r
+ (int)(src_ptr[src_offset + 3] * filter5) +\r
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */\r
+\r
+ /* Normalize back to 0-255 */\r
+ Temp >>= VP8_FILTER_SHIFT;\r
+\r
+ if (Temp < 0)\r
+ Temp = 0;\r
+ else if ( Temp > 255 )\r
+ Temp = 255;\r
+\r
+ output_ptr[i] = Temp;\r
+ }\r
+ }\r
+\r
+    //Barrier: ensure all first-pass writes to local memory are visible before any work-item starts the second pass.\r
+ barrier(CLK_LOCAL_MEM_FENCE);\r
+}\r
+\r
+void vp8_filter_block2d_second_pass\r
+(\r
+ local int *src_ptr,\r
+ global unsigned char *output_base,\r
+ int output_offset,\r
+ int output_pitch,\r
+ unsigned int src_pixels_per_line,\r
+ unsigned int pixel_step,\r
+ unsigned int output_height,\r
+ unsigned int output_width,\r
+ int filter_offset\r
+) {\r
+\r
+ global unsigned char *output_ptr = &output_base[output_offset];\r
+\r
+ int out_offset; //Not same as output_offset...\r
+ int src_offset;\r
+ int Temp;\r
+ int PS2 = 2*(int)pixel_step;\r
+ int PS3 = 3*(int)pixel_step;\r
+\r
+ unsigned int src_increment = src_pixels_per_line - output_width;\r
+\r
+ uint i = get_global_id(0);\r
+\r
+ __constant short *vp8_filter = sub_pel_filters[filter_offset];\r
+\r
+ if (i < (output_width * output_height)){\r
+ out_offset = i/output_width;\r
+ src_offset = out_offset;\r
+\r
+ src_offset = i + (src_offset * src_increment);\r
+ out_offset = i%output_width + (out_offset * output_pitch);\r
+\r
+ /* Apply filter */\r
+ Temp = ((int)src_ptr[src_offset - PS2] * vp8_filter[0]) +\r
+ ((int)src_ptr[src_offset -(int)pixel_step] * vp8_filter[1]) +\r
+ ((int)src_ptr[src_offset] * vp8_filter[2]) +\r
+ ((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) +\r
+ ((int)src_ptr[src_offset + PS2] * vp8_filter[4]) +\r
+ ((int)src_ptr[src_offset + PS3] * vp8_filter[5]) +\r
+ (VP8_FILTER_WEIGHT >> 1); /* Rounding */\r
+\r
+ /* Normalize back to 0-255 */\r
+ Temp = Temp >> VP8_FILTER_SHIFT;\r
+ if (Temp < 0)\r
+ Temp = 0;\r
+ else if (Temp > 255)\r
+ Temp = 255;\r
+\r
+ output_ptr[out_offset] = (unsigned char)Temp;\r
+ }\r
+}\r
+\r
+__kernel void vp8_sixtap_predict_kernel\r
+(\r
+ __global unsigned char *src_ptr,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_ptr,\r
+ int dst_offset,\r
+ int dst_pitch\r
+)\r
+{\r
+\r
+ local int FData[9*4];\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 4, xoffset);\r
+\r
+ /* then filter vertically... */\r
+ vp8_filter_block2d_second_pass(&FData[8], dst_ptr, dst_offset, dst_pitch, 4, 4, 4, 4, yoffset);\r
+}\r
+\r
+__kernel void vp8_sixtap_predict8x8_kernel\r
+(\r
+ __global unsigned char *src_ptr,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_ptr,\r
+ int dst_offset,\r
+ int dst_pitch\r
+)\r
+{\r
+    local int FData[13*16]; /* Temp data buffer used in filtering */\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 13, 8, xoffset);\r
+\r
+ /* then filter vertically... */\r
+ vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 8, 8, yoffset);\r
+\r
+}\r
+\r
+__kernel void vp8_sixtap_predict8x4_kernel\r
+(\r
+ __global unsigned char *src_ptr,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_ptr,\r
+ int dst_offset,\r
+ int dst_pitch\r
+)\r
+{\r
+ local int FData[13*16]; /* Temp data buffer used in filtering */\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 8, xoffset);\r
+\r
+    /* then filter vertically... */\r
+ vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 4, 8, yoffset);\r
+}\r
+\r
+__kernel void vp8_sixtap_predict16x16_kernel\r
+(\r
+ __global unsigned char *src_ptr,\r
+ int src_offset,\r
+ int src_pixels_per_line,\r
+ int xoffset,\r
+ int yoffset,\r
+ __global unsigned char *dst_ptr,\r
+ int dst_offset,\r
+ int dst_pitch\r
+)\r
+{\r
+ local int FData[21*24]; /* Temp data buffer used in filtering */\r
+\r
+ /* First filter 1-D horizontally... */\r
+ vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 21, 16, xoffset);\r
+\r
+    /* then filter vertically... */\r
+ vp8_filter_block2d_second_pass(&FData[32], dst_ptr, dst_offset, dst_pitch, 16, 16, 16, 16, yoffset);\r
+\r
+ return;\r
+}\r
--- /dev/null
+/*\r
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.\r
+ *\r
+ * Use of this source code is governed by a BSD-style license\r
+ * that can be found in the LICENSE file in the root of the source\r
+ * tree. An additional intellectual property rights grant can be found\r
+ * in the file PATENTS. All contributing project authors may\r
+ * be found in the AUTHORS file in the root of the source tree.\r
+ */\r
+\r
+#ifndef FILTER_CL_H_\r
+#define FILTER_CL_H_\r
+\r
+#ifdef __cplusplus\r
+extern "C" {\r
+#endif\r
+\r
+#include "vp8_opencl.h"\r
+\r
+#define VP8_FILTER_WEIGHT 128\r
+#define VP8_FILTER_SHIFT 7\r
+\r
+#define REGISTER_FILTER 1\r
+#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max;\r
+#define PRE_CALC_PIXEL_STEPS 1\r
+#define PRE_CALC_SRC_INCREMENT 1\r
+\r
+#if PRE_CALC_PIXEL_STEPS\r
+#define PS2 two_pixel_steps\r
+#define PS3 three_pixel_steps\r
+#else\r
+#define PS2 2*(int)pixel_step\r
+#define PS3 3*(int)pixel_step\r
+#endif\r
+\r
+#if REGISTER_FILTER\r
+#define FILTER0 filter0\r
+#define FILTER1 filter1\r
+#define FILTER2 filter2\r
+#define FILTER3 filter3\r
+#define FILTER4 filter4\r
+#define FILTER5 filter5\r
+#else\r
+#define FILTER0 vp8_filter[0]\r
+#define FILTER1 vp8_filter[1]\r
+#define FILTER2 vp8_filter[2]\r
+#define FILTER3 vp8_filter[3]\r
+#define FILTER4 vp8_filter[4]\r
+#define FILTER5 vp8_filter[5]\r
+#endif\r
+\r
+#if PRE_CALC_SRC_INCREMENT\r
+#define SRC_INCREMENT src_increment\r
+#else\r
+#define SRC_INCREMENT (src_pixels_per_line - output_width)\r
+#endif\r
+\r
+#define FILTER_OFFSET //Filter data stored as CL constant memory\r
+#define FILTER_REF sub_pel_filters[filter_offset]\r
+\r
+extern const char *filterCompileOptions;\r
+extern const char *filter_cl_file_name;\r
+\r
+//The six-tap filter reads two pixels before and three pixels after each position,\r
+//so the copied source region must include those extra bytes.\r
+#define SIXTAP_SRC_LEN(out_width,out_height,src_px) ((out_width)*(out_height) + (((out_width)*(out_height)-1)/(out_width))*((src_px) - (out_width)) + 5)\r
+#define BIL_SRC_LEN(out_width,out_height,src_px) ((out_height) * (src_px) + (out_width))\r
+#define DST_LEN(dst_pitch,dst_height,dst_width) ((dst_pitch) * (dst_height) + (dst_width))\r
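+//e.g. the 4x4 six-tap case copies SIXTAP_SRC_LEN(4,9,stride)\r
+//     = 4*9 + ((4*9-1)/4)*(stride-4) + 5 = 41 + 8*(stride-4) bytes of source\r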
+\r
+#ifdef __cplusplus\r
+}\r
+#endif\r
+\r
+#endif /* FILTER_CL_H_ */\r
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef IDCT_OPENCL_H
+#define IDCT_OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp8_opencl.h"
+#include "vp8/common/blockd.h"
+
+#define prototype_second_order_cl(sym) \
+ void sym(BLOCKD *b)
+
+#define prototype_idct_cl(sym) \
+ void sym(BLOCKD *b, int pitch)
+
+#define prototype_idct_scalar_add_cl(sym) \
+ void sym(BLOCKD *b, cl_int use_diff, int diff_offset, int qcoeff_offset, \
+ int pred_offset, unsigned char *output, cl_mem out_mem, int out_offset, size_t out_size, \
+    int pitch, int stride)
+
+
+extern prototype_idct_cl(vp8_short_idct4x4llm_1_cl);
+extern prototype_idct_cl(vp8_short_idct4x4llm_cl);
+extern prototype_idct_scalar_add_cl(vp8_dc_only_idct_add_cl);
+
+extern prototype_second_order_cl(vp8_short_inv_walsh4x4_1_cl);
+extern prototype_second_order_cl(vp8_short_inv_walsh4x4_cl);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+//ACW: Remove me after debugging.
+#include <stdio.h>
+#include <string.h>
+
+#include "idct_cl.h"
+#include "idctllm_cl.h"
+#include "blockd_cl.h"
+
+void cl_destroy_idct(){
+
+ if (cl_data.idct_program)
+ clReleaseProgram(cl_data.idct_program);
+
+ cl_data.idct_program = NULL;
+
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_dc_only_idct_add_kernel);
+ //VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_idct4x4llm_1_kernel);
+ //VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_idct4x4llm_kernel);
+
+}
+
+int cl_init_idct() {
+ int err;
+
+    // Create the IDCT compute program from the file-defined source code
+ if (cl_load_program(&cl_data.idct_program, idctllm_cl_file_name,
+ idctCompileOptions) != CL_SUCCESS)
+ return VP8_CL_TRIED_BUT_FAILED;
+
+    // Create the compute kernels in the program we wish to run
+ VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1_kernel,"vp8_short_inv_walsh4x4_1_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1st_pass_kernel,"vp8_short_inv_walsh4x4_1st_pass_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_2nd_pass_kernel,"vp8_short_inv_walsh4x4_2nd_pass_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_dc_only_idct_add_kernel,"vp8_dc_only_idct_add_kernel");
+
+ ////idct4x4llm kernels are only useful for the encoder
+ //VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_1_kernel,"vp8_short_idct4x4llm_1_kernel");
+ //VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_kernel,"vp8_short_idct4x4llm_kernel");
+
+ return CL_SUCCESS;
+}
+
+#define max(x,y) ((x) > (y) ? (x) : (y))
+//#define NO_CL
+
+/* Only useful for encoder... Untested... */
+void vp8_short_idct4x4llm_cl(BLOCKD *b, int pitch)
+{
+ int err;
+
+ short *input = b->dqcoeff_base + b->dqcoeff_offset;
+ short *output = &b->diff_base[b->diff_offset];
+
+ cl_mem src_mem, dst_mem;
+
+    //Single work-item for now. This should be split into a 2-pass, 4-work-item launch.
+ size_t global = 1;
+
+ if (cl_initialized != CL_SUCCESS){
+ vp8_short_idct4x4llm_c(input,output,pitch);
+ return;
+ }
+
+ VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
+ sizeof(short)*16, input,
+ vp8_short_idct4x4llm_c(input,output,pitch),
+ );
+
+ VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+ sizeof(short)*(4+(pitch/2)*3), output,
+ vp8_short_idct4x4llm_c(input,output,pitch),
+ );
+
+ //Set arguments and run kernel
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 0, sizeof (cl_mem), &src_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 1, sizeof (cl_mem), &dst_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 2, sizeof (int), &pitch);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ vp8_short_idct4x4llm_c(input,output,pitch),
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);
+ vp8_short_idct4x4llm_c(input,output,pitch),
+ );
+
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_FALSE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",
+ vp8_short_idct4x4llm_c(input,output,pitch),
+ );
+
+ clReleaseMemObject(src_mem);
+ clReleaseMemObject(dst_mem);
+
+ return;
+}
+
+/* Only useful for encoder... Untested... */
+void vp8_short_idct4x4llm_1_cl(BLOCKD *b, int pitch)
+{
+ int err;
+ size_t global = 4;
+
+ short *input = b->dqcoeff_base + b->dqcoeff_offset;
+ short *output = &b->diff_base[b->diff_offset];
+
+ cl_mem src_mem, dst_mem;
+
+ if (cl_initialized != CL_SUCCESS){
+ vp8_short_idct4x4llm_1_c(input,output,pitch);
+ return;
+ }
+
+ printf("vp8_short_idct4x4llm_1_cl\n");
+
+ VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
+ sizeof(short), input,
+ vp8_short_idct4x4llm_1_c(input,output,pitch),
+ );
+
+ VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+ sizeof(short)*(4+(pitch/2)*3), output,
+ vp8_short_idct4x4llm_1_c(input,output,pitch),
+ );
+
+ //Set arguments and run kernel
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 0, sizeof (cl_mem), &src_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 1, sizeof (cl_mem), &dst_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 2, sizeof (int), &pitch);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ vp8_short_idct4x4llm_1_c(input,output,pitch),
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_1_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);
+ vp8_short_idct4x4llm_1_c(input,output,pitch),
+ );
+
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_FALSE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",
+ vp8_short_idct4x4llm_1_c(input,output,pitch),
+ );
+
+ clReleaseMemObject(src_mem);
+ clReleaseMemObject(dst_mem);
+
+ return;
+
+}
+
+void vp8_dc_only_idct_add_cl(BLOCKD *b, cl_int use_diff, int diff_offset,
+ int qcoeff_offset, int pred_offset,
+ unsigned char *dst_base, cl_mem dst_mem, int dst_offset, size_t dest_size,
+ int pitch, int stride
+)
+{
+
+ int err;
+ size_t global = 16;
+
+ int free_mem = 0;
+
+ if (dst_mem == NULL){
+ VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
+ dest_size, dst_base,,
+ );
+ free_mem = 1;
+ }
+
+ //Set arguments and run kernel
+ err = clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_predictor_mem);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 1, sizeof (int), &pred_offset);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 2, sizeof (cl_mem), &dst_mem);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 3, sizeof (int), &dst_offset);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 4, sizeof (int), &pitch);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 5, sizeof (int), &stride);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 6, sizeof (cl_int), &use_diff);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 7, sizeof (cl_mem), &b->cl_diff_mem);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 8, sizeof (int), &diff_offset);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 9, sizeof (cl_mem), &b->cl_qcoeff_mem);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 10, sizeof (int), &qcoeff_offset);
+ err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 11, sizeof (cl_mem), &b->cl_dequant_mem);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_dc_only_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+
+
+ if (free_mem == 1){
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_FALSE, 0,
+ dest_size, dst_base, 0, NULL, NULL);
+
+ VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",,
+ );
+
+ clReleaseMemObject(dst_mem);
+ }
+
+ return;
+}
+
+void vp8_short_inv_walsh4x4_cl(BLOCKD *b)
+{
+ int err;
+ size_t global = 4;
+
+ if (cl_initialized != CL_SUCCESS){
+ vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset,&b->diff_base[b->diff_offset]);
+ return;
+ }
+
+ //Set arguments and run kernel
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 1, sizeof(int), &b->dqcoeff_offset);
+ err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 2, sizeof (cl_mem), &b->cl_diff_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 3, sizeof(int), &b->diff_offset);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);
+ vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+ );
+
+ //Second pass
+ //Set arguments and run kernel
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 0, sizeof (cl_mem), &b->cl_diff_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 1, sizeof(int), &b->diff_offset);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);
+ vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
+ );
+
+ return;
+}
+
+void vp8_short_inv_walsh4x4_1_cl(BLOCKD *b)
+{
+
+ int err;
+ size_t global = 4;
+
+ if (cl_initialized != CL_SUCCESS){
+ vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
+ &b->diff_base[b->diff_offset]);
+ return;
+ }
+
+ //Set arguments and run kernel
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 1, sizeof (int), &b->dqcoeff_offset);
+ err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 2, sizeof (cl_mem), &b->cl_diff_mem);
+ err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 3, sizeof (int), &b->diff_offset);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
+ &b->diff_base[b->diff_offset]),
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_1_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);
+ vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
+ &b->diff_base[b->diff_offset]),
+ );
+
+ return;
+}
--- /dev/null
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
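+//Fixed-point (Q16) IDCT constants: cospi8sqrt2minus1 ~ (cos(pi/8)*sqrt(2)-1)<<16, sinpi8sqrt2 ~ sin(pi/8)*sqrt(2)<<16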
+__constant int cospi8sqrt2minus1 = 20091;
+__constant int sinpi8sqrt2 = 35468;
+__constant int rounding = 0;
+
+
+kernel void vp8_short_idct4x4llm_1st_pass_kernel(global short*,global short *,int);
+kernel void vp8_short_idct4x4llm_2nd_pass_kernel(global short*,int);
+
+
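+//Runs both IDCT passes from a single work-item; the host currently enqueues this kernel with a global size of 1.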
+__kernel void vp8_short_idct4x4llm_kernel(
+ __global short *input,
+ __global short *output,
+ int pitch
+){
+ vp8_short_idct4x4llm_1st_pass_kernel(input,output,pitch);
+ vp8_short_idct4x4llm_2nd_pass_kernel(output,pitch);
+}
+
+__kernel void vp8_short_idct4x4llm_1st_pass_kernel(
+ __global short *ip,
+ __global short *op,
+ int pitch
+)
+{
+ int i;
+ int a1, b1, c1, d1;
+
+ int temp1, temp2;
+ int shortpitch = pitch >> 1;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch*0] = a1 + d1;
+ op[shortpitch*3] = a1 - d1;
+
+ op[shortpitch*1] = b1 + c1;
+ op[shortpitch*2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ return;
+}
+
+__kernel void vp8_short_idct4x4llm_2nd_pass_kernel(
+ __global short *output,
+ int pitch
+)
+{
+ int i;
+ int a1, b1, c1, d1;
+
+ int temp1, temp2;
+ int shortpitch = pitch >> 1;
+ __global short *ip = output;
+ __global short *op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[2];
+ b1 = ip[0] - ip[2];
+
+ temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ ip += shortpitch;
+ op += shortpitch;
+ }
+
+ return;
+}
+
+__kernel void vp8_short_idct4x4llm_1_kernel(
+ __global short *input,
+ __global short *output,
+ int pitch
+)
+{
+ int a1;
+ int out_offset;
+ int shortpitch = pitch >> 1;
+
+ //short4 a;
+ a1 = ((input[0] + 4) >> 3);
+ //a = a1;
+
+ int tid = get_global_id(0);
+ if (tid < 4){
+ out_offset = shortpitch * tid;
+
+ //vstore4(a,0,&output[out_offset];
+ output[out_offset] = a1;
+ output[out_offset+1] = a1;
+ output[out_offset+2] = a1;
+ output[out_offset+3] = a1;
+ }
+}
+
+__kernel void vp8_dc_only_idct_add_kernel(
+ __global unsigned char *pred_base,
+ int pred_offset,
+ __global unsigned char *dst_base,
+ int dst_offset,
+ int pitch,
+ int stride,
+ int use_diff,
+ global short *diff_base,
+ int diff_offset,
+ global short *qcoeff_base,
+ int qcoeff_offset,
+ global short *dequant
+)
+{
+ int r, c;
+ global unsigned char *pred_ptr = &pred_base[pred_offset];
+ global unsigned char *dst_ptr = &dst_base[dst_offset];
+
+ int tid = get_global_id(0);
+
+ int a1;
+
+ if (tid < 16){
+
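+        /* The DC value comes either from the precomputed diff buffer or from dequantizing the first coefficient */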
+ if (use_diff == 1){
+ a1 = diff_base[diff_offset];
+ } else {
+ a1 = qcoeff_base[qcoeff_offset] * dequant[0];
+ }
+ a1 = (a1 + 4)>>3;
+
+ r = tid / 4;
+ c = tid % 4;
+
+ pred_offset = r * pitch;
+ dst_offset += r * stride;
+ int a = a1 + pred_ptr[pred_offset + c] ;
+
+ if (a < 0)
+ a = 0;
+ else if (a > 255)
+ a = 255;
+
+ dst_base[dst_offset + c] = (unsigned char) a ;
+ }
+}
+
+
+__kernel void vp8_short_inv_walsh4x4_1st_pass_kernel(
+ __global short *src_base,
+ int src_offset,
+ __global short *output_base,
+ int out_offset
+)
+{
+
+ __global short *input = src_base + src_offset;
+    __global short *output = output_base + out_offset;
+ int tid = get_global_id(0);
+
+#define VEC_WALSH 0
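+//The vectorized path below is experimental and disabled; the scalar per-column version in the #else branch is used.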
+#if VEC_WALSH
+ //4-short vectors to calculate things in
+ short4 a,b,c,d, a2v, b2v, c2v, d2v, a1t, b1t, c1t, d1t;
+ short16 out;
+
+ if (tid == 0){
+ //first pass loop in vector form
+ a = vload4(0,input) + vload4(3,input);
+ b = vload4(1,input) + vload4(2,input);
+ c = vload4(1,input) - vload4(2,input);
+ d = vload4(0,input) - vload4(3,input);
+ vstore4(a + b, 0, output);
+ vstore4(c + d, 1, output);
+ vstore4(a - b, 2, output);
+ vstore4(d - c, 3, output);
+
+ return;
+
+ //2nd pass
+ a = (short4)(output[0], output[4], output[8], output[12]);
+ b = (short4)(output[1], output[5], output[9], output[13]);
+ c = (short4)(output[1], output[5], output[9], output[13]);
+ d = (short4)(output[0], output[4], output[8], output[12]);
+ a1t = (short4)(output[3], output[7], output[11], output[15]);
+ b1t = (short4)(output[2], output[6], output[10], output[14]);
+ c1t = (short4)(output[2], output[6], output[10], output[14]);
+ d1t = (short4)(output[3], output[7], output[11], output[15]);
+
+ a = a + a1t + (short)3;
+ b = b + b1t;
+ c = c - c1t;
+ d = d - d1t + (short)3;
+
+ a2v = (a + b) >> (short)3;
+ b2v = (c + d) >> (short)3;
+ c2v = (a - b) >> (short)3;
+ d2v = (d - c) >> (short)3;
+
+ out.s048c = a2v;
+ out.s159d = b2v;
+ out.s26ae = c2v;
+ out.s37bf = d2v;
+ vstore16(out,0,output);
+ }
+#else
+
+    int a1, b1, c1, d1;
+ global short *ip = input;
+ global short *op = output;
+
+ int offset;
+
+ if (tid < 4){
+ offset = tid;
+ a1 = ip[offset] + ip[offset + 12];
+ b1 = ip[offset + 4] + ip[offset + 8];
+ c1 = ip[offset + 4] - ip[offset + 8];
+ d1 = ip[offset] - ip[offset + 12];
+
+ op[offset] = a1 + b1;
+ op[offset + 4] = c1 + d1;
+ op[offset + 8] = a1 - b1;
+ op[offset + 12] = d1 - c1;
+ }
+#endif
+}
+
+__kernel void vp8_short_inv_walsh4x4_2nd_pass_kernel(
+ __global short *output_base,
+ int out_offset
+)
+{
+ int a1, b1, c1, d1;
+ int a2, b2, c2, d2;
+
+ __global short *output = output_base + out_offset;
+ int tid = get_global_id(0);
+ int offset = 0;
+
+ if (tid < 4){
+ offset = 4*tid;
+ a1 = output[offset] + output[offset + 3];
+ b1 = output[offset + 1] + output[offset + 2];
+ c1 = output[offset + 1] - output[offset + 2];
+ d1 = output[offset + 0] - output[offset + 3];
+
+ a2 = a1 + b1;
+ b2 = c1 + d1;
+ c2 = a1 - b1;
+ d2 = d1 - c1;
+
+ output[offset + 0] = (a2 + 3) >> 3;
+ output[offset + 1] = (b2 + 3) >> 3;
+ output[offset + 2] = (c2 + 3) >> 3;
+ output[offset + 3] = (d2 + 3) >> 3;
+ }
+}
+
+__kernel void vp8_short_inv_walsh4x4_1_kernel(
+ __global short *src_data,
+ int src_offset,
+ __global short *dst_data,
+ int dst_offset
+){
+ int a1;
+ int tid = get_global_id(0);
+ //short16 a;
+ short4 a;
+ __global short *input = src_data + src_offset;
+ __global short *output = dst_data + dst_offset;
+
+ if (tid < 4)
+ {
+ a1 = ((input[0] + 3) >> 3);
+ a = (short)a1; //Set all elements of vector to a1
+ vstore4(a, tid, output);
+ }
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_config.h"
+#include "vp8_opencl.h"
+#include "vp8/common/blockd.h"
+
+#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max;
+
+//External functions that are fallbacks if CL is unavailable
+extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
+extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
+extern void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride);
+extern void vp8_short_inv_walsh4x4_c(short *input, short *output);
+extern void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+
+const char *idctCompileOptions = "-Ivp8/common/opencl";
+const char *idctllm_cl_file_name = "vp8/common/opencl/idctllm_cl.cl";
+
--- /dev/null
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable\r
+#pragma OPENCL EXTENSION cl_amd_printf : enable\r
+\r
+typedef unsigned char uc;\r
+typedef signed char sc;\r
+\r
+__inline signed char vp8_filter_mask(sc, sc, uc, uc, uc, uc, uc, uc, uc, uc);\r
+__inline signed char vp8_simple_filter_mask(signed char, signed char, uc, uc, uc, uc);\r
+__inline signed char vp8_hevmask(signed char, uc, uc, uc, uc);\r
+__inline signed char vp8_signed_char_clamp(int);\r
+\r
+__inline void vp8_mbfilter(signed char mask,signed char hev,global uc *op2,\r
+ global uc *op1,global uc *op0,global uc *oq0,global uc *oq1,global uc *oq2);\r
+\r
+void vp8_simple_filter(signed char mask,global uc *base, int op1_off,int op0_off,int oq0_off,int oq1_off);\r
+\r
+\r
+typedef struct\r
+{\r
+ signed char lim[16];\r
+ signed char flim[16];\r
+ signed char thr[16];\r
+ signed char mbflim[16];\r
+ signed char mbthr[16];\r
+ signed char uvlim[16];\r
+ signed char uvflim[16];\r
+ signed char uvthr[16];\r
+ signed char uvmbflim[16];\r
+ signed char uvmbthr[16];\r
+} loop_filter_info;\r
+\r
+\r
+\r
+\r
+void vp8_filter(\r
+ signed char mask,\r
+ signed char hev,\r
+ global uc *base,\r
+ int op1_off,\r
+ int op0_off,\r
+ int oq0_off,\r
+ int oq1_off\r
+)\r
+{\r
+\r
+ global uc *op1 = &base[op1_off];\r
+ global uc *op0 = &base[op0_off];\r
+ global uc *oq0 = &base[oq0_off];\r
+ global uc *oq1 = &base[oq1_off];\r
+\r
+ signed char ps0, qs0;\r
+ signed char ps1, qs1;\r
+ signed char vp8_filter, Filter1, Filter2;\r
+ signed char u;\r
+\r
+ ps1 = (signed char) * op1 ^ 0x80;\r
+ ps0 = (signed char) * op0 ^ 0x80;\r
+ qs0 = (signed char) * oq0 ^ 0x80;\r
+ qs1 = (signed char) * oq1 ^ 0x80;\r
+\r
+ /* add outer taps if we have high edge variance */\r
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);\r
+ vp8_filter &= hev;\r
+\r
+ /* inner taps */\r
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));\r
+ vp8_filter &= mask;\r
+\r
+ /* save bottom 3 bits so that we round one side +4 and the other +3\r
+ * if it equals 4 we'll set to adjust by -1 to account for the fact\r
+ * we'd round 3 the other way\r
+ */\r
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);\r
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);\r
+ Filter1 >>= 3;\r
+ Filter2 >>= 3;\r
+ u = vp8_signed_char_clamp(qs0 - Filter1);\r
+ *oq0 = u ^ 0x80;\r
+ u = vp8_signed_char_clamp(ps0 + Filter2);\r
+ *op0 = u ^ 0x80;\r
+ vp8_filter = Filter1;\r
+\r
+ /* outer tap adjustments */\r
+ vp8_filter += 1;\r
+ vp8_filter >>= 1;\r
+ vp8_filter &= ~hev;\r
+\r
+ u = vp8_signed_char_clamp(qs1 - vp8_filter);\r
+ *oq1 = u ^ 0x80;\r
+ u = vp8_signed_char_clamp(ps1 + vp8_filter);\r
+ *op1 = u ^ 0x80;\r
+}\r
+\r
+\r
+kernel void vp8_loop_filter_horizontal_edge_kernel\r
+(\r
+ global unsigned char *s_base,\r
+ int s_off,\r
+ int p, /* pitch */\r
+ global signed char *flimit,\r
+ global signed char *limit,\r
+ global signed char *thresh,\r
+ int off_stride\r
+)\r
+{\r
+ int hev = 0; /* high edge variance */\r
+ signed char mask = 0;\r
+ int i = get_global_id(0);\r
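+    //One work-item filters one pixel position along the edge\r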
+\r
+ if (i < get_global_size(0)){\r
+ s_off += i;\r
+\r
+ mask = vp8_filter_mask(limit[i], flimit[i], s_base[s_off - 4*p],\r
+ s_base[s_off - 3*p], s_base[s_off - 2*p], s_base[s_off - p],\r
+ s_base[s_off], s_base[s_off + p], s_base[s_off + 2*p],\r
+ s_base[s_off + 3*p]);\r
+\r
+ hev = vp8_hevmask(thresh[i], s_base[s_off - 2*p], s_base[s_off - p],\r
+ s_base[s_off], s_base[s_off+p]);\r
+\r
+ vp8_filter(mask, hev, s_base, s_off - 2 * p, s_off - p, s_off,\r
+ s_off + p);\r
+ }\r
+}\r
+\r
+\r
+kernel void vp8_loop_filter_vertical_edge_kernel\r
+(\r
+ global unsigned char *s_base,\r
+ int s_off,\r
+ int p,\r
+ global signed char *flimit,\r
+ global signed char *limit,\r
+ global signed char *thresh,\r
+ int off_stride\r
+)\r
+{\r
+\r
+ int hev = 0; /* high edge variance */\r
+ signed char mask = 0;\r
+ int i = get_global_id(0);\r
+\r
+ if ( i < get_global_size(0) ){\r
+ s_off += p * i;\r
+ mask = vp8_filter_mask(limit[i], flimit[i],\r
+ s_base[s_off-4], s_base[s_off-3], s_base[s_off-2],\r
+ s_base[s_off-1], s_base[s_off], s_base[s_off+1],\r
+ s_base[s_off+2], s_base[s_off+3]);\r
+\r
+ hev = vp8_hevmask(thresh[i], s_base[s_off-2], s_base[s_off-1],\r
+ s_base[s_off], s_base[s_off+1]);\r
+\r
+ vp8_filter(mask, hev, s_base, s_off - 2, s_off - 1, s_off, s_off + 1);\r
+\r
+ }\r
+}\r
+\r
+\r
+kernel void vp8_mbloop_filter_horizontal_edge_kernel\r
+(\r
+ global unsigned char *s_base,\r
+ int s_off,\r
+ int p,\r
+ global signed char *flimit,\r
+ global signed char *limit,\r
+ global signed char *thresh,\r
+ int off_stride\r
+)\r
+{\r
+\r
+ global uc *s = s_base+s_off;\r
+\r
+ signed char hev = 0; /* high edge variance */\r
+ signed char mask = 0;\r
+ int i = get_global_id(0);\r
+\r
+ if (i < get_global_size(0)){\r
+ s += i;\r
+\r
+ mask = vp8_filter_mask(limit[i], flimit[i],\r
+ s[-4*p], s[-3*p], s[-2*p], s[-1*p],\r
+ s[0*p], s[1*p], s[2*p], s[3*p]);\r
+\r
+ hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);\r
+\r
+ vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);\r
+\r
+ }\r
+}\r
+\r
+\r
+kernel void vp8_mbloop_filter_vertical_edge_kernel\r
+(\r
+ global unsigned char *s_base,\r
+ int s_off,\r
+ int p,\r
+ global signed char *flimit,\r
+ global signed char *limit,\r
+ global signed char *thresh,\r
+ int off_stride\r
+)\r
+{\r
+\r
+ global uc *s = s_base + s_off;\r
+\r
+ signed char hev = 0; /* high edge variance */\r
+ signed char mask = 0;\r
+ int i = get_global_id(0);\r
+\r
+ if (i < get_global_size(0)){\r
+ s += p * i;\r
+\r
+ mask = vp8_filter_mask(limit[i], flimit[i],\r
+ s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);\r
+\r
+ hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);\r
+\r
+ vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);\r
+\r
+ }\r
+}\r
+\r
+\r
+kernel void vp8_loop_filter_simple_horizontal_edge_kernel\r
+(\r
+ global unsigned char *s_base,\r
+ int s_off,\r
+ int p,\r
+ global const signed char *flimit,\r
+ global const signed char *limit,\r
+ global const signed char *thresh,\r
+ int off_stride\r
+)\r
+{\r
+\r
+ signed char mask = 0;\r
+ int i = get_global_id(0);\r
+ (void) thresh;\r
+\r
+ if (i < get_global_size(0))\r
+ {\r
+ s_off += i;\r
+ mask = vp8_simple_filter_mask(limit[i], flimit[i], s_base[s_off-2*p], s_base[s_off-p], s_base[s_off], s_base[s_off+p]);\r
+ vp8_simple_filter(mask, s_base, s_off - 2 * p, s_off - 1 * p, s_off, s_off + 1 * p);\r
+ }\r
+}\r
+\r
+\r
+kernel void vp8_loop_filter_simple_vertical_edge_kernel\r
+(\r
+ global unsigned char *s_base,\r
+ int s_off,\r
+ int p,\r
+ global signed char *flimit,\r
+ global signed char *limit,\r
+ global signed char *thresh,\r
+ int off_stride\r
+)\r
+{\r
+\r
+ signed char mask = 0;\r
+ int i = get_global_id(0);\r
+ (void) thresh;\r
+\r
+ if (i < get_global_size(0)){\r
+ s_off += p * i;\r
+ mask = vp8_simple_filter_mask(limit[i], flimit[i], s_base[s_off-2], s_base[s_off-1], s_base[s_off], s_base[s_off+1]);\r
+ vp8_simple_filter(mask, s_base, s_off - 2, s_off - 1, s_off, s_off + 1);\r
+ }\r
+\r
+}\r
+\r
+\r
+\r
+//Inline and non-kernel functions follow.\r
+\r
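+/* Wider filter applied on macroblock edges: when edge variance is low it\r
+ * spreads roughly 3/7, 2/7 and 1/7 of the filtered boundary difference across\r
+ * the three pixels on each side of the edge.\r
+ */\r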
+__inline void vp8_mbfilter(\r
+ signed char mask,\r
+ signed char hev,\r
+ global uc *op2,\r
+ global uc *op1,\r
+ global uc *op0,\r
+ global uc *oq0,\r
+ global uc *oq1,\r
+ global uc *oq2\r
+)\r
+{\r
+ signed char s, u;\r
+ signed char vp8_filter, Filter1, Filter2;\r
+ signed char ps2 = (signed char) * op2 ^ 0x80;\r
+ signed char ps1 = (signed char) * op1 ^ 0x80;\r
+ signed char ps0 = (signed char) * op0 ^ 0x80;\r
+ signed char qs0 = (signed char) * oq0 ^ 0x80;\r
+ signed char qs1 = (signed char) * oq1 ^ 0x80;\r
+ signed char qs2 = (signed char) * oq2 ^ 0x80;\r
+\r
+ /* add outer taps if we have high edge variance */\r
+ vp8_filter = vp8_signed_char_clamp(ps1 - qs1);\r
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));\r
+ vp8_filter &= mask;\r
+\r
+ Filter2 = vp8_filter;\r
+ Filter2 &= hev;\r
+\r
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */\r
+ Filter1 = vp8_signed_char_clamp(Filter2 + 4);\r
+ Filter2 = vp8_signed_char_clamp(Filter2 + 3);\r
+ Filter1 >>= 3;\r
+ Filter2 >>= 3;\r
+ qs0 = vp8_signed_char_clamp(qs0 - Filter1);\r
+ ps0 = vp8_signed_char_clamp(ps0 + Filter2);\r
+\r
+\r
+ /* only apply wider filter if not high edge variance */\r
+ vp8_filter &= ~hev;\r
+ Filter2 = vp8_filter;\r
+\r
+ /* roughly 3/7th difference across boundary */\r
+ u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);\r
+ s = vp8_signed_char_clamp(qs0 - u);\r
+ *oq0 = s ^ 0x80;\r
+ s = vp8_signed_char_clamp(ps0 + u);\r
+ *op0 = s ^ 0x80;\r
+\r
+ /* roughly 2/7th difference across boundary */\r
+ u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);\r
+ s = vp8_signed_char_clamp(qs1 - u);\r
+ *oq1 = s ^ 0x80;\r
+ s = vp8_signed_char_clamp(ps1 + u);\r
+ *op1 = s ^ 0x80;\r
+\r
+ /* roughly 1/7th difference across boundary */\r
+ u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);\r
+ s = vp8_signed_char_clamp(qs2 - u);\r
+ *oq2 = s ^ 0x80;\r
+ s = vp8_signed_char_clamp(ps2 + u);\r
+ *op2 = s ^ 0x80;\r
+}\r
+\r
+\r
+__inline signed char vp8_signed_char_clamp(int t)\r
+{\r
+ t = (t < -128 ? -128 : t);\r
+ t = (t > 127 ? 127 : t);\r
+ return (signed char) t;\r
+}\r
+\r
+\r
+/* is there high variance internal edge ( 11111111 yes, 00000000 no) */\r
+__inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)\r
+{\r
+ signed char hev = 0;\r
+ hev |= (abs(p1 - p0) > thresh) * -1;\r
+ hev |= (abs(q1 - q0) > thresh) * -1;\r
+ return hev;\r
+}\r
+\r
+\r
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */\r
+__inline signed char vp8_filter_mask(\r
+ signed char limit,\r
+ signed char flimit,\r
+ uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)\r
+{\r
+ signed char mask = 0;\r
+ mask |= (abs(p3 - p2) > limit) * -1;\r
+ mask |= (abs(p2 - p1) > limit) * -1;\r
+ mask |= (abs(p1 - p0) > limit) * -1;\r
+ mask |= (abs(q1 - q0) > limit) * -1;\r
+ mask |= (abs(q2 - q1) > limit) * -1;\r
+ mask |= (abs(q3 - q2) > limit) * -1;\r
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1;\r
+ mask = ~mask;\r
+ return mask;\r
+}\r
+\r
+/* should we apply any filter at all ( 11111111 yes, 00000000 no) */\r
+__inline signed char vp8_simple_filter_mask(\r
+ signed char limit,\r
+ signed char flimit,\r
+ uc p1,\r
+ uc p0,\r
+ uc q0,\r
+ uc q1\r
+)\r
+{\r
+ signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1;\r
+ return mask;\r
+}\r
+\r
+void vp8_simple_filter(\r
+ signed char mask,\r
+ global uc *base,\r
+ int op1_off,\r
+ int op0_off,\r
+ int oq0_off,\r
+ int oq1_off\r
+)\r
+{\r
+\r
+ global uc *op1 = base + op1_off;\r
+ global uc *op0 = base + op0_off;\r
+ global uc *oq0 = base + oq0_off;\r
+ global uc *oq1 = base + oq1_off;\r
+\r
+ signed char vp8_filter, Filter1, Filter2;\r
+ signed char p1 = (signed char) * op1 ^ 0x80;\r
+ signed char p0 = (signed char) * op0 ^ 0x80;\r
+ signed char q0 = (signed char) * oq0 ^ 0x80;\r
+ signed char q1 = (signed char) * oq1 ^ 0x80;\r
+ signed char u;\r
+\r
+ vp8_filter = vp8_signed_char_clamp(p1 - q1);\r
+ vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));\r
+ vp8_filter &= mask;\r
+\r
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */\r
+ Filter1 = vp8_signed_char_clamp(vp8_filter + 4);\r
+ Filter1 >>= 3;\r
+ u = vp8_signed_char_clamp(q0 - Filter1);\r
+ *oq0 = u ^ 0x80;\r
+\r
+ Filter2 = vp8_signed_char_clamp(vp8_filter + 3);\r
+ Filter2 >>= 3;\r
+ u = vp8_signed_char_clamp(p0 + Filter2);\r
+ *op0 = u ^ 0x80;\r
+}
\ No newline at end of file
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "../../../vpx_ports/config.h"
+#include "loopfilter_cl.h"
+#include "../onyxc_int.h"
+
+#include "vpx_config.h"
+#include "vp8_opencl.h"
+#include "blockd_cl.h"
+
+const char *loopFilterCompileOptions = "-Ivp8/common/opencl";
+const char *loop_filter_cl_file_name = "vp8/common/opencl/loopfilter.cl";
+
+typedef unsigned char uc;
+
+extern void vp8_loop_filter_frame
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+);
+
+prototype_loopfilter_cl(vp8_loop_filter_horizontal_edge_cl);
+prototype_loopfilter_cl(vp8_loop_filter_vertical_edge_cl);
+prototype_loopfilter_cl(vp8_mbloop_filter_horizontal_edge_cl);
+prototype_loopfilter_cl(vp8_mbloop_filter_vertical_edge_cl);
+prototype_loopfilter_cl(vp8_loop_filter_simple_horizontal_edge_cl);
+prototype_loopfilter_cl(vp8_loop_filter_simple_vertical_edge_cl);
+
+/* Horizontal MB filtering */
+void vp8_loop_filter_mbh_cl(
+ MACROBLOCKD *x,
+ cl_mem buf_base,
+ int y_off,
+ int u_off,
+ int v_off,
+ int y_stride,
+ int uv_stride,
+ loop_filter_info *lfi,
+ int simpler_lpf
+)
+{
+ (void) simpler_lpf;
+
+ vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
+ vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
+ vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
+}
+
+void vp8_loop_filter_mbhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
+}
+
+/* Vertical MB Filtering */
+void vp8_loop_filter_mbv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+
+ vp8_mbloop_filter_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
+ vp8_mbloop_filter_vertical_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
+ vp8_mbloop_filter_vertical_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
+}
+
+void vp8_loop_filter_mbvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) uv_stride;
+ (void) simpler_lpf;
+ vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
+}
+
+/* Horizontal B Filtering */
+void vp8_loop_filter_bh_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+
+ vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_horizontal_edge_cl(x, buf_base, u_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+ vp8_loop_filter_horizontal_edge_cl(x, buf_base, v_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+
+}
+
+void vp8_loop_filter_bhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) uv_stride;
+ (void) simpler_lpf;
+
+ vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+}
+
+/* Vertical B Filtering */
+void vp8_loop_filter_bv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) simpler_lpf;
+
+ vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+
+ vp8_loop_filter_vertical_edge_cl(x, buf_base, u_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+ vp8_loop_filter_vertical_edge_cl(x, buf_base, v_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
+}
+
+void vp8_loop_filter_bvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
+ int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
+{
+ (void) uv_stride;
+ (void) simpler_lpf;
+
+ vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+ vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
+}
+
+void vp8_init_loop_filter_cl(VP8_COMMON *cm)
+{
+ loop_filter_info *lfi = cm->lf_info;
+ int sharpness_lvl = cm->sharpness_level;
+ int frame_type = cm->frame_type;
+ int i, j;
+
+ int block_inside_limit = 0;
+ int HEVThresh;
+ const int yhedge_boost = 2;
+
+ /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
+ for (i = 0; i <= MAX_LOOP_FILTER; i++)
+ {
+ int filt_lvl = i;
+
+ if (frame_type == KEY_FRAME)
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+ else
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 3;
+ else if (filt_lvl >= 20)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+
+ /* Set loop filter parameters that control sharpness. */
+ block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+ block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+ if (sharpness_lvl > 0)
+ {
+ if (block_inside_limit > (9 - sharpness_lvl))
+ block_inside_limit = (9 - sharpness_lvl);
+ }
+
+ if (block_inside_limit < 1)
+ block_inside_limit = 1;
+
+ for (j = 0; j < 16; j++)
+ {
+ lfi[i].lim[j] = block_inside_limit;
+ lfi[i].mbflim[j] = filt_lvl + yhedge_boost;
+ lfi[i].flim[j] = filt_lvl;
+ lfi[i].thr[j] = HEVThresh;
+ }
+ }
+}
+
+/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
+ * each frame. Check last_frame_type to skip the function most of the time.
+ */
+void vp8_frame_init_loop_filter_cl(loop_filter_info *lfi, int frame_type)
+{
+ int HEVThresh;
+ int i, j;
+
+ /* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
+ for (i = 0; i <= MAX_LOOP_FILTER; i++)
+ {
+ int filt_lvl = i;
+
+ if (frame_type == KEY_FRAME)
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+ else
+ {
+ if (filt_lvl >= 40)
+ HEVThresh = 3;
+ else if (filt_lvl >= 20)
+ HEVThresh = 2;
+ else if (filt_lvl >= 15)
+ HEVThresh = 1;
+ else
+ HEVThresh = 0;
+ }
+
+ for (j = 0; j < 16; j++)
+ {
+ lfi[i].thr[j] = HEVThresh;
+ }
+ }
+}
+
+
+//This might not need to be copied from loopfilter.c
+void vp8_adjust_mb_lf_value_cl(MACROBLOCKD *mbd, int *filter_level)
+{
+ MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
+
+ if (mbd->mode_ref_lf_delta_enabled)
+ {
+ /* Apply delta for reference frame */
+ *filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
+
+ /* Apply delta for mode */
+ if (mbmi->ref_frame == INTRA_FRAME)
+ {
+ /* Only the split mode BPRED has a further special case */
+ if (mbmi->mode == B_PRED)
+ *filter_level += mbd->mode_lf_deltas[0];
+ }
+ else
+ {
+ /* Zero motion mode */
+ if (mbmi->mode == ZEROMV)
+ *filter_level += mbd->mode_lf_deltas[1];
+
+ /* Split MB motion mode */
+ else if (mbmi->mode == SPLITMV)
+ *filter_level += mbd->mode_lf_deltas[3];
+
+ /* All other inter motion modes (Nearest, Near, New) */
+ else
+ *filter_level += mbd->mode_lf_deltas[2];
+ }
+
+ /* Range check */
+ if (*filter_level > MAX_LOOP_FILTER)
+ *filter_level = MAX_LOOP_FILTER;
+ else if (*filter_level < 0)
+ *filter_level = 0;
+ }
+}
+
+
+//Start of externally callable functions.
+
+int cl_init_loop_filter() {
+ int err;
+
+ // Create the filter compute program from the file-defined source code
+ if ( cl_load_program(&cl_data.loop_filter_program, loop_filter_cl_file_name,
+ loopFilterCompileOptions) != CL_SUCCESS )
+ return VP8_CL_TRIED_BUT_FAILED;
+
+ // Create the compute kernels in the program we wish to run
+ VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_horizontal_edge_kernel,"vp8_loop_filter_horizontal_edge_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_vertical_edge_kernel,"vp8_loop_filter_vertical_edge_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_horizontal_edge_kernel,"vp8_mbloop_filter_horizontal_edge_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_vertical_edge_kernel,"vp8_mbloop_filter_vertical_edge_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_horizontal_edge_kernel,"vp8_loop_filter_simple_horizontal_edge_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_vertical_edge_kernel,"vp8_loop_filter_simple_vertical_edge_kernel");
+
+ return CL_SUCCESS;
+}
+
+void cl_destroy_loop_filter(){
+
+ if (cl_data.loop_filter_program)
+ clReleaseProgram(cl_data.loop_filter_program);
+
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_horizontal_edge_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_vertical_edge_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_horizontal_edge_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_vertical_edge_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_horizontal_edge_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_vertical_edge_kernel);
+
+ cl_data.loop_filter_program = NULL;
+}
+
+
+void vp8_loop_filter_set_baselines_cl(MACROBLOCKD *mbd, int default_filt_lvl, int *baseline_filter_level){
+ int alt_flt_enabled = mbd->segmentation_enabled;
+ int i;
+
+ if (alt_flt_enabled)
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ {
+ /* Abs value */
+ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+ baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ /* Delta Value */
+ else
+ {
+ baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
+ baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < MAX_MB_SEGMENTS; i++)
+ baseline_filter_level[i] = default_filt_lvl;
+ }
+}
+
+void vp8_loop_filter_frame_cl
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+)
+{
+ YV12_BUFFER_CONFIG *post = cm->frame_to_show;
+ loop_filter_info *lfi = cm->lf_info;
+ FRAME_TYPE frame_type = cm->frame_type;
+ LOOPFILTERTYPE filter_type = cm->filter_type;
+
+ int mb_row;
+ int mb_col;
+
+ int baseline_filter_level[MAX_MB_SEGMENTS];
+ int filter_level;
+ int alt_flt_enabled = mbd->segmentation_enabled;
+
+ int err;
+ unsigned char *buf_base;
+ int y_off, u_off, v_off;
+ //unsigned char *y_ptr, *u_ptr, *v_ptr;
+
+ mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
+
+ /* Note the baseline filter values for each segment */
+ vp8_loop_filter_set_baselines_cl(mbd, default_filt_lvl, baseline_filter_level);
+
+ /* Initialize the loop filter for this frame. */
+ if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
+ vp8_init_loop_filter_cl(cm);
+ else if (frame_type != cm->last_frame_type)
+ vp8_frame_init_loop_filter_cl(lfi, frame_type);
+
+ /* Set up the buffer pointers */
+
+ buf_base = post->buffer_alloc;
+ y_off = post->y_buffer - buf_base;
+ u_off = post->u_buffer - buf_base;
+ v_off = post->v_buffer - buf_base;
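+ /* The Y, U and V planes share one allocation (post->buffer_alloc), so the
+ * kernels address them as offsets into a single cl_mem rather than as
+ * separate host pointers. */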
+
+ VP8_CL_SET_BUF(mbd->cl_commands, post->buffer_mem, post->buffer_size, post->buffer_alloc,
+ vp8_loop_filter_frame(cm,mbd,default_filt_lvl),);
+
+ /* vp8_filter each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
+
+ filter_level = baseline_filter_level[Segment];
+
+ /* Distance of the MB to the various image edges.
+ * These are specified in 1/8th-pel units because they are always
+ * compared against values in 1/8th-pel units.
+ * Apply any context-driven MB-level adjustment.
+ */
+ filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
+
+ if (filter_level)
+ {
+ if (mb_col > 0){
+ if (filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_mbv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ else
+ vp8_loop_filter_mbvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ }
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0){
+ if (filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_bv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ else
+ vp8_loop_filter_bvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ }
+
+ /* don't apply across umv border */
+ if (mb_row > 0){
+ if (filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_mbh_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ else
+ vp8_loop_filter_mbhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ }
+
+ if (mbd->mode_info_context->mbmi.dc_diff > 0){
+ if (filter_type == NORMAL_LOOPFILTER)
+ vp8_loop_filter_bh_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ else
+ vp8_loop_filter_bhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
+ }
+ }
+
+ y_off += 16;
+ u_off += 8;
+ v_off += 8;
+
+ mbd->mode_info_context++; /* step to next MB */
+ }
+
+ y_off += post->y_stride * 16 - post->y_width;
+ u_off += post->uv_stride * 8 - post->uv_width;
+ v_off += post->uv_stride * 8 - post->uv_width;
+
+ mbd->mode_info_context++; /* Skip border mb */
+ }
+
+ //Retrieve the filtered frame from the CL device (non-blocking read; VP8_CL_FINISH below waits for completion)
+ err = clEnqueueReadBuffer(mbd->cl_commands, post->buffer_mem, CL_FALSE, 0, post->buffer_size, post->buffer_alloc, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS(mbd->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read loop filter output!\n",
+ ,
+ );
+
+ VP8_CL_FINISH(mbd->cl_commands);
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef loopfilter_cl_h
+#define loopfilter_cl_h
+
+#include "../../../vpx_ports/mem.h"
+
+#include "../onyxc_int.h"
+#include "blockd_cl.h"
+#include "../loopfilter.h"
+
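+/* count is given in 8-pixel units (2 for a 16-pixel luma edge, 1 for an
+ * 8-pixel chroma edge); block_cnt sets the second global work dimension when
+ * the kernel is enqueued. */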
+#define prototype_loopfilter_cl(sym) \
+ void sym(MACROBLOCKD*, cl_mem src_base, int src_offset, \
+ int pitch, const signed char *flimit, \
+ const signed char *limit, const signed char *thresh, int count, int block_cnt)
+
+#define prototype_loopfilter_block_cl(sym) \
+ void sym(MACROBLOCKD*, unsigned char *y, unsigned char *u, unsigned char *v,\
+ int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
+
+extern void vp8_loop_filter_frame_cl
+(
+ VP8_COMMON *cm,
+ MACROBLOCKD *mbd,
+ int default_filt_lvl
+);
+
+extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_normal_b_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_h_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_normal_b_h_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_b_v_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_h_cl);
+extern prototype_loopfilter_block_cl(vp8_lf_simple_b_h_cl);
+
+typedef prototype_loopfilter_block_cl((*vp8_lf_block_cl_fn_t));
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
+#include <stdio.h>
+
+#include "vpx_ports/config.h"
+#include "vp8_opencl.h"
+#include "blockd_cl.h"
+
+//#include "loopfilter_cl.h"
+//#include "../onyxc_int.h"
+
+typedef unsigned char uc;
+
+static void vp8_loop_filter_cl_run(
+ cl_command_queue cq,
+ cl_kernel kernel,
+ cl_mem buf_mem,
+ int s_off,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count,
+ int block_cnt
+){
+ size_t global[] = {count,block_cnt};
+ int err;
+
+ cl_mem flimit_mem;
+ cl_mem limit_mem;
+ cl_mem thresh_mem;
+
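+ /* flimit/limit/thresh are 16-entry per-pixel arrays (filled in
+ * vp8_init_loop_filter_cl); fresh device buffers are created for them on
+ * every call and released below. */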
+ VP8_CL_CREATE_BUF(cq, flimit_mem, , sizeof(uc)*16, flimit,, );
+ VP8_CL_CREATE_BUF(cq, limit_mem, , sizeof(uc)*16, limit,, );
+ VP8_CL_CREATE_BUF(cq, thresh_mem, , sizeof(uc)*16, thresh,, );
+
+ err = 0;
+ err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &buf_mem);
+ err |= clSetKernelArg(kernel, 1, sizeof (cl_int), &s_off);
+ err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &p);
+ err |= clSetKernelArg(kernel, 3, sizeof (cl_mem), &flimit_mem);
+ err |= clSetKernelArg(kernel, 4, sizeof (cl_mem), &limit_mem);
+ err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &thresh_mem);
+ err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &block_cnt);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel(cq, kernel, 2, NULL, global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+
+ clReleaseMemObject(flimit_mem);
+ clReleaseMemObject(limit_mem);
+ clReleaseMemObject(thresh_mem);
+
+ VP8_CL_FINISH(cq);
+}
+
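+/* The wrappers below pick which kernel to launch and convert count from
+ * 8-pixel units into a pixel count for the global work size. */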
+void vp8_loop_filter_horizontal_edge_cl
+(
+ MACROBLOCKD *x,
+ cl_mem s_base,
+ int s_off,
+ int p, /* pitch */
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count,
+ int block_cnt
+)
+{
+ vp8_loop_filter_cl_run(x->cl_commands,
+ cl_data.vp8_loop_filter_horizontal_edge_kernel, s_base, s_off,
+ p, flimit, limit, thresh, count*8, block_cnt
+ );
+}
+
+void vp8_loop_filter_vertical_edge_cl
+(
+ MACROBLOCKD *x,
+ cl_mem s_base,
+ int s_off,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count,
+ int block_cnt
+)
+{
+ vp8_loop_filter_cl_run(x->cl_commands,
+ cl_data.vp8_loop_filter_vertical_edge_kernel, s_base, s_off,
+ p, flimit, limit, thresh, count*8, block_cnt
+ );
+}
+
+void vp8_mbloop_filter_horizontal_edge_cl
+(
+ MACROBLOCKD *x,
+ cl_mem s_base,
+ int s_off,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count,
+ int block_cnt
+)
+{
+ vp8_loop_filter_cl_run(x->cl_commands,
+ cl_data.vp8_mbloop_filter_horizontal_edge_kernel, s_base, s_off,
+ p, flimit, limit, thresh, count*8, block_cnt
+ );
+}
+
+
+void vp8_mbloop_filter_vertical_edge_cl
+(
+ MACROBLOCKD *x,
+ cl_mem s_base,
+ int s_off,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count,
+ int block_cnt
+)
+{
+ vp8_loop_filter_cl_run(x->cl_commands,
+ cl_data.vp8_mbloop_filter_vertical_edge_kernel, s_base, s_off,
+ p, flimit, limit, thresh, count*8, block_cnt
+ );
+}
+
+void vp8_loop_filter_simple_horizontal_edge_cl
+(
+ MACROBLOCKD *x,
+ cl_mem s_base,
+ int s_off,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count,
+ int block_cnt
+)
+{
+ vp8_loop_filter_cl_run(x->cl_commands,
+ cl_data.vp8_loop_filter_simple_horizontal_edge_kernel, s_base, s_off,
+ p, flimit, limit, thresh, count*8, block_cnt
+ );
+}
+
+void vp8_loop_filter_simple_vertical_edge_cl
+(
+ MACROBLOCKD *x,
+ cl_mem s_base,
+ int s_off,
+ int p,
+ const signed char *flimit,
+ const signed char *limit,
+ const signed char *thresh,
+ int count,
+ int block_cnt
+)
+{
+ vp8_loop_filter_cl_run(x->cl_commands,
+ cl_data.vp8_loop_filter_simple_vertical_edge_kernel, s_base, s_off,
+ p, flimit, limit, thresh, count*8, block_cnt
+ );
+}
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/config.h"
+#include "../subpixel.h"
+#include "subpixel_cl.h"
+#include "../onyxc_int.h"
+#include "vp8_opencl.h"
+
+#if HAVE_DLOPEN
+#include "dynamic_cl.h"
+#endif
+
+void vp8_arch_opencl_common_init(VP8_COMMON *ctx)
+{
+
+#if HAVE_DLOPEN
+
+#if WIN32 //Windows .dll has no lib prefix and no extension
+ cl_loaded = load_cl("OpenCL");
+#else //But *nix needs full name
+ cl_loaded = load_cl("libOpenCL.so");
+#endif
+
+ if (cl_loaded == CL_SUCCESS)
+ cl_initialized = cl_common_init();
+ else
+ cl_initialized = VP8_CL_TRIED_BUT_FAILED;
+
+#else //!HAVE_DLOPEN (e.g. Apple)
+ cl_initialized = cl_common_init();
+#endif
+
+}
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+//For the decoder, all subpixel prediction is done in this file.
+//
+//TODO: determine a mechanism for easily selecting SIXTAP/BILINEAR and deciding
+//what arguments to feed into the kernels. These kernels SHOULD be 2-pass,
+//and ideally a data structure would determine which static arguments to pass in.
+//
+//Also, the only external functions called here are the subpixel prediction
+//functions, which should mean there is no need to worry about when to copy data
+//back and forth.
+
+#include "../../../vpx_ports/config.h"
+//#include "../recon.h"
+#include "../subpixel.h"
+//#include "../blockd.h"
+//#include "../reconinter.h"
+#if CONFIG_RUNTIME_CPU_DETECT
+//#include "../onyxc_int.h"
+#endif
+
+#include "vp8_opencl.h"
+#include "filter_cl.h"
+#include "reconinter_cl.h"
+#include "blockd_cl.h"
+
+#include <stdio.h>
+
+/* use this define on systems where unaligned int reads and writes are
+ * not allowed, i.e. ARM architectures
+ */
+/*#define MUST_BE_ALIGNED*/
+
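+/* Raster index of the upper-left 4x4 block in each 8x8 luma partition */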
+static const int bbb[4] = {0, 2, 8, 10};
+
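+/* Host-side strided copy; the device-side paths in vp8_copy_mem_cl() perform
+ * the equivalent copy with either the memcpy kernel or clEnqueueCopyBuffer. */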
+static void vp8_memcpy(
+ unsigned char *src_base,
+ int src_offset,
+ int src_stride,
+ unsigned char *dst_base,
+ int dst_offset,
+ int dst_stride,
+ int num_bytes,
+ int num_iter
+){
+
+ int i,r;
+ unsigned char *src = &src_base[src_offset];
+ unsigned char *dst = &dst_base[dst_offset];
+ src_offset = dst_offset = 0;
+
+ for (r = 0; r < num_iter; r++){
+ for (i = 0; i < num_bytes; i++){
+ src_offset = r*src_stride + i;
+ dst_offset = r*dst_stride + i;
+ dst[dst_offset] = src[src_offset];
+ }
+ }
+}
+
+static void vp8_copy_mem_cl(
+ cl_command_queue cq,
+ cl_mem src_mem,
+ int *src_offsets,
+ int src_stride,
+ cl_mem dst_mem,
+ int *dst_offsets,
+ int dst_stride,
+ int num_bytes,
+ int num_iter,
+ int num_blocks
+){
+
+ int err,block;
+
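+ /* Two paths: with MEM_COPY_KERNEL a dedicated memcpy kernel copies each
+ * block in one launch; otherwise each row is copied with
+ * clEnqueueCopyBuffer. */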
+#if MEM_COPY_KERNEL
+ size_t global[3] = {num_bytes, num_iter, num_blocks};
+
+ size_t local[3];
+ local[0] = global[0];
+ local[1] = global[1];
+ local[2] = global[2];
+
+ err = clSetKernelArg(cl_data.vp8_memcpy_kernel, 0, sizeof (cl_mem), &src_mem);
+ err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 2, sizeof (int), &src_stride);
+ err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 3, sizeof (cl_mem), &dst_mem);
+ err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 5, sizeof (int), &dst_stride);
+ err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 6, sizeof (int), &num_bytes);
+ err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 7, sizeof (int), &num_iter);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ return,
+ );
+
+ for (block = 0; block < num_blocks; block++){
+
+ /* Set kernel arguments */
+ err = clSetKernelArg(cl_data.vp8_memcpy_kernel, 1, sizeof (int), &src_offsets[block]);
+ err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 4, sizeof (int), &dst_offsets[block]);
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ return,
+ );
+
+ /* Execute the kernel */
+ if (num_bytes * num_iter > cl_data.vp8_memcpy_kernel_size){
+ err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, NULL , 0, NULL, NULL);
+ } else {
+ err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, local , 0, NULL, NULL);
+ }
+
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ return,
+ );
+ }
+#else
+ int iter;
+ for (block=0; block < num_blocks; block++){
+ for (iter = 0; iter < num_iter; iter++){
+ err = clEnqueueCopyBuffer(cq, src_mem, dst_mem,
+ src_offsets[block]+iter*src_stride,
+ dst_offsets[block]+iter*dst_stride,
+ num_bytes, 0, NULL, NULL
+ );
+ VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, "Error copying between buffers\n",
+ ,
+ );
+ }
+ }
+#endif
+}
+
+static void vp8_build_inter_predictors_b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+ unsigned char *ptr_base = *(d->base_pre);
+ int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ vp8_subpix_cl_fn_t sppf;
+
+ int pre_dist = *d->base_pre - x->pre.buffer_alloc;
+ cl_mem pre_mem = x->pre.buffer_mem;
+ int pre_off = pre_dist+ptr_offset;
+
+ if (d->sixtap_filter == CL_TRUE)
+ sppf = vp8_sixtap_predict4x4_cl;
+ else
+ sppf = vp8_bilinear_predict4x4_cl;
+
+ //ptr_base a.k.a. d->base_pre is the start of the
+ //Macroblock's y_buffer, u_buffer, or v_buffer
+
+ if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+ {
+ sppf(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
+ }
+ else
+ {
+ vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride,d->cl_predictor_mem, &d->predictor_offset,pitch,4,4,1);
+ }
+}
+
+
+static void vp8_build_inter_predictors4b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+ unsigned char *ptr_base = *(d->base_pre);
+ int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ int pre_dist = *d->base_pre - x->pre.buffer_alloc;
+ cl_mem pre_mem = x->pre.buffer_mem;
+ int pre_off = pre_dist + ptr_offset;
+
+ //If the motion vector has a fractional (1/8-pel) component, subpixel prediction is needed
+ if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+ {
+ if (d->sixtap_filter == CL_TRUE)
+ vp8_sixtap_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
+ else
+ vp8_bilinear_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
+ }
+ //Otherwise copy memory directly from src to dest
+ else
+ {
+ vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride, d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 8, 1);
+ }
+
+
+}
+
+static void vp8_build_inter_predictors2b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
+{
+ unsigned char *ptr_base = *(d->base_pre);
+
+ int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ int pre_dist = *d->base_pre - x->pre.buffer_alloc;
+ cl_mem pre_mem = x->pre.buffer_mem;
+ int pre_off = pre_dist+ptr_offset;
+
+ if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+ {
+ if (d->sixtap_filter == CL_TRUE)
+ vp8_sixtap_predict8x4_cl(d->cl_commands,ptr_base,pre_mem,pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
+ else
+ vp8_bilinear_predict8x4_cl(d->cl_commands,ptr_base,pre_mem,pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
+ }
+ else
+ {
+ vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride, d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 4, 1);
+ }
+}
+
+
+void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x)
+{
+ int i;
+
+ vp8_cl_mb_prep(x, PREDICTOR|PRE_BUF);
+
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(x->cl_commands);
+#endif
+
+ if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
+ x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+
+ unsigned char *pred_base = x->predictor;
+ int upred_offset = 256;
+ int vpred_offset = 320;
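+ /* x->predictor layout: 16x16 Y plane (bytes 0-255), followed by the 8x8 U
+ * and V blocks at offsets 256 and 320. */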
+
+ int mv_row = x->block[16].bmi.mv.as_mv.row;
+ int mv_col = x->block[16].bmi.mv.as_mv.col;
+ int offset;
+
+ unsigned char *pre_base = x->pre.buffer_alloc;
+ cl_mem pre_mem = x->pre.buffer_mem;
+ int upre_off = x->pre.u_buffer - pre_base;
+ int vpre_off = x->pre.v_buffer - pre_base;
+ int pre_stride = x->block[16].pre_stride;
+
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){
+ vp8_sixtap_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+ vp8_sixtap_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+ }
+ else{
+ vp8_bilinear_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+ vp8_bilinear_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+ }
+ }
+ else
+ {
+ int pre_offsets[2] = {upre_off+offset, vpre_off+offset};
+ int pred_offsets[2] = {upred_offset,vpred_offset};
+ vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, x->cl_predictor_mem, pred_offsets, 8, 8, 8, 2);
+ }
+ }
+ else
+ {
+ // Can probably batch these operations as well, but this path is not
+ // exercised in the decoder (or at least not by the test videos I've been using).
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ vp8_build_inter_predictors2b_cl(x, d0, 8);
+ else
+ {
+ vp8_build_inter_predictors_b_cl(x, d0, 8);
+ vp8_build_inter_predictors_b_cl(x, d1, 8);
+ }
+ }
+ }
+
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(x->block[0].cl_commands);
+ VP8_CL_FINISH(x->block[16].cl_commands);
+ VP8_CL_FINISH(x->block[20].cl_commands);
+#endif
+
+ vp8_cl_mb_finish(x, PREDICTOR);
+}
+
+void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x)
+{
+ //If CL is running in the encoder, the following needs to be called before proceeding.
+ //vp8_cl_mb_prep(x, PRE_BUF);
+
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(x->cl_commands);
+#endif
+
+ if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
+ x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ int offset;
+ unsigned char *pred_base = x->predictor;
+ int upred_offset = 256;
+ int vpred_offset = 320;
+
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->block[0].pre_stride;
+
+ unsigned char *pre_base = x->pre.buffer_alloc;
+ cl_mem pre_mem = x->pre.buffer_mem;
+ int ypre_off = x->pre.y_buffer - pre_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
+ int upre_off = x->pre.u_buffer - pre_base;
+ int vpre_off = x->pre.v_buffer - pre_base;
+
+ if ((mv_row | mv_col) & 7)
+ {
+ if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){
+ vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16);
+ }
+ else
+ vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16);
+ }
+ else
+ {
+ //16x16 copy
+ int pred_off = 0;
+ vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &ypre_off, pre_stride, x->cl_predictor_mem, &pred_off, 16, 16, 16, 1);
+ }
+
+
+ mv_row = x->block[16].bmi.mv.as_mv.row;
+ mv_col = x->block[16].bmi.mv.as_mv.col;
+ pre_stride >>= 1;
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ if (x->sixtap_filter == CL_TRUE){
+ vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+ vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+ }
+ else {
+ vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
+ vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
+ }
+ }
+ else
+ {
+ int pre_off = upre_off + offset;
+ vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &upred_offset, 8, 8, 8, 1);
+ pre_off = vpre_off + offset;
+ vp8_copy_mem_cl(x->block[20].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &vpred_offset, 8, 8, 8, 1);
+ }
+ }
+ else
+ {
+ int i;
+
+ if (x->mode_info_context->mbmi.partitioning < 3)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ BLOCKD *d = &x->block[bbb[i]];
+ vp8_build_inter_predictors4b_cl(x, d, 16);
+ }
+ }
+ else
+ {
+ /* This loop can be done in any order... No dependencies.*/
+ /* Also, d0/d1 can be decoded simultaneously */
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ vp8_build_inter_predictors2b_cl(x, d0, 16);
+ else
+ {
+ vp8_build_inter_predictors_b_cl(x, d0, 16);
+ vp8_build_inter_predictors_b_cl(x, d1, 16);
+ }
+ }
+ }
+
+ /* Another case of re-orderable/batchable loop */
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ vp8_build_inter_predictors2b_cl(x, d0, 8);
+ else
+ {
+ vp8_build_inter_predictors_b_cl(x, d0, 8);
+ vp8_build_inter_predictors_b_cl(x, d1, 8);
+ }
+ }
+ }
+
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(x->block[0].cl_commands);
+ VP8_CL_FINISH(x->block[16].cl_commands);
+ VP8_CL_FINISH(x->block[20].cl_commands);
+#endif
+
+ vp8_cl_mb_finish(x, PREDICTOR);
+}
+
+
+/* The following functions are written for skip_recon_mb() to call. Since there
+ * is no recon in this situation, we can write the result directly to the dst
+ * buffer instead of writing it to the predictor buffer and then copying it to
+ * the dst buffer.
+ */
+static void vp8_build_inter_predictors_b_s_cl(MACROBLOCKD *x, BLOCKD *d, int dst_offset)
+{
+ unsigned char *ptr_base = *(d->base_pre);
+ int dst_stride = d->dst_stride;
+ int pre_stride = d->pre_stride;
+ int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+ vp8_subpix_cl_fn_t sppf;
+
+ int pre_dist = *d->base_pre - x->pre.buffer_alloc;
+ cl_mem pre_mem = x->pre.buffer_mem;
+ cl_mem dst_mem = x->dst.buffer_mem;
+
+ if (d->sixtap_filter == CL_TRUE){
+ sppf = vp8_sixtap_predict4x4_cl;
+ } else
+ sppf = vp8_bilinear_predict4x4_cl;
+
+ if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+ {
+ sppf(d->cl_commands, ptr_base, pre_mem, pre_dist+ptr_offset, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, NULL, dst_mem, dst_offset, dst_stride);
+ }
+ else
+ {
+ int pre_off = pre_dist+ptr_offset;
+ vp8_copy_mem_cl(d->cl_commands, pre_mem,&pre_off,pre_stride, dst_mem, &dst_offset,dst_stride,4,4,1);
+ }
+}
+
+
+void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x)
+{
+ cl_mem dst_mem = NULL;
+ cl_mem pre_mem = x->pre.buffer_mem;
+
+ unsigned char *dst_base = x->dst.buffer_alloc;
+ int ydst_off = x->dst.y_buffer - dst_base;
+ int udst_off = x->dst.u_buffer - dst_base;
+ int vdst_off = x->dst.v_buffer - dst_base;
+
+ dst_mem = x->dst.buffer_mem;
+ vp8_cl_mb_prep(x, DST_BUF);
+
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(x->cl_commands);
+#endif
+
+ if (x->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ int offset;
+ unsigned char *pre_base = x->pre.buffer_alloc;
+ int ypre_off = x->pre.y_buffer - pre_base;
+ int upre_off = x->pre.u_buffer - pre_base;
+ int vpre_off = x->pre.v_buffer - pre_base;
+
+ int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+ int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+ int pre_stride = x->dst.y_stride;
+
+ int ptr_offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ if (x->sixtap_filter == CL_TRUE){
+ vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+ }
+ else
+ vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+ }
+ else
+ {
+ int pre_off = ypre_off+ptr_offset;
+ vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 16, 16, 1);
+ }
+
+ mv_row = x->block[16].bmi.mv.as_mv.row;
+ mv_col = x->block[16].bmi.mv.as_mv.col;
+ pre_stride >>= 1;
+ offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+
+ if ((mv_row | mv_col) & 7)
+ {
+ if (x->sixtap_filter == CL_TRUE){
+ vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
+ vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
+ } else {
+ vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
+ vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
+ }
+ }
+ else
+ {
+ int pre_offsets[2] = {upre_off+offset, vpre_off+offset};
+ int dst_offsets[2] = {udst_off,vdst_off};
+ vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, dst_mem, dst_offsets, x->dst.uv_stride, 8, 8, 2);
+ }
+
+ }
+ else
+ {
+ /* note: this whole ELSE branch is never executed, so there is no way to
+ * test the correctness of this modification. If something goes wrong
+ * later, fall back to the version in build_inter_predictors_mb.
+ *
+ * ACW: Not sure who the above comment belongs to, but it is accurate
+ * for the decoder; verified by a reverse trace of the source.
+ */
+ int i;
+
+ if (x->mode_info_context->mbmi.partitioning < 3)
+ {
+ for (i = 0; i < 4; i++)
+ {
+ BLOCKD *d = &x->block[bbb[i]];
+
+ {
+ unsigned char *ptr_base = *(d->base_pre);
+ int pre_off = ptr_base - x->pre.buffer_alloc;
+
+ int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ pre_off += ptr_offset;
+
+ if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
+ {
+ if (x->sixtap_filter == CL_TRUE)
+ vp8_sixtap_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+ else
+ vp8_bilinear_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+ }
+ else
+ {
+ vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 8, 1);
+ }
+ }
+ }
+ }
+ else
+ {
+ for (i = 0; i < 16; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ {
+ /*vp8_build_inter_predictors2b(x, d0, 16);*/
+ unsigned char *ptr_base = *(d0->base_pre);
+
+ int pre_off = ptr_base - x->pre.buffer_alloc;
+
+ int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+ pre_off += ptr_offset;
+
+ if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
+ {
+ if (d0->sixtap_filter == CL_TRUE)
+ vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+ else
+ vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem,pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
+ }
+ else
+ {
+ vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d0->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 4, 1);
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
+ vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
+ }
+ }
+ }
+
+ for (i = 16; i < 24; i += 2)
+ {
+ BLOCKD *d0 = &x->block[i];
+ BLOCKD *d1 = &x->block[i+1];
+
+ if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+ {
+ /*vp8_build_inter_predictors2b(x, d0, 8);*/
+ unsigned char *ptr_base = *(d0->base_pre);
+ int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
+ int pre_off = ptr_base - x->pre.buffer_alloc + ptr_offset;
+
+ if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
+ {
+ if (d0->sixtap_filter == CL_TRUE)
+ vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
+ d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
+ dst_base, dst_mem, ydst_off, x->dst.uv_stride);
+ else
+ vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
+ d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
+ dst_base, dst_mem, ydst_off, x->dst.uv_stride);
+ }
+ else
+ {
+ vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off,
+ d0->pre_stride, dst_mem, &ydst_off, x->dst.uv_stride, 8, 4, 1);
+ }
+ }
+ else
+ {
+ vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
+ vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
+ }
+ } //end for
+ }
+
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(x->block[0].cl_commands);
+ VP8_CL_FINISH(x->block[16].cl_commands);
+ VP8_CL_FINISH(x->block[20].cl_commands);
+#endif
+
+ vp8_cl_mb_finish(x, DST_BUF);
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RECONINTER_CL_H
+#define __INC_RECONINTER_CL_H
+
+#include "blockd_cl.h"
+#include "subpixel_cl.h"
+#include "filter_cl.h"
+
+extern void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x);
+extern void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x);
+
+extern void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x);
+//extern void vp8_build_inter_predictors_b_cl(BLOCKD *d, int pitch);
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef SUBPIXEL_CL_H
+#define SUBPIXEL_CL_H
+
+#include "../blockd.h"
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+
+#define prototype_subpixel_predict_cl(sym) \
+ void sym(cl_command_queue cq, unsigned char *src_base, cl_mem src_mem, int src_offset, \
+ int src_pitch, int xofst, int yofst, \
+ unsigned char *dst_base, cl_mem dst_mem, int dst_offset, int dst_pitch)
+
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict16x16_cl);
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x8_cl);
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x4_cl);
+extern prototype_subpixel_predict_cl(vp8_sixtap_predict4x4_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict16x16_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x8_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x4_cl);
+extern prototype_subpixel_predict_cl(vp8_bilinear_predict4x4_cl);
+
+typedef prototype_subpixel_predict_cl((*vp8_subpix_cl_fn_t));
+
+//typedef enum
+//{
+// SIXTAP = 0,
+// BILINEAR = 1
+//} SUBPIX_TYPE;
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include "vp8_opencl.h"
+
+int cl_initialized = VP8_CL_NOT_INITIALIZED;
+VP8_COMMON_CL cl_data;
+
+//Initialization functions for various CL programs.
+extern int cl_init_filter();
+extern int cl_init_idct();
+extern int cl_init_loop_filter();
+
+//Common CL destructors
+extern void cl_destroy_loop_filter();
+extern void cl_destroy_filter();
+extern void cl_destroy_idct();
+
+//Destructors for encoder/decoder-specific bits
+extern void cl_decode_destroy();
+extern void cl_encode_destroy();
+
+/**
+ * Release all OpenCL resources (kernels, programs, command queue, context).
+ * @param cq command queue to drain and release; may be NULL
+ * @param new_status value stored in cl_initialized after teardown
+ */
+void cl_destroy(cl_command_queue cq, int new_status) {
+
+ if (cl_initialized != CL_SUCCESS)
+ return;
+
+ //Wait on any pending operations to complete... frees up all of our pointers
+ if (cq != NULL)
+ clFinish(cq);
+
+#if ENABLE_CL_SUBPIXEL
+ //Release the objects that we've allocated on the GPU
+ cl_destroy_filter();
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+ cl_destroy_idct();
+
+#if CONFIG_VP8_DECODER
+ if (cl_data.cl_decode_initialized == CL_SUCCESS)
+ cl_decode_destroy();
+#endif
+
+#endif
+#if ENABLE_CL_LOOPFILTER
+ cl_destroy_loop_filter();
+#endif
+
+
+#if CONFIG_VP8_ENCODER
+ //placeholder for if/when encoder CL gets implemented
+#endif
+
+ if (cq){
+ clReleaseCommandQueue(cq);
+ }
+
+ if (cl_data.context){
+ clReleaseContext(cl_data.context);
+ cl_data.context = NULL;
+ }
+
+ cl_initialized = new_status;
+
+ return;
+}
+
+/**
+ * Query the type of an OpenCL device.
+ * @param dev device to query
+ * @return the device's CL_DEVICE_TYPE_* value, or CL_INVALID_DEVICE if the query fails
+ */
+ */
+cl_device_type device_type(cl_device_id dev){
+ cl_device_type type;
+ int err;
+
+ err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(type),&type,NULL);
+ if (err != CL_SUCCESS)
+ return CL_INVALID_DEVICE;
+ return type;
+}
+
+/**
+ * Find a usable OpenCL device, create the shared context and build the kernels.
+ * @return CL_SUCCESS on success, VP8_CL_TRIED_BUT_FAILED (or a kernel
+ *         initialization error) otherwise
+ */
+int cl_common_init() {
+ int err,i,dev;
+ cl_platform_id platform_ids[MAX_NUM_PLATFORMS];
+ cl_uint num_found, num_devices;
+ cl_device_id devices[MAX_NUM_DEVICES];
+
+ //Don't allow multiple CL contexts..
+ if (cl_initialized != VP8_CL_NOT_INITIALIZED)
+ return cl_initialized;
+
+ // Connect to a compute device
+ err = clGetPlatformIDs(MAX_NUM_PLATFORMS, platform_ids, &num_found);
+
+ if (err != CL_SUCCESS) {
+ fprintf(stderr, "Couldn't query platform IDs\n");
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ if (num_found == 0) {
+ fprintf(stderr, "No platforms found\n");
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ //printf("Enumerating %d platform(s)\n", num_found);
+ //Enumerate the platforms found
+ for (i = 0; i < num_found; i++){
+ char buf[2048];
+ size_t len;
+
+ err = clGetPlatformInfo( platform_ids[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, &len);
+ if (err != CL_SUCCESS){
+ fprintf(stderr, "Error retrieving platform vendor for platform %d",i);
+ continue;
+ }
+ //printf("Platform %d: %s\n",i,buf);
+
+ //If you need to force a platform (e.g. CPU-only testing), uncomment this
+ //if (strstr(buf,"NVIDIA"))
+ // continue;
+
+ //Try to find a valid compute device
+ //Favor the GPU, but fall back to any other available device if necessary
+#ifdef __APPLE__
+ printf("Apple system. Running CL as CPU-only for now...\n");
+ err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, MAX_NUM_DEVICES, devices, &num_devices);
+#else
+ err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, MAX_NUM_DEVICES, devices, &num_devices);
+#endif //__APPLE__
+ //printf("found %d devices\n", num_devices);
+ cl_data.device_id = NULL;
+ for( dev = 0; dev < num_devices; dev++ ){
+ char ext[2048];
+ //Get info for this device.
+ err = clGetDeviceInfo(devices[dev], CL_DEVICE_EXTENSIONS,
+ sizeof(ext),ext,NULL);
+ VP8_CL_CHECK_SUCCESS(NULL,err != CL_SUCCESS,
+ "Error retrieving device extension list",continue, 0);
+ //printf("Device %d supports: %s\n",dev,ext);
+
+ //The VP8 kernels require byte-addressable stores
+ //(cl_khr_byte_addressable_store), an extension in OpenCL 1.0 that became
+ //core in OpenCL 1.1. Not all devices support it.
+ if (strstr(ext,"cl_khr_byte_addressable_store")){
+ //We found a valid device, so use it. But if we find a GPU
+ //(maybe this is one), prefer that.
+ cl_data.device_id = devices[dev];
+
+ if ( device_type(devices[dev]) == CL_DEVICE_TYPE_GPU ){
+ //printf("Device %d is a GPU\n",dev);
+ break;
+ }
+ }
+ }
+
+ //If we've found a usable GPU, stop looking.
+ if (cl_data.device_id != NULL && device_type(cl_data.device_id) == CL_DEVICE_TYPE_GPU )
+ break;
+
+ }
+
+ if (cl_data.device_id == NULL){
+ printf("Error: Failed to find a valid OpenCL device. Using CPU paths\n");
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ // Create the compute context
+ cl_data.context = clCreateContext(0, 1, &cl_data.device_id, NULL, NULL, &err);
+ if (!cl_data.context) {
+ printf("Error: Failed to create a compute context!\n");
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ //Initialize programs to NULL so we can later tell whether each one has
+ //been built.
+ cl_data.filter_program = NULL;
+ cl_data.idct_program = NULL;
+ cl_data.loop_filter_program = NULL;
+
+#if ENABLE_CL_SUBPIXEL
+ err = cl_init_filter();
+ if (err != CL_SUCCESS)
+ return err;
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+ err = cl_init_idct();
+ if (err != CL_SUCCESS)
+ return err;
+#endif
+
+#if ENABLE_CL_LOOPFILTER
+
+ err = cl_init_loop_filter();
+ if (err != CL_SUCCESS)
+ return err;
+#endif
+
+ return CL_SUCCESS;
+}
+
+char *cl_read_file(const char* file_name) {
+ long pos;
+ char *bytes;
+ size_t amt_read;
+ FILE *f;
+
+ f = fopen(file_name, "rb");
+
+ if (f == NULL) {
+ char *fullpath;
+ //printf("Couldn't find %s\n", file_name);
+
+ //Generate a file path for the CL sources using the library install dir
+ fullpath = malloc(strlen(vpx_codec_lib_dir()) + strlen(file_name) + 2);
+ if (fullpath == NULL) {
+ return NULL;
+ }
+ strcpy(fullpath, vpx_codec_lib_dir());
+ strcat(fullpath, "/"); //Will need to be changed for MSVS
+ strcat(fullpath, file_name);
+
+ //printf("Looking in %s\n", fullpath);
+
+ f = fopen(fullpath, "rb");
+ if (f == NULL) {
+ fprintf(stderr,"Couldn't find CL source at %s or %s\n", file_name, fullpath);
+ free(fullpath);
+ return NULL;
+ }
+
+ //printf("Found cl source at %s\n", fullpath);
+ free(fullpath);
+ } else {
+ //printf("Found cl source at %s\n", file_name);
+ }
+
+ fseek(f, 0, SEEK_END);
+ pos = ftell(f);
+ fseek(f, 0, SEEK_SET);
+ bytes = malloc(pos+1);
+
+ if (bytes == NULL) {
+ fclose(f);
+ return NULL;
+ }
+
+ amt_read = fread(bytes, pos, 1, f);
+ if (amt_read != 1) {
+ free(bytes);
+ fclose(f);
+ return NULL;
+ }
+
+ bytes[pos] = '\0'; //null terminate the source string
+ fclose(f);
+
+
+ return bytes;
+}
+
+void show_build_log(cl_program *prog_ref){
+ size_t len;
+ char *buffer;
+ int err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
+
+ if (err != CL_SUCCESS){
+ printf("Error: Could not get length of CL build log\n");
+ return;
+ }
+
+ buffer = (char*) malloc(len);
+ if (buffer == NULL) {
+ printf("Error: Couldn't allocate compile output buffer memory\n");
+ return;
+ }
+
+ err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
+ if (err != CL_SUCCESS) {
+ printf("Error: Could not get CL build log\n");
+ } else {
+ printf("Compile output: %s\n", buffer);
+ }
+ free(buffer);
+}
+
+int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts) {
+
+ int err;
+ char *kernel_src = cl_read_file(file_name);
+
+ *prog_ref = NULL;
+ if (kernel_src != NULL) {
+ *prog_ref = clCreateProgramWithSource(cl_data.context, 1, (const char**)&kernel_src, NULL, &err);
+ free(kernel_src);
+ } else {
+ cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+ printf("Couldn't find OpenCL source files. \nUsing software path.\n");
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ if (*prog_ref == NULL || err != CL_SUCCESS) {
+ printf("Error: Couldn't create program (err = %d)\n", err);
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ /* Build the program executable */
+ err = clBuildProgram(*prog_ref, 0, NULL, opts, NULL, NULL);
+ if (err != CL_SUCCESS) {
+ printf("Error: Failed to build program executable for %s!\n", file_name);
+
+ show_build_log(prog_ref);
+
+ return VP8_CL_TRIED_BUT_FAILED;
+ }
+
+ return CL_SUCCESS;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_OPENCL_H
+#define VP8_OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../../../vpx_config.h"
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#if HAVE_DLOPEN
+#include "dynamic_cl.h"
+#endif
+
+#define ENABLE_CL_IDCT_DEQUANT 0
+#define ENABLE_CL_SUBPIXEL 1
+#define TWO_PASS_SIXTAP 0
+#define MEM_COPY_KERNEL 1
+#define ONE_CQ_PER_MB 1 //Value of 0 is racy... still experimental.
+#define ENABLE_CL_LOOPFILTER 0
+
+extern char *cl_read_file(const char* file_name);
+extern int cl_common_init();
+extern void cl_destroy(cl_command_queue cq, int new_status);
+extern int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts);
+
+#define MAX_NUM_PLATFORMS 4
+#define MAX_NUM_DEVICES 10
+
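+//Status of OpenCL support: cl_initialized is CL_SUCCESS once CL is usable,
+//VP8_CL_TRIED_BUT_FAILED if initialization was attempted and failed, and
+//VP8_CL_NOT_INITIALIZED before the first attempt.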
+#define VP8_CL_TRIED_BUT_FAILED 1
+#define VP8_CL_NOT_INITIALIZED -1
+extern int cl_initialized;
+
+extern const char *vpx_codec_lib_dir(void);
+
+#define VP8_CL_FINISH(cq) \
+ if (cl_initialized == CL_SUCCESS){ \
+ /* Wait for kernels to finish. */ \
+ clFinish(cq); \
+ }
+
+#define VP8_CL_BARRIER(cq) \
+ if (cl_initialized == CL_SUCCESS){ \
+ /* Insert a barrier into the command queue. */ \
+ clEnqueueBarrier(cq); \
+ }
+
+#define VP8_CL_CHECK_SUCCESS(cq,cond,msg,alt,retCode) \
+ if ( cond ){ \
+ fprintf(stderr, msg); \
+ cl_destroy(cq, VP8_CL_TRIED_BUT_FAILED); \
+ alt; \
+ return retCode; \
+ }
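+//Typical usage (illustrative): on failure the macro prints msg, tears down CL,
+//executes the fallback statement alt, and returns retCode from the caller
+//(retCode may be left empty inside a void function), e.g.:
+// VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS,
+// "Error: Failed to execute kernel!\n",
+// vp8_dequantize_b_c(d),
+// );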
+
+#define VP8_CL_CALC_LOCAL_SIZE(kernel, kernel_size) \
+ err = clGetKernelWorkGroupInfo( cl_data.kernel, \
+ cl_data.device_id, \
+ CL_KERNEL_WORK_GROUP_SIZE, \
+ sizeof(size_t), \
+ &cl_data.kernel_size, \
+ NULL);\
+ VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS, \
+ "Error: Failed to calculate local size of kernel!\n", \
+ ,\
+ VP8_CL_TRIED_BUT_FAILED \
+ ); \
+
+#define VP8_CL_CREATE_KERNEL(data,program,name,str_name) \
+ data.name = clCreateKernel(data.program, str_name , &err); \
+ VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS || !data.name, \
+ "Error: Failed to create compute kernel "#str_name"!\n", \
+ ,\
+ VP8_CL_TRIED_BUT_FAILED \
+ );
+
+#define VP8_CL_READ_BUF(cq, bufRef, bufSize, dstPtr) \
+ err = clEnqueueReadBuffer(cq, bufRef, CL_FALSE, 0, bufSize , dstPtr, 0, NULL, NULL); \
+ VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, \
+ "Error: Failed to read from GPU!\n",, err \
+ ); \
+
+#define VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode) \
+ { \
+ err = clEnqueueWriteBuffer(cq, bufRef, CL_FALSE, 0, \
+ bufSize, dataPtr, 0, NULL, NULL); \
+ \
+ VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, \
+ "Error: Failed to write to buffer!\n", \
+ altPath, retCode\
+ ); \
+ } \
+
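+//Note: bufType is currently unused; the buffer is always created
+//CL_MEM_READ_WRITE and any initial data is uploaded via VP8_CL_SET_BUF.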
+#define VP8_CL_CREATE_BUF(cq, bufRef, bufType, bufSize, dataPtr, altPath, retCode) \
+ bufRef = clCreateBuffer(cl_data.context, CL_MEM_READ_WRITE, bufSize, NULL, NULL); \
+ if (dataPtr != NULL && bufRef != NULL){ \
+ VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode)\
+ } \
+ VP8_CL_CHECK_SUCCESS(cq, !bufRef, \
+ "Error: Failed to allocate buffer. Using CPU path!\n", \
+ altPath, retCode\
+ ); \
+
+#define VP8_CL_RELEASE_KERNEL(kernel) \
+ if (kernel) \
+ clReleaseKernel(kernel); \
+ kernel = NULL;
+
+typedef struct VP8_COMMON_CL {
+ cl_device_id device_id; // compute device id
+ cl_context context; // compute context
+ //cl_command_queue commands; // compute command queue
+
+ cl_program filter_program; // compute program for subpixel/bilinear filters
+ cl_kernel vp8_sixtap_predict_kernel;
+ size_t vp8_sixtap_predict_kernel_size;
+ cl_kernel vp8_sixtap_predict8x4_kernel;
+ size_t vp8_sixtap_predict8x4_kernel_size;
+ cl_kernel vp8_sixtap_predict8x8_kernel;
+ size_t vp8_sixtap_predict8x8_kernel_size;
+ cl_kernel vp8_sixtap_predict16x16_kernel;
+ size_t vp8_sixtap_predict16x16_kernel_size;
+
+ cl_kernel vp8_bilinear_predict4x4_kernel;
+ cl_kernel vp8_bilinear_predict8x4_kernel;
+ cl_kernel vp8_bilinear_predict8x8_kernel;
+ cl_kernel vp8_bilinear_predict16x16_kernel;
+
+ cl_kernel vp8_filter_block2d_first_pass_kernel;
+ size_t vp8_filter_block2d_first_pass_kernel_size;
+ cl_kernel vp8_filter_block2d_second_pass_kernel;
+ size_t vp8_filter_block2d_second_pass_kernel_size;
+
+ cl_kernel vp8_filter_block2d_bil_first_pass_kernel;
+ size_t vp8_filter_block2d_bil_first_pass_kernel_size;
+ cl_kernel vp8_filter_block2d_bil_second_pass_kernel;
+ size_t vp8_filter_block2d_bil_second_pass_kernel_size;
+
+ cl_kernel vp8_memcpy_kernel;
+ size_t vp8_memcpy_kernel_size;
+ cl_kernel vp8_memset_short_kernel;
+
+ cl_program idct_program;
+ cl_kernel vp8_short_inv_walsh4x4_1_kernel;
+ cl_kernel vp8_short_inv_walsh4x4_1st_pass_kernel;
+ cl_kernel vp8_short_inv_walsh4x4_2nd_pass_kernel;
+ cl_kernel vp8_dc_only_idct_add_kernel;
+ //Note that the following 2 kernels are encoder-only. Not used in decoder.
+ cl_kernel vp8_short_idct4x4llm_1_kernel;
+ cl_kernel vp8_short_idct4x4llm_kernel;
+
+ cl_program loop_filter_program;
+ cl_kernel vp8_loop_filter_horizontal_edge_kernel;
+ cl_kernel vp8_loop_filter_vertical_edge_kernel;
+ cl_kernel vp8_mbloop_filter_horizontal_edge_kernel;
+ cl_kernel vp8_mbloop_filter_vertical_edge_kernel;
+ cl_kernel vp8_loop_filter_simple_horizontal_edge_kernel;
+ cl_kernel vp8_loop_filter_simple_vertical_edge_kernel;
+
+ cl_program dequant_program;
+ cl_kernel vp8_dequant_dc_idct_add_kernel;
+ cl_kernel vp8_dequant_idct_add_kernel;
+ cl_kernel vp8_dequantize_b_kernel;
+
+ cl_int cl_decode_initialized;
+ cl_int cl_encode_initialized;
+
+} VP8_COMMON_CL;
+
+extern VP8_COMMON_CL cl_data;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VP8_OPENCL_H */
+
return retval;
}
+
int vp8_dc_uv_quant(int QIndex, int Delta)
{
int retval;
return retval;
}
+
int vp8_ac_uv_quant(int QIndex, int Delta)
{
int retval;
{
#if ARCH_ARM
BLOCKD *b = &x->block[0];
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
/*b = &x->block[4];*/
b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
/*b = &x->block[8];*/
b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
/*b = &x->block[12];*/
b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
#else
int i;
{
BLOCKD *b = &x->block[i];
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
#endif
}
#if ARCH_ARM
BLOCKD *b = &x->block[0];
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
/*b = &x->block[16];*/
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b++;
b++;
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b++;
b++;
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b++;
b++;
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
#else
int i;
{
BLOCKD *b = &x->block[i];
- RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
for (i = 16; i < 24; i += 2)
{
BLOCKD *b = &x->block[i];
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
#endif
}
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include "vpx_ports/config.h"
#include "recon.h"
#include "subpixel.h"
#include "onyxc_int.h"
#endif
+#if CONFIG_OPENCL
+#include "opencl/vp8_opencl.h"
+#include "opencl/filter_cl.h"
+#include "opencl/reconinter_cl.h"
+#endif
+
/* use this define on systems where unaligned int reads and writes are
* not allowed, i.e. ARM architectures
*/
static const int bbb[4] = {0, 2, 8, 10};
-
+//Copy 16 x 16-bytes from src to dst.
void vp8_copy_mem16x16_c(
unsigned char *src,
int src_stride,
int r;
+ //This could be set up as a 2D CL kernel: each loop iteration (row) is one
+ //dimension, and each byte/int within the row is the other.
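+ //A minimal sketch of such a kernel (illustrative only; the kernel name and
+ //argument order are assumptions, not part of filter_cl.cl):
+ // __kernel void vp8_copy_mem16x16_kernel(__global unsigned char *src, int src_stride,
+ // __global unsigned char *dst, int dst_stride)
+ // {
+ // int r = get_global_id(0); /* row, 0..15 */
+ // int c = get_global_id(1); /* column, 0..15 */
+ // dst[r * dst_stride + c] = src[r * src_stride + c];
+ // }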
+
for (r = 0; r < 16; r++)
{
#ifdef MUST_BE_ALIGNED
}
+//Copy 8 x 8-bytes
void vp8_copy_mem8x8_c(
unsigned char *src,
int src_stride,
void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
{
int r;
- unsigned char *ptr_base;
- unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
- ptr_base = *(d->base_pre);
+ //d->base_pre is the start of the previous frame's y_buffer, u_buffer, or v_buffer
+ unsigned char *ptr_base = *(d->base_pre);
+ int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+ unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
- ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
- sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
+ sppf(ptr_base+ptr_offset, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
}
else
{
- ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
- ptr = ptr_base;
for (r = 0; r < 4; r++)
{
#ifdef MUST_BE_ALIGNED
- pred_ptr[0] = ptr[0];
- pred_ptr[1] = ptr[1];
- pred_ptr[2] = ptr[2];
- pred_ptr[3] = ptr[3];
+ pred_ptr[0] = ptr_base[ptr_offset];
+ pred_ptr[1] = ptr_base[ptr_offset+1];
+ pred_ptr[2] = ptr_base[ptr_offset+2];
+ pred_ptr[3] = ptr_base[ptr_offset+3];
#else
- *(int *)pred_ptr = *(int *)ptr ;
+ *(int *)pred_ptr = *(int *)(ptr_base+ptr_offset) ;
#endif
pred_ptr += pitch;
- ptr += d->pre_stride;
+ ptr_offset += d->pre_stride;
}
}
}
{
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
+ unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
{
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
+ unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
}
}
-
+/* Encoder only */
void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
{
int i;
+#if CONFIG_OPENCL
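+ //Note: the leading "0 &&" below disables the CL path for UV inter
+ //prediction; remove it to re-enable the CL code.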
+ if ( 0 && cl_initialized == CL_SUCCESS ){
+ vp8_build_inter_predictors_mbuv_cl(x);
+ VP8_CL_FINISH(x->cl_commands);
+ VP8_CL_FINISH(x->block[0].cl_commands);
+ VP8_CL_FINISH(x->block[16].cl_commands);
+ VP8_CL_FINISH(x->block[20].cl_commands);
+ return;
+ }
+#endif
+
if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
if ((mv_row | mv_col) & 7)
{
- x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
- x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
}
else
{
void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
{
- if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
- x->mode_info_context->mbmi.mode != SPLITMV)
+ if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
+ x->mode_info_context->mbmi.mode != SPLITMV)
{
unsigned char *ptr_base;
unsigned char *ptr;
if ((mv_row | mv_col) & 7)
{
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
}
else
{
if ((mv_row | mv_col) & 7)
{
- x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
- x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
}
else
{
}
-/* The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this
+/* The following functions are written for skip_recon_mb() to call. Since there is no recon in this
* situation, we can write the result directly to dst buffer instead of writing it to predictor
* buffer and then copying it to dst buffer.
*/
int r;
unsigned char *ptr_base;
unsigned char *ptr;
- /*unsigned char *pred_ptr = d->predictor;*/
+ /*unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;*/
int dst_stride = d->dst_stride;
int pre_stride = d->pre_stride;
+ int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
ptr_base = *(d->base_pre);
+ ptr = ptr_base + ptr_offset;
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
- ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride);
}
else
{
- ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
- ptr = ptr_base;
-
for (r = 0; r < 4; r++)
{
#ifdef MUST_BE_ALIGNED
}
-
void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
{
- /*unsigned char *pred_ptr = x->block[0].predictor;
- unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;*/
- unsigned char *pred_ptr = x->predictor;
unsigned char *dst_ptr = x->dst.y_buffer;
+#if CONFIG_OPENCL && ENABLE_CL_SUBPIXEL
+ if (cl_initialized == CL_SUCCESS){
+ vp8_build_inter_predictors_mb_s_cl(x);
+ return;
+ }
+#endif
+
if (x->mode_info_context->mbmi.mode != SPLITMV)
{
int offset;
if ((mv_row | mv_col) & 7)
{
- x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
+ x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
else
{
if ((mv_row | mv_col) & 7)
{
- x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
- x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
+ x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
+ x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
}
else
{
{
/* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
* if sth is wrong, go back to what it is in build_inter_predictors_mb.
+ *
+ * ACW: note: Not sure who the above comment belongs to.
*/
int i;
{
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d->predictor;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
}
}
}
- else
+ else
{
for (i = 0; i < 16; i += 2)
{
/*build_inter_predictors2b(x, d0, 16);*/
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d0->predictor;
ptr_base = *(d0->base_pre);
ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
/*build_inter_predictors2b(x, d0, 8);*/
unsigned char *ptr_base;
unsigned char *ptr;
- unsigned char *pred_ptr = d0->predictor;
ptr_base = *(d0->base_pre);
ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
for (i = 16; i < 24; i += 2)
{
BLOCKD *b = &x->block[i];
- RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
}
case B_LD_PRED:
{
unsigned char *ptr = Above;
+
+#if 0
+ //More readable version of the unrolled loop
+ int stride = 16, r=0, c=0;
+ for (r=0; r < 4; r++){
+ for (c=0; c < 4; c++){
+ int off = r+c;
+ int off2 = off > 5 ? 5 : off; //Clamp so the [3,3] case reads no further than ptr[off2+2] == ptr[7]
+ predictor[r*stride+c] = (ptr[off] + ptr[off+1]*2 + ptr[off2+2] + 2)>>2;
+ }
+ }
+#else
predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
predictor[0 * 16 + 1] =
predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
predictor[2 * 16 + 3] =
predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
+#endif
+
}
break;
case B_RD_PRED:
*dst_ptr1 = *src_ptr;
*dst_ptr2 = *src_ptr;
}
-
-
#include "swapyv12buffer.h"
+
+
void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, YV12_BUFFER_CONFIG *last_frame)
{
unsigned char *temp;
-
+#if CONFIG_OPENCL
+ cl_mem temp_mem;
+#endif
+ int temp_size;
+
temp = last_frame->buffer_alloc;
last_frame->buffer_alloc = new_frame->buffer_alloc;
new_frame->buffer_alloc = temp;
last_frame->v_buffer = new_frame->v_buffer;
new_frame->v_buffer = temp;
+ temp_size = last_frame->buffer_size;
+ last_frame->buffer_size = new_frame->buffer_size;
+ new_frame->buffer_size = temp_size;
+
+#if CONFIG_OPENCL
+ temp_mem = last_frame->buffer_mem;
+ last_frame->buffer_mem = new_frame->buffer_mem;
+ new_frame->buffer_mem = temp_mem;
+#endif
+
}
void vp8_dequantize_b_neon(BLOCKD *d)
{
int i;
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
+ short *DQ = d->dqcoeff_base + d->dqcoeff_offset;
+ short *Q = d->qcoeff_base + d->qcoeff_offset;
short *DQC = d->dequant;
vp8_dequantize_b_loop_neon(Q, DQC, DQ);
void vp8_dequantize_b_v6(BLOCKD *d)
{
int i;
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
+ short *DQ = d->dqcoeff_base + d->dqcoeff_offset;
+ short *Q = d->qcoeff_base + d->qcoeff_offset;
short *DQC = d->dequant;
vp8_dequantize_b_loop_v6(Q, DQC, DQ);
#include "vp8/common/reconinter.h"
#include "dequantize.h"
#include "detokenize.h"
-#include "vp8/common/invtrans.h"
#include "vp8/common/alloccommon.h"
#include "vp8/common/entropymode.h"
#include "vp8/common/quant_common.h"
#include "vp8/common/threading.h"
#include "decoderthreading.h"
#include "dboolhuff.h"
+#include "vp8/common/blockd.h"
#include <assert.h>
#include <stdio.h>
+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "vp8/common/opencl/vp8_opencl.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "opencl/dequantize_cl.h"
+#include "opencl/decodframe_cl.h"
+#endif
+
+#define PROFILE_OUTPUT 0
+
void vp8cx_init_de_quantizer(VP8D_COMP *pbi)
{
int i;
xd->block[24].dequant = pc->Y2dequant[QIndex];
+#if CONFIG_OPENCL && ENABLE_CL_IDCT_DEQUANT
+ mb_init_dequantizer_cl(xd);
+#endif
+
}
#if CONFIG_RUNTIME_CPU_DETECT
else
{
vp8_build_inter_predictors_mb_s(xd);
+#if CONFIG_OPENCL
+ VP8_CL_FINISH(xd->cl_commands);
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(xd->block[0].cl_commands);
+ VP8_CL_FINISH(xd->block[16].cl_commands);
+ VP8_CL_FINISH(xd->block[20].cl_commands);
+#endif
+#endif
}
}
static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd)
{
+
int eobtotal = 0;
int i, do_clamp = xd->mode_info_context->mbmi.need_to_clamp_mvs;
xd->mode_info_context->mbmi.dc_diff = 1;
+#if PROFILE_OUTPUT
+ if (xd->frame_type == KEY_FRAME)
+ printf("Intra-Coded MB\n");
+ else{
+ if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME){
+ printf("Intra-Coded Inter-Frame MB\n");
+ } else {
+ printf("Inter-Coded MB\n");
+ }
+ }
+#endif
+
+#if CONFIG_OPENCL
+ //If OpenCL is enabled and initialized, use the CL-specific decoder for the
+ //remainder of MB decoding.
+ if (cl_initialized == CL_SUCCESS){
+ vp8_decode_macroblock_cl(pbi, xd, eobtotal);
+ return;
+ }
+#endif
+
if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
{
xd->mode_info_context->mbmi.dc_diff = 0;
if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
{
BLOCKD *b = &xd->block[24];
+ short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+ vp8_second_order_fn_t second_order;
+
DEQUANT_INVOKE(&pbi->dequant, block)(b);
/* do 2nd order transform on the dc block */
- if (xd->eobs[24] > 1)
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
- }
- else
- {
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
+ if (xd->eobs[24] > 1){
+ second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16);
+ ((int *)qcoeff)[0] = 0;
+ ((int *)qcoeff)[1] = 0;
+ ((int *)qcoeff)[2] = 0;
+ ((int *)qcoeff)[3] = 0;
+ ((int *)qcoeff)[4] = 0;
+ ((int *)qcoeff)[5] = 0;
+ ((int *)qcoeff)[6] = 0;
+ ((int *)qcoeff)[7] = 0;
+ } else {
+ second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1);
+ ((int *)qcoeff)[0] = 0;
}
+ second_order(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+ (xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]);
}
else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
{
for (i = 0; i < 16; i++)
{
-
BLOCKD *b = &xd->block[i];
- vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+ short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor_base + b->predictor_offset);
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, b->dequant, b->predictor,
+ (qcoeff, b->dequant, b->predictor_base + b->predictor_offset,
*(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0], b->predictor,
+ (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset,
*(b->base_dst) + b->dst, 16, b->dst_stride);
- ((int *)b->qcoeff)[0] = 0;
+ ((int *)qcoeff)[0] = 0;
}
}
-
}
else
{
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
- (xd->qcoeff, xd->block[0].dequant,
- xd->predictor, xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs);
+ (xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
}
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
- (xd->qcoeff+16*16, xd->block[16].dequant,
- xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
- xd->dst.uv_stride, xd->eobs+16);
+ (xd->qcoeff+16*16, xd->block[16].dequant,
+ xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
}
xd->mb_to_top_edge = -((mb_row * 16)) << 3;
xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+ xd->dst.buffer_alloc = pc->yv12_fb[dst_fb_idx].buffer_alloc;
+ xd->dst.buffer_size = pc->yv12_fb[dst_fb_idx].buffer_size;
+#if CONFIG_OPENCL
+ xd->dst.buffer_mem = pc->yv12_fb[dst_fb_idx].buffer_mem;
+#endif
+
for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
{
xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+ xd->pre.buffer_alloc = pc->yv12_fb[ref_fb_idx].buffer_alloc;
+ xd->pre.buffer_size = pc->yv12_fb[ref_fb_idx].buffer_size;
+#if CONFIG_OPENCL
+ xd->pre.buffer_mem = pc->yv12_fb[ref_fb_idx].buffer_mem;
+#endif
if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME)
{
vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data));
xd->mb_segement_abs_delta = SEGMENT_DELTADATA;
- /* reset the mode ref deltasa for loop filter */
+ /* reset the mode ref deltas for loop filter */
vpx_memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas));
vpx_memset(xd->mode_lf_deltas, 0, sizeof(xd->mode_lf_deltas));
}
else
{
- if (!pc->use_bilinear_mc_filter)
- pc->mcomp_filter_type = SIXTAP;
- else
- pc->mcomp_filter_type = BILINEAR;
- /* To enable choice of different interploation filters */
+ /* To enable choice of different interpolation filters */
if (pc->mcomp_filter_type == SIXTAP)
{
+#if CONFIG_OPENCL
+ xd->sixtap_filter = CL_TRUE;
+#endif
xd->subpixel_predict = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap4x4);
xd->subpixel_predict8x4 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x4);
xd->subpixel_predict8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), sixtap8x8);
}
else
{
+#if CONFIG_OPENCL
+ xd->sixtap_filter = CL_FALSE;
+#endif
xd->subpixel_predict = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear4x4);
xd->subpixel_predict8x4 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x4);
xd->subpixel_predict8x8 = SUBPIX_INVOKE(RTCD_VTABLE(subpix), bilinear8x8);
xd->corrupted = 0; /* init without corruption */
}
+
int vp8_decode_frame(VP8D_COMP *pbi)
{
vp8_reader *const bc = & pbi->bc;
pc->vert_scale = data[6] >> 6;
data += 7;
+ //Allow resolution changes on key frames.
if (Width != pc->Width || Height != pc->Height)
{
+#if CONFIG_MULTITHREAD
int prev_mb_rows = pc->mb_rows;
+#endif
if (pc->Width <= 0)
{
pc->refresh_last_frame = pc->frame_type == KEY_FRAME || vp8_read_bit(bc);
- if (0)
- {
- FILE *z = fopen("decodestats.stt", "a");
- fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
- pc->current_video_frame,
- pc->frame_type,
- pc->refresh_golden_frame,
- pc->refresh_alt_ref_frame,
- pc->refresh_last_frame,
- pc->base_qindex);
- fclose(z);
- }
-
+#if 0
+ FILE *z = fopen("decodestats.stt", "a");
+ fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n",
+ pc->current_video_frame,
+ pc->frame_type,
+ pc->refresh_golden_frame,
+ pc->refresh_alt_ref_frame,
+ pc->refresh_last_frame,
+ pc->base_qindex);
+ fclose(z);
+#endif
{
/* read coef probability tree */
}
}
+ //Set up the macroblock's previous/destination buffers
vpx_memcpy(&xd->pre, &pc->yv12_fb[pc->lst_fb_idx], sizeof(YV12_BUFFER_CONFIG));
vpx_memcpy(&xd->dst, &pc->yv12_fb[pc->new_fb_idx], sizeof(YV12_BUFFER_CONFIG));
#endif
vp8_setup_intra_recon(&pc->yv12_fb[pc->new_fb_idx]);
+ /* clear out the coeff buffer */
+ vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
+
vp8_setup_block_dptrs(xd);
vp8_build_block_doffsets(xd);
- /* clear out the coeff buffer */
- vpx_memset(xd->qcoeff, 0, sizeof(xd->qcoeff));
-
/* Read the mb_no_coeff_skip flag */
pc->mb_no_coeff_skip = (int)vp8_read_bit(bc);
vpx_memcpy(&xd->block[0].bmi, &xd->mode_info_context->bmi[0], sizeof(B_MODE_INFO));
+#if PROFILE_OUTPUT
+ if (pc->frame_type == KEY_FRAME)
+ printf("Key Frame\n");
+ else
+ printf("Inter-Frame\n");
+#endif
+
#if CONFIG_MULTITHREAD
if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION)
{
int ibc = 0;
int num_part = 1 << pc->multi_token_partition;
- /* Decode the individual macro block */
+ /* Decode the individual macro blocks */
for (mb_row = 0; mb_row < pc->mb_rows; mb_row++)
{
}
}
-
+#if CONFIG_OPENCL
+ vp8_decode_frame_cl_finish(pbi);
+#endif
+
stop_token_decoder(pbi);
/* Collect information about decoder corruption. */
void vp8_dequantize_b_c(BLOCKD *d)
{
int i;
- short *DQ = d->dqcoeff;
- short *Q = d->qcoeff;
+ short *DQ = d->dqcoeff_base + d->dqcoeff_offset;
+ short *Q = d->qcoeff_base + d->qcoeff_offset;
short *DQC = d->dequant;
for (i = 0; i < 16; i++)
Prob = coef_probs + (ENTROPY_NODES*2); \
if(c < 15){\
qcoeff_ptr [ scan[c] ] = (INT16) v; \
- ++c; \
- goto DO_WHILE; }\
+ continue; \
+ }\
qcoeff_ptr [ scan[15] ] = (INT16) v; \
goto BLOCK_FINISHED;
Prob = coef_probs;
Prob += v * ENTROPY_NODES;
-DO_WHILE:
+do{
+
Prob += coef_bands_x[c];
DECODE_AND_BRANCH_IF_ZERO(Prob[EOB_CONTEXT_NODE], BLOCK_FINISHED);
if (c < 15)
{
qcoeff_ptr [ scan[c] ] = (INT16) v;
- ++c;
- goto DO_WHILE;
}
+} while (c++ < 15);
qcoeff_ptr [ scan[15] ] = (INT16) v;
BLOCK_FINISHED:
extern void vp8_arch_x86_decode_init(VP8D_COMP *pbi);
extern void vp8_arch_arm_decode_init(VP8D_COMP *pbi);
+extern void vp8_arch_opencl_decode_init(VP8D_COMP *pbi);
void vp8_dmachine_specific_config(VP8D_COMP *pbi)
{
#if ARCH_ARM
vp8_arch_arm_decode_init(pbi);
#endif
+
+#if CONFIG_OPENCL && (ENABLE_CL_IDCT_DEQUANT)
+ vp8_arch_opencl_decode_init(pbi);
+#endif
}
#include "vpx_ports/arm.h"
#endif
+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/vp8_opencl.h"
+#endif
+
extern void vp8_init_loop_filter(VP8_COMMON *cm);
extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+#define PROFILE_OUTPUT 0
+#if PROFILE_OUTPUT
+struct vpx_usec_timer frame_timer;
+struct vpx_usec_timer loop_filter_timer;
+unsigned int total_mb = 0;
+unsigned int total_loop_filter = 0;
+#endif
void vp8dx_initialize()
{
vp8_decoder_remove_threads(pbi);
#endif
vp8_remove_common(&pbi->common);
+
vpx_free(pbi);
}
pbi->Source = source;
pbi->source_sz = size;
+#if CONFIG_OPENCL
+ pbi->mb.cl_commands = NULL;
+ if (cl_initialized == CL_SUCCESS){
+ int err;
+ //Create command queue for macroblock.
+ pbi->mb.cl_commands = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
+ if (!pbi->mb.cl_commands || err != CL_SUCCESS) {
+ printf("Error: Failed to create a command queue!\n");
+ cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+ }
+
+ pbi->mb.cl_diff_mem = NULL;
+ pbi->mb.cl_predictor_mem = NULL;
+ pbi->mb.cl_qcoeff_mem = NULL;
+ pbi->mb.cl_dqcoeff_mem = NULL;
+ pbi->mb.cl_eobs_mem = NULL;
+
+#define SET_ON_ALLOC 0
+#if SET_ON_ALLOC
+
+#if ENABLE_CL_SUBPIXEL || ENABLE_CL_IDCT_DEQUANT
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_predictor_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+ sizeof(cl_uchar)*384, pbi->mb.predictor, goto BUF_DONE, -1);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_diff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+ sizeof(cl_short)*400, pbi->mb.diff, goto BUF_DONE, -1);
+
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_qcoeff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+ sizeof(cl_short)*400, pbi->mb.qcoeff, goto BUF_DONE,-1);
+
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_dqcoeff_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+ sizeof(cl_short)*400, pbi->mb.dqcoeff, goto BUF_DONE,-1);
+
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_eobs_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+ sizeof(cl_char)*25, pbi->mb.eobs, goto BUF_DONE,-1);
+#endif
+#else
+#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_predictor_mem, CL_MEM_READ_WRITE,
+ sizeof(cl_uchar)*384, NULL, goto BUF_DONE,-1);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_diff_mem, CL_MEM_READ_WRITE,
+ sizeof(cl_short)*400, NULL, goto BUF_DONE,-1);
+
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_qcoeff_mem, CL_MEM_READ_WRITE,
+ sizeof(cl_short)*400, NULL, goto BUF_DONE,-1);
+
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_dqcoeff_mem, CL_MEM_READ_WRITE,
+ sizeof(cl_short)*400, NULL, goto BUF_DONE,-1);
+
+ VP8_CL_CREATE_BUF(pbi->mb.cl_commands, pbi->mb.cl_eobs_mem, CL_MEM_READ_WRITE,
+ sizeof(cl_char) * 25, NULL, goto BUF_DONE,-1);
+#endif
+#endif
+ }
+#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL
+ BUF_DONE:
+#endif
+#endif
+
+#if PROFILE_OUTPUT
+ printf("Frame size = %d * %d\n", cm->Height, cm->Width);
+ printf("Macroblocks = %d * %d\n", cm->mb_rows, cm->mb_cols);
+
+ vpx_usec_timer_start(&frame_timer);
+#endif
retcode = vp8_decode_frame(pbi);
+#if PROFILE_OUTPUT
+ vpx_usec_timer_mark(&frame_timer);
+ total_mb += vpx_usec_timer_elapsed(&frame_timer);
+#endif
+
if (retcode < 0)
{
#if HAVE_ARMV7
if(pbi->common.filter_level)
{
+
+#if PROFILE_OUTPUT
+ struct vpx_usec_timer lpftimer;
+ vpx_usec_timer_start(&lpftimer);
+#endif
+
/* Apply the loop filter if appropriate. */
vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level);
+#if PROFILE_OUTPUT
+ vpx_usec_timer_mark(&lpftimer);
+ pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer);
+
+ printf("Loop Filter\n");
+ total_loop_filter += vpx_usec_timer_elapsed(&lpftimer);
+#if 0
+ if (pbi->common.filter_type == NORMAL_LOOPFILTER){
+ printf("Normal LF Time (us): %d\n", vpx_usec_timer_elapsed(&lpftimer));
+ } else {
+ printf("Simple LF Time (us): %d\n", vpx_usec_timer_elapsed(&lpftimer));
+ }
+#endif
+#endif
+
cm->last_frame_type = cm->frame_type;
cm->last_filter_type = cm->filter_type;
cm->last_sharpness_level = cm->sharpness_level;
}
+#if PROFILE_OUTPUT
+ else {
+ printf("No Loop Filter\n");
+ }
+#endif
vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
}
+#if CONFIG_OPENCL
+ if (cl_initialized == CL_SUCCESS){
+ //Copy buffer_alloc to buffer_mem so the YV12_BUFFER_CONFIG can be used as
+ //a reference frame (i.e. buffer_mem holds the same data as buffer_alloc).
+ vp8_cl_mb_prep(&pbi->mb, DST_BUF);
+
+ if (pbi->mb.cl_commands != NULL)
+ clReleaseCommandQueue(pbi->mb.cl_commands);
+ pbi->mb.cl_commands = NULL;
+ }
+#endif
vp8_clear_system_state();
}
#endif
pbi->common.error.setjmp = 0;
+
+
+#if PROFILE_OUTPUT
+ //Dump the total MB/Loop Filter processing times.
+ //This is cumulative between frames, so only use the last output value.
+ printf("MB Time (us): %d, LF Time (us): %d\n", total_mb, total_loop_filter);
+#endif
+
+
return retcode;
}
+
int vp8dx_get_raw_frame(VP8D_PTR ptr, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags)
{
int ret = -1;
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "../onyxd_int.h"
+#include "vp8/common/header.h"
+#include "vp8/common/reconintra.h"
+#include "vp8/common/reconintra4x4.h"
+#include "vp8/common/recon.h"
+#include "vp8/common/reconinter.h"
+//#include "../dequantize.h"
+//#include "../detokenize.h"
+//#include "vp8/common/alloccommon.h"
+//#include "vp8/common/entropymode.h"
+//#include "vp8/common/quant_common.h"
+//#include "vpx_scale/vpxscale.h"
+//#include "vpx_scale/yv12extend.h"
+//#include "vp8/common/setupintrarecon.h"
+
+//#include "../decodemv.h"
+//#include "vp8/common/extend.h"
+//#include "vpx_mem/vpx_mem.h"
+//#include "vp8/common/idct.h"
+//#include "../dequantize.h"
+//#include "vp8/common/predictdc.h"
+//#include "vp8/common/threading.h"
+//#include "../decoderthreading.h"
+//#include "../dboolhuff.h"
+//#include "vp8/common/blockd.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "vpx_config.h"
+#if CONFIG_OPENCL
+#include "vp8/common/opencl/vp8_opencl.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/reconinter_cl.h"
+#include "dequantize_cl.h"
+#endif
+
+#define PROFILE_OUTPUT 0
+
+//Implemented in ../decodframe.c
+extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
+
+void mb_init_dequantizer_cl(MACROBLOCKD *xd){
+ int i, err;
+ //Set up per-block dequant CL memory. Eventually this might be replaced by
+ //one large buffer holding all of the dequant data.
+ if (cl_initialized == CL_SUCCESS){
+ for (i=0; i < 25; i++){
+
+#if 1 //Initialize CL memory on allocation?
+ VP8_CL_CREATE_BUF(xd->cl_commands, xd->block[i].cl_dequant_mem,
+ ,
+ 16*sizeof(cl_short),
+ xd->block[i].dequant,,
+ );
+#else
+ VP8_CL_CREATE_BUF(xd->cl_commands, xd->block[i].cl_dequant_mem,
+ ,
+ 16*sizeof(cl_short),
+ NULL,,
+ );
+#endif
+ }
+ }
+}
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
+#else
+#define RTCD_VTABLE(x) NULL
+#endif
+
+/* skip_recon_mb() is Modified: Instead of writing the result to predictor buffer and then copying it
+ * to dst buffer, we can write the result directly to dst buffer. This eliminates unnecessary copy.
+ */
+static void skip_recon_mb_cl(VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+
+ vp8_build_intra_predictors_mbuv_s(xd);
+ RECON_INVOKE(&pbi->common.rtcd.recon,
+ build_intra_predictors_mby_s)(xd);
+
+ }
+ else
+ {
+#if ENABLE_CL_SUBPIXEL
+ if (cl_initialized == CL_SUCCESS)
+ {
+ vp8_build_inter_predictors_mb_s_cl(xd);
+ } else
+#endif
+ {
+ vp8_build_inter_predictors_mb_s(xd);
+ }
+ VP8_CL_FINISH(xd->cl_commands);
+#if !ONE_CQ_PER_MB
+ VP8_CL_FINISH(xd->block[0].cl_commands);
+ VP8_CL_FINISH(xd->block[16].cl_commands);
+ VP8_CL_FINISH(xd->block[20].cl_commands);
+#endif
+ }
+}
+
+void vp8_decode_macroblock_cl(VP8D_COMP *pbi, MACROBLOCKD *xd, int eobtotal)
+{
+ int i;
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0)
+ {
+ xd->mode_info_context->mbmi.dc_diff = 0;
+ skip_recon_mb_cl(pbi, xd);
+ return;
+ }
+
+ if (xd->segmentation_enabled)
+ mb_init_dequantizer(pbi, xd);
+
+ /* do prediction */
+ if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
+ {
+ vp8_build_intra_predictors_mbuv(xd);
+
+ if (xd->mode_info_context->mbmi.mode != B_PRED)
+ {
+ RECON_INVOKE(&pbi->common.rtcd.recon,
+ build_intra_predictors_mby)(xd);
+ } else {
+ vp8_intra_prediction_down_copy(xd);
+ }
+ }
+ else
+ {
+#if ENABLE_CL_SUBPIXEL
+ vp8_build_inter_predictors_mb_cl(xd);
+#else
+ vp8_build_inter_predictors_mb(xd);
+#endif
+
+#if !ENABLE_CL_IDCT_DEQUANT
+ //Wait for inter-predict if dequant/IDCT is being done on the CPU
+ VP8_CL_FINISH(xd->cl_commands);
+#endif
+ }
+
+ /* dequantization and idct */
+ if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
+ {
+ BLOCKD *b = &xd->block[24];
+ short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+ vp8_second_order_fn_t second_order;
+
+#if ENABLE_CL_IDCT_DEQUANT
+ if (cl_initialized == CL_SUCCESS){
+ vp8_cl_block_prep(b, DEQUANT|QCOEFF);
+ vp8_dequantize_b_cl(b);
+ vp8_cl_block_finish(b, DQCOEFF);
+ VP8_CL_FINISH(b->cl_commands); //Keep until qcoeff memset below is CL
+ }
+ else
+#endif
+ {
+ DEQUANT_INVOKE(&pbi->dequant, block)(b);
+ }
+
+
+ /* do 2nd order transform on the dc block */
+ if (xd->eobs[24] > 1){
+ second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16);
+ ((int *)qcoeff)[0] = 0;
+ ((int *)qcoeff)[1] = 0;
+ ((int *)qcoeff)[2] = 0;
+ ((int *)qcoeff)[3] = 0;
+ ((int *)qcoeff)[4] = 0;
+ ((int *)qcoeff)[5] = 0;
+ ((int *)qcoeff)[6] = 0;
+ ((int *)qcoeff)[7] = 0;
+ } else {
+ second_order = IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1);
+ ((int *)qcoeff)[0] = 0;
+ }
+
+#if ENABLE_CL_IDCT_DEQUANT
+ if (cl_initialized == CL_SUCCESS){
+ int y_off = xd->dst.y_buffer - xd->dst.buffer_alloc;
+ vp8_cl_block_prep(b, DQCOEFF|DIFF);
+
+ if (xd->eobs[24] > 1)
+ {
+ vp8_short_inv_walsh4x4_cl(b);
+ } else {
+ vp8_short_inv_walsh4x4_1_cl(b);
+ }
+ vp8_cl_block_finish(b, DIFF);
+
+ vp8_dequant_dc_idct_add_y_block_cl(&xd->block[0],
+ xd->dst.buffer_alloc, xd->dst.buffer_mem, y_off, xd->dst.y_stride, xd->eobs,
+ xd->block[24].diff_offset);
+ }
+ else
+#endif
+ {
+ second_order(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
+ DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
+ (xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]);
+ }
+ }
+ else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
+ {
+#if ENABLE_CL_IDCT_DEQUANT
+ if (cl_initialized == CL_SUCCESS)
+ vp8_cl_mb_prep(xd, DST_BUF);
+#endif
+ for (i = 0; i < 16; i++)
+ {
+ BLOCKD *b = &xd->block[i];
+ short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+#if ENABLE_CL_IDCT_DEQUANT
+ VP8_CL_FINISH(b->cl_commands);
+#endif
+ vp8_predict_intra4x4(b, b->bmi.mode, b->predictor_base + b->predictor_offset);
+
+#if ENABLE_CL_IDCT_DEQUANT
+ if (cl_initialized == CL_SUCCESS){
+ size_t dst_size = (4*b->dst_stride + b->dst + 4);
+ cl_mem dst_mem = xd->dst.buffer_mem;
+
+ int dst_off = *(b->base_dst) - xd->dst.buffer_alloc;
+
+ if (xd->eobs[i] > 1)
+ {
+ vp8_cl_block_prep(b, QCOEFF|DEQUANT|PREDICTOR);
+ vp8_dequant_idct_add_cl(b, *(b->base_dst), dst_mem, dst_off+b->dst, dst_size, b->qcoeff_offset, b->predictor_offset, 16, b->dst_stride, DEQUANT_INVOKE(&pbi->dequant, idct_add));
+ vp8_cl_block_finish(b, QCOEFF);
+ }
+ else
+ {
+ vp8_cl_block_prep(b, PREDICTOR|DIFF|QCOEFF|DEQUANT);
+ vp8_dc_only_idct_add_cl(b, CL_FALSE, 0, b->qcoeff_offset, b->predictor_offset,
+ *(b->base_dst), dst_mem, dst_off+b->dst, dst_size, 16, b->dst_stride);
+ VP8_CL_FINISH(b->cl_commands);
+ ((int *)(b->qcoeff_base + b->qcoeff_offset))[0] = 0; //Move into follow-up kernel?
+ }
+ vp8_cl_mb_finish(xd,DST_BUF);
+ }
+ else
+#endif
+ {
+ if (xd->eobs[i] > 1)
+ {
+ DEQUANT_INVOKE(&pbi->dequant, idct_add)
+ (qcoeff, b->dequant, b->predictor_base + b->predictor_offset,
+ *(b->base_dst) + b->dst, 16, b->dst_stride);
+ }
+ else
+ {
+ IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
+ (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset,
+ *(b->base_dst) + b->dst, 16, b->dst_stride);
+ ((int *)qcoeff)[0] = 0;
+ }
+ }
+
+ }
+ }
+ else
+ {
+#if ENABLE_CL_IDCT_DEQUANT
+ if (cl_initialized == CL_SUCCESS){
+ vp8_cl_mb_prep(xd,DST_BUF);
+ vp8_dequant_idct_add_y_block_cl(pbi, xd);
+ vp8_cl_mb_finish(xd,DST_BUF);
+ }
+ else
+#endif
+ {
+ DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
+ (xd->qcoeff, xd->block[0].dequant,
+ xd->predictor, xd->dst.y_buffer,
+ xd->dst.y_stride, xd->eobs);
+ }
+ }
+
+#if ENABLE_CL_IDCT_DEQUANT
+ if (cl_initialized == CL_SUCCESS){
+ vp8_cl_mb_prep(xd,DST_BUF);
+ vp8_dequant_idct_add_uv_block_cl(pbi, xd, DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block));
+ vp8_cl_mb_finish(xd,DST_BUF);
+ VP8_CL_FINISH(xd->cl_commands);
+ } else
+#endif
+ {
+ DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
+ (xd->qcoeff+16*16, xd->block[16].dequant,
+ xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
+ xd->dst.uv_stride, xd->eobs+16);
+ }
+}
+
+void vp8_decode_frame_cl_finish(VP8D_COMP *pbi){
+
+ //If using OpenCL, free all of the GPU buffers we've allocated.
+ if (cl_initialized == CL_SUCCESS){
+#if ENABLE_CL_IDCT_DEQUANT
+ int i;
+#endif
+
+ //Wait for any outstanding CL commands to finish, just in case
+ clFinish(pbi->mb.cl_commands);
+
+#if !ONE_CQ_PER_MB
+ clFinish(pbi->mb.block[0].cl_commands);
+ clFinish(pbi->mb.block[16].cl_commands);
+ clFinish(pbi->mb.block[20].cl_commands);
+ clReleaseCommandQueue(pbi->mb.block[0].cl_commands);
+ clReleaseCommandQueue(pbi->mb.block[16].cl_commands);
+ clReleaseCommandQueue(pbi->mb.block[20].cl_commands);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL
+ //Free Predictor CL buffer
+ if (pbi->mb.cl_predictor_mem != NULL)
+ clReleaseMemObject(pbi->mb.cl_predictor_mem);
+#endif
+
+#if ENABLE_CL_IDCT_DEQUANT
+ //Free other CL Block/MBlock buffers
+ if (pbi->mb.cl_diff_mem != NULL)
+ clReleaseMemObject(pbi->mb.cl_diff_mem);
+ if (pbi->mb.cl_qcoeff_mem != NULL)
+ clReleaseMemObject(pbi->mb.cl_qcoeff_mem);
+ if (pbi->mb.cl_dqcoeff_mem != NULL)
+ clReleaseMemObject(pbi->mb.cl_dqcoeff_mem);
+ if (pbi->mb.cl_eobs_mem != NULL)
+ clReleaseMemObject(pbi->mb.cl_eobs_mem);
+
+ for (i = 0; i < 25; i++){
+ clReleaseMemObject(pbi->mb.block[i].cl_dequant_mem);
+ pbi->mb.block[i].cl_dequant_mem = NULL;
+ }
+#endif
+ }
+}
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_DECODFRAME_CL_H
+#define VP8_DECODFRAME_CL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "../onyxd_int.h"
+#include "vp8/common/blockd.h"
+
+//Implemented in decodframe_cl.c
+extern void mb_init_dequantizer_cl(MACROBLOCKD *xd);
+extern void vp8_decode_frame_cl_finish(VP8D_COMP *pbi);
+extern void vp8_decode_macroblock_cl(VP8D_COMP *pbi, MACROBLOCKD *xd, int eobtotal);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VP8_DECODFRAME_CL_H */
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+//ACW: Remove me after debugging.
+#include <stdio.h>
+#include <string.h>
+
+#include "vp8/common/opencl/blockd_cl.h"
+#include "vp8/common/opencl/idct_cl.h"
+#include "dequantize_cl.h"
+
+const char *dequantCompileOptions = "";
+const char *dequant_cl_file_name = "vp8/decoder/opencl/dequantize_cl.cl";
+
+void cl_memset_short(short *s, int c, size_t n) {
+ for (n /= sizeof(short); n > 0; --n)
+ *s++ = c;
+}
+
+void vp8_memset_short_cl(cl_mem mem, int offset, short val){
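+ //Stub: the CL short-memset path is not implemented yet.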
+
+}
+
+int cl_destroy_dequant(){
+ printf("Freeing dequant decoder resources\n");
+
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequant_dc_idct_add_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequant_idct_add_kernel);
+ VP8_CL_RELEASE_KERNEL(cl_data.vp8_dequantize_b_kernel);
+
+ if (cl_data.dequant_program)
+ clReleaseProgram(cl_data.dequant_program);
+ cl_data.dequant_program = NULL;
+
+ return CL_SUCCESS;
+}
+
+int cl_init_dequant() {
+ int err;
+
+ //printf("Initializing dequant program/kernels\n");
+
+ // Create the compute program from the file-defined source code
+ if (cl_load_program(&cl_data.dequant_program, dequant_cl_file_name,
+ dequantCompileOptions) != CL_SUCCESS)
+ return VP8_CL_TRIED_BUT_FAILED;
+
+ // Create the compute kernels in the program we wish to run
+ VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequant_dc_idct_add_kernel,"vp8_dequant_dc_idct_add_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequant_idct_add_kernel,"vp8_dequant_idct_add_kernel");
+ VP8_CL_CREATE_KERNEL(cl_data,dequant_program,vp8_dequantize_b_kernel,"vp8_dequantize_b_kernel");
+
+ //printf("Created dequant kernels\n");
+
+ return CL_SUCCESS;
+}
+
+void vp8_dequantize_b_cl(BLOCKD *d)
+{
+ int err;
+ size_t global = 16;
+
+ /* Set kernel arguments */
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 0, sizeof (cl_mem), &d->cl_dqcoeff_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 1, sizeof (cl_int), &d->dqcoeff_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 2, sizeof (cl_mem), &d->cl_qcoeff_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 3, sizeof (cl_int), &d->qcoeff_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequantize_b_kernel, 4, sizeof (cl_mem), &d->cl_dequant_mem);
+ VP8_CL_CHECK_SUCCESS( d->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ vp8_dequantize_b_c(d),
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( d->cl_commands, cl_data.vp8_dequantize_b_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( d->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);\
+ vp8_dequantize_b_c(d),
+ );
+
+}
+
+void vp8_dequant_idct_add_cl(BLOCKD *b, unsigned char *dest_base, cl_mem dest_mem, int dest_offset, size_t dst_size, int q_offset, int pred_offset, int pitch, int stride, vp8_dequant_idct_add_fn_t idct_add)
+{
+ int err;
+ size_t global = 1;
+ //cl_mem dest_mem = NULL;
+ int free_mem = 0;
+
+ if (dest_mem == NULL){
+ //Initialize destination memory
+ VP8_CL_CREATE_BUF(b->cl_commands, dest_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+ dst_size, dest_base,,
+ );
+ free_mem = 1;
+ }
+
+ /* Set kernel arguments */
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_qcoeff_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 1, sizeof (int), &q_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 2, sizeof (cl_mem), &b->cl_dequant_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 3, sizeof (cl_mem), &b->cl_predictor_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 4, sizeof (int), &pred_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 5, sizeof (cl_mem), &dest_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 6, sizeof (int), &dest_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 7, sizeof (int), &pitch);
+ err |= clSetKernelArg(cl_data.vp8_dequant_idct_add_kernel, 8, sizeof (int), &stride);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",
+ idct_add(b->qcoeff_base+q_offset, b->dequant, b->predictor_base + pred_offset,
+ dest_base + dest_offset, pitch, stride),
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( b->cl_commands, cl_data.vp8_dequant_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);\
+ idct_add(b->qcoeff_base+q_offset, b->dequant, b->predictor_base + pred_offset,
+ dest_base + dest_offset, pitch, stride),
+ );
+
+ if (free_mem == 1){
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(b->cl_commands, dest_mem, CL_FALSE, 0, dst_size, dest_base, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",
+ idct_add(b->qcoeff_base+q_offset, b->dequant, b->predictor_base + pred_offset,
+ dest_base + dest_offset, pitch, stride),
+ );
+
+ //CL Spec says this can be freed without clFinish first
+ clReleaseMemObject(dest_mem);
+ }
+
+ return;
+}
+
+//May modify its arguments. Only called from vp8_dequant_dc_idct_add_y_block_cl.
+void vp8_dequant_dc_idct_add_cl(
+ BLOCKD *b,
+ int qcoeff_offset,
+ int pred_offset,
+ unsigned char *dest_base,
+ int dest_off,
+ int pitch,
+ int stride,
+ int Dc_offset)
+{
+ int err;
+ int dq_offset = 0;
+ unsigned char *dest = dest_base + dest_off;
+
+ cl_mem dest_mem = NULL;
+ size_t dest_size;
+ size_t global = 1;
+ int dest_offset=0;
+
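+ /* The kernel writes four rows of four pixels; the last byte written is at
+ * offset 3*stride + 3 from dest, so 4*stride + 4 bytes is sufficient. */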
+ //Initialize dest_mem
+ dest_size = sizeof(cl_uchar)*(4*stride + dest_offset + 4);
+ VP8_CL_CREATE_BUF(b->cl_commands, dest_mem, CL_MEM_READ_WRITE|CL_MEM_COPY_HOST_PTR,
+ dest_size, dest,,
+ );
+
+ //Assumes that all input cl_mem buffers have been initialized outside of this function.
+
+ /* Set kernel arguments */
+ err = 0;
+ err = clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_qcoeff_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 1, sizeof (int), &qcoeff_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 2, sizeof (cl_mem), &b->cl_dequant_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 3, sizeof(int), &dq_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 4, sizeof (cl_mem), &b->cl_predictor_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 5, sizeof (int), &pred_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 6, sizeof (cl_mem), &b->cl_diff_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 7, sizeof (int), &Dc_offset);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 8, sizeof (cl_mem), &dest_mem);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 9, sizeof (int), &pitch);
+ err |= clSetKernelArg(cl_data.vp8_dequant_dc_idct_add_kernel, 10, sizeof (int), &stride);
+
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to set kernel arguments!\n",,
+ );
+
+ /* Execute the kernel */
+ err = clEnqueueNDRangeKernel( b->cl_commands, cl_data.vp8_dequant_dc_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to execute kernel!\n",
+ printf("err = %d\n",err);,
+ );
+
+ /* Read back the result data from the device */
+ err = clEnqueueReadBuffer(b->cl_commands, dest_mem, CL_FALSE, 0, dest_size, dest, 0, NULL, NULL);
+ VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
+ "Error: Failed to read output array!\n",,
+ );
+
+ //CL Spec says this can be freed without clFinish first
+ clReleaseMemObject(dest_mem);
+ dest_mem = NULL;
+
+ return;
+
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+
+__constant int cospi8sqrt2minus1 = 20091;
+__constant int sinpi8sqrt2 = 35468;
+__constant int rounding = 0;
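+/* 20091 and 35468 are Q16 fixed-point values of (sqrt(2)*cos(pi/8) - 1) and
+ * sqrt(2)*sin(pi/8) respectively, as used by the VP8 inverse transform. */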
+
+void vp8_short_idct4x4llm(__global short*, short*, int);
+void cl_memset_short(__global short*, int, size_t);
+
+#define USE_VECTORS 0
+
+__kernel void vp8_dequantize_b_kernel(
+ __global short *dqcoeff_base,
+ int dqcoeff_offset,
+ __global short *qcoeff_base,
+ int qcoeff_offset,
+ __global short *dequant
+)
+{
+ __global short *DQ = dqcoeff_base + dqcoeff_offset;
+ __global short *Q = qcoeff_base + qcoeff_offset;
+
+#if USE_VECTORS
+ vstore16(vload16(0,Q) * vload16(0,dequant), 0, DQ);
+#else
+ int tid = get_global_id(0);
+ if (tid < 16)
+ {
+ DQ[tid] = Q[tid] * dequant[tid];
+ }
+
+#endif
+}
+
+__kernel void vp8_dequant_idct_add_kernel(
+ __global short *input_base,
+ int input_offset,
+ __global short *dq,
+ __global unsigned char *pred_base,
+ int pred_offset,
+ __global unsigned char *dest_base,
+ int dest_offset,
+ int pitch,
+ int stride
+)
+{
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
+ int i;
+ __global unsigned char *dest = dest_base + dest_offset;
+ __global short *input = input_base + input_offset;
+ __global unsigned char *pred = pred_base + pred_offset;
+
+#if USE_VECTORS
+ vstore16( (short16)vload16(0,dq) * (short16)vload16(0,input) , 0, input);
+#else
+ for (i = 0; i < 16; i++)
+ {
+ input[i] = dq[i] * input[i];
+ }
+#endif
+
+ /* the idct halves ( >> 1) the pitch */
+ vp8_short_idct4x4llm(input, output, 4 << 1);
+
+ //Note: the input buffer (qcoeff) is modified by this kernel and must be copied back to system memory by the caller.
+ cl_memset_short(input, 0, 32);
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
+}
+
+
+__kernel void vp8_dequant_dc_idct_add_kernel(
+ __global short *qcoeff_base,
+ int qcoeff_offset,
+
+ __global short *dequant_base,
+ int dequant_offset,
+
+ __global unsigned char *pred_base,
+ int pred_offset,
+
+ __global short *diff_base,
+ int diff_offset,
+
+ __global unsigned char *dest,
+
+ int pitch,
+ int stride
+)
+{
+ int i;
+ short output[16];
+ short *diff_ptr = output;
+ int r, c;
+
+ __global short *input = &qcoeff_base[qcoeff_offset];
+ __global short *dq = &dequant_base[dequant_offset];
+ __global unsigned char *pred = pred_base + pred_offset;
+
+ //The input buffer is modified here; copy it back to system memory when done.
+ input[0] = diff_base[diff_offset];
+
+#if USE_VECTORS
+ vstore16( (short16)vload16(0,dq) * (short16)vload16(0,input) , 0, input);
+#else
+ for (i = 1; i < 16; i++)
+ {
+ input[i] = dq[i] * input[i];
+ }
+#endif
+
+ /* the idct halves ( >> 1) the pitch */
+ vp8_short_idct4x4llm(input, output, 4 << 1);
+
+ cl_memset_short(input, 0, 32);
+
+ for (r = 0; r < 4; r++)
+ {
+ for (c = 0; c < 4; c++)
+ {
+ int a = diff_ptr[c] + pred[c];
+
+ if (a < 0)
+ a = 0;
+
+ if (a > 255)
+ a = 255;
+
+ dest[c] = (unsigned char) a;
+ }
+
+ dest += stride;
+ diff_ptr += 4;
+ pred += pitch;
+ }
+}
+
+
+
+
+//Note: this function is copied from common/opencl/idctllm_cl.cl
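+//The transform runs in two passes: the first pass operates down each column
+//of the 4x4 block (the pitch argument is in bytes, so it is halved to give a
+//stride in shorts); the second pass operates across each row and applies the
+//final rounding of (x + 4) >> 3.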
+void vp8_short_idct4x4llm(
+ __global short *input,
+ short *output,
+ int pitch
+)
+{
+ int i;
+ int a1, b1, c1, d1;
+
+ __global short *ip = input;
+ short *op = output;
+ int temp1, temp2;
+ int shortpitch = pitch >> 1;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = ip[0] + ip[8];
+ b1 = ip[0] - ip[8];
+
+ temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+ op[shortpitch*0] = a1 + d1;
+ op[shortpitch*3] = a1 - d1;
+
+ op[shortpitch*1] = b1 + c1;
+ op[shortpitch*2] = b1 - c1;
+
+ ip++;
+ op++;
+ }
+
+ op = output;
+
+ for (i = 0; i < 4; i++)
+ {
+ a1 = op[0] + op[2];
+ b1 = op[0] - op[2];
+
+ temp1 = (op[1] * sinpi8sqrt2 + rounding) >> 16;
+ temp2 = op[3] + ((op[3] * cospi8sqrt2minus1 + rounding) >> 16);
+ c1 = temp1 - temp2;
+
+ temp1 = op[1] + ((op[1] * cospi8sqrt2minus1 + rounding) >> 16);
+ temp2 = (op[3] * sinpi8sqrt2 + rounding) >> 16;
+ d1 = temp1 + temp2;
+
+
+ op[0] = (a1 + d1 + 4) >> 3;
+ op[3] = (a1 - d1 + 4) >> 3;
+
+ op[1] = (b1 + c1 + 4) >> 3;
+ op[2] = (b1 - c1 + 4) >> 3;
+
+ op += shortpitch;
+ }
+
+}
+
+__kernel void vp8_dc_only_idct_add_kernel(
+ short input_dc,
+ __global unsigned char *pred_ptr,
+ __global unsigned char *dst_ptr,
+ int pitch,
+ int stride
+)
+{
+ int a1 = ((input_dc + 4) >> 3);
+ int r, c;
+ int pred_offset,dst_offset;
+
+ int tid = get_global_id(0);
+ if (tid < 16){
+ r = tid / 4;
+ c = tid % 4;
+
+ pred_offset = r * pitch;
+ dst_offset = r * stride;
+ int a = a1 + pred_ptr[pred_offset + c] ;
+
+ if (a < 0)
+ a = 0;
+ else if (a > 255)
+ a = 255;
+
+ dst_ptr[dst_offset + c] = (unsigned char) a ;
+ }
+}
+
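+/* Write the value c to each of the n/2 shorts of a __global buffer (n is a
+ * byte count); the calls above pass n == 32 to clear all 16 coefficients of
+ * a 4x4 block. */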
+void cl_memset_short(__global short *s, int c, size_t n) {
+ int i;
+ for (i = 0; i < n/2; i++)
+ *s++ = c;
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef DEQUANTIZE_CL_H
+#define DEQUANTIZE_CL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp8/decoder/onyxd_int.h"
+#include "vp8/decoder/dequantize.h"
+#include "vp8/common/opencl/vp8_opencl.h"
+
+#define prototype_dequant_block_cl(sym) \
+ void sym(BLOCKD *x)
+
+#define prototype_dequant_idct_add_cl(sym) \
+ void sym(BLOCKD *b, unsigned char *dest_base,cl_mem dest_mem, int dest_offset, size_t dest_size, int q_offset, \
+ int pred_offset, int pitch, int stride, \
+ vp8_dequant_idct_add_fn_t idct_add)
+
+#define prototype_dequant_dc_idct_add_cl(sym) \
+ void sym(BLOCKD* b, int qcoeff_offset, \
+ int pred_offset, unsigned char *dest_base, int dst_offset, \
+ int pitch, int stride, \
+ int dc)
+
+#define prototype_dequant_dc_idct_add_y_block_cl(sym) \
+ void sym(BLOCKD *b, \
+ unsigned char *dst_base, cl_mem dst_mem, int dst_off,\
+ int stride, char *eobs, int dc_offset)
+
+#define prototype_dequant_idct_add_y_block_cl(sym) \
+ void sym(VP8D_COMP *pbi, MACROBLOCKD *xd)
+
+#define prototype_dequant_idct_add_uv_block_cl(sym) \
+ void sym(VP8D_COMP *pbi, MACROBLOCKD *xd, \
+ vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block)
+
+
+
+extern prototype_dequant_block_cl(vp8_dequantize_b_cl);
+
+//CL functions
+extern prototype_dequant_idct_add_cl(vp8_dequant_idct_add_cl);
+
+//C functions
+extern prototype_dequant_dc_idct_add_cl(vp8_dequant_dc_idct_add_cl);
+
+
+//Might be CL... check implementation.
+extern prototype_dequant_dc_idct_add_y_block_cl(vp8_dequant_dc_idct_add_y_block_cl);
+extern prototype_dequant_idct_add_y_block_cl(vp8_dequant_idct_add_y_block_cl);
+extern prototype_dequant_idct_add_uv_block_cl(vp8_dequant_idct_add_uv_block_cl);
+
+
+
+extern const char *dequantCompileOptions;
+extern const char *dequant_cl_file_name;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/decoder/onyxd_int.h"
+#include "vpx_ports/config.h"
+#include "../../common/idct.h"
+#include "vp8/common/opencl/blockd_cl.h"
+#include "dequantize_cl.h"
+
+//change q/dq/pre/eobs/dc to offsets
+void vp8_dequant_dc_idct_add_y_block_cl(
+ BLOCKD *b,
+ unsigned char *dst_base, //xd->dst.buffer_alloc
+ cl_mem dst_mem,
+ int dst_off,
+ int stride, //xd->dst.y_stride
+ char *eobs, //xd->eobs
+ int dc_offset //xd->block[24].diff_offset
+)
+{
+ int i, j;
+ int q_offset = 0;
+ int pre_offset = 0;
+ int dst_offset = 0;
+ unsigned char *dst = dst_base+dst_off;
+ size_t dst_size = 16*(stride+1);
+
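+ /* The 16 Y sub-blocks are visited in raster order. Each 4x4 block advances
+ * the qcoeff offset by 16 shorts and the predictor/destination offsets by
+ * 4 pixels; at the end of each row of four blocks the predictor offset
+ * skips to the next 4-row band of the 16-pixel-wide predictor (64 - 16)
+ * and the destination skips 4*stride - 16. */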
+ vp8_cl_block_prep(b, QCOEFF|DEQUANT|DIFF|PREDICTOR);
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ if (*eobs++ > 1){
+ vp8_dequant_dc_idct_add_cl (b, q_offset, pre_offset, dst, dst_offset, 16, stride, dc_offset);
+ }
+ else{
+ vp8_dc_only_idct_add_cl(b, CL_TRUE, dc_offset, 0, pre_offset, dst, NULL, dst_offset, dst_size, 16, stride);
+ }
+
+ q_offset += 16;
+ pre_offset += 4;
+ dst_offset += 4;
+ dc_offset++;
+ }
+
+ pre_offset += 64 - 16;
+ dst_offset += 4*stride - 16;
+ }
+
+ vp8_cl_block_finish(b, QCOEFF);
+
+}
+
+void vp8_dequant_idct_add_y_block_cl (VP8D_COMP *pbi, MACROBLOCKD *xd)
+{
+ int i, j;
+
+ short *q = xd->qcoeff;
+ int q_offset = 0;
+ int pre_offset = 0;
+ cl_mem dst_mem = xd->dst.buffer_mem;
+ unsigned char *dst = xd->dst.buffer_alloc;
+ int dst_offset = xd->dst.y_buffer - dst;
+ int stride = xd->dst.y_stride;
+ char *eobs = xd->eobs;
+ int dst_size = 16 * (stride + 1);
+
+
+ vp8_cl_mb_prep(xd,PREDICTOR|DIFF|QCOEFF);
+ for (i = 0; i < 4; i++)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ if (*eobs++ > 1){
+ vp8_cl_block_prep(&xd->block[0], DEQUANT);
+ vp8_dequant_idct_add_cl(&xd->block[0], dst, dst_mem, dst_offset, dst_size+dst_offset, q_offset, pre_offset, 16, stride, pbi->dequant.idct_add);
+ vp8_cl_block_finish(&xd->block[0], QCOEFF);
+ }
+ else
+ {
+ vp8_cl_block_prep(&xd->block[0], DEQUANT);
+ vp8_dc_only_idct_add_cl(&xd->block[0], CL_FALSE, 0, q_offset, pre_offset, dst, dst_mem, dst_offset, dst_size+dst_offset, 16, stride);
+ VP8_CL_FINISH(xd->cl_commands);
+ ((int *)(q+q_offset))[0] = 0;
+ vp8_cl_mb_prep(xd,QCOEFF);
+ }
+
+ q_offset += 16;
+ pre_offset += 4;
+ dst_offset += 4;
+ }
+
+ pre_offset += 64 - 16;
+ dst_offset += 4*stride - 16;
+ }
+
+}
+
+void vp8_dequant_idct_add_uv_block_cl(VP8D_COMP *pbi, MACROBLOCKD *xd,
+ vp8_dequant_idct_add_uv_block_fn_t idct_add_uv_block
+)
+{
+ int i, j;
+
+ int block_num = 16;
+ BLOCKD b = xd->block[block_num];
+
+ short *q = xd->qcoeff;
+
+ cl_mem dst_mem = xd->dst.buffer_mem;
+ unsigned char *dst = xd->dst.buffer_alloc;
+ int u_off = xd->dst.u_buffer - dst;
+ int v_off = xd->dst.v_buffer - dst;
+
+ int stride = xd->dst.uv_stride;
+ size_t dst_size = 8*(stride+1);
+ char *eobs = xd->eobs+16;
+
+ int pre_offset = block_num*16;
+ int q_offset = block_num*16;
+ int dst_offset = 0;
+
+ vp8_cl_mb_prep(xd, DIFF|QCOEFF|PREDICTOR);
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1){
+ vp8_cl_block_prep(&b, DEQUANT);
+ vp8_dequant_idct_add_cl(&b, dst, dst_mem, u_off+dst_offset, u_off+dst_size, q_offset, pre_offset, 8, stride, DEQUANT_INVOKE (&pbi->dequant, idct_add));
+ }
+ else
+ {
+ vp8_cl_block_prep(&xd->block[block_num], DEQUANT);
+ vp8_dc_only_idct_add_cl (&b, CL_FALSE, 0, q_offset, pre_offset, dst, dst_mem, u_off+dst_offset, u_off+dst_size, 8, stride);
+
+ //Need round trip + finish until qcoeff set in CL
+ vp8_cl_block_finish(&xd->block[0], QCOEFF);
+ VP8_CL_FINISH(xd->cl_commands);
+ ((int *)(q+q_offset))[0] = 0;
+ vp8_cl_mb_prep(xd,QCOEFF);
+ }
+
+ q_offset += 16;
+ pre_offset += 4;
+ dst_offset += 4;
+ }
+
+ pre_offset += 32 - 8;
+ dst_offset += 4*stride - 8;
+ }
+
+ //Now process the V plane; the same cl_mem is used, with v_off as the destination offset.
+
+ dst_offset = 0;
+ for (i = 0; i < 2; i++)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ if (*eobs++ > 1){
+ vp8_cl_block_prep(&b, DEQUANT);
+ vp8_dequant_idct_add_cl (&b, dst, dst_mem, v_off+dst_offset, v_off+dst_size, q_offset,
+ pre_offset, 8, stride, DEQUANT_INVOKE (&pbi->dequant, idct_add));
+ }
+ else
+ {
+ vp8_cl_block_prep(&b, DEQUANT);
+ vp8_dc_only_idct_add_cl (&b, CL_FALSE, 0, q_offset, pre_offset,
+ dst, dst_mem, v_off+dst_offset, v_off+dst_size, 8, stride);
+
+ //Eventually replace with memset kernel call to prevent round trip
+ vp8_cl_mb_finish(xd,QCOEFF);
+ VP8_CL_FINISH(xd->cl_commands);
+ ((int *)(q+q_offset))[0] = 0;
+ vp8_cl_mb_prep(xd,QCOEFF);
+ }
+
+ q_offset += 16;
+ pre_offset += 4;
+ dst_offset += 4;
+ }
+
+ pre_offset += 32 - 8;
+ dst_offset += 4*stride - 8;
+ }
+
+ vp8_cl_mb_finish(xd,QCOEFF);
+
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vp8/decoder/onyxd_int.h"
+
+#include "vp8/common/opencl/vp8_opencl.h"
+#include "vp8_decode_cl.h"
+
+void vp8_arch_opencl_decode_init(VP8D_COMP *pbi)
+{
+
+ if (cl_initialized == CL_SUCCESS){
+ cl_decode_init();
+ }
+
+}
--- /dev/null
+#include "vpx_ports/config.h"
+
+#include "../../common/opencl/vp8_opencl.h"
+#include "vp8_decode_cl.h"
+
+#include <stdio.h>
+
+extern int cl_init_dequant();
+extern int cl_destroy_dequant();
+
+int cl_decode_destroy(){
+
+#if ENABLE_CL_IDCT_DEQUANT
+ cl_destroy_dequant();
+#endif
+
+ return CL_SUCCESS;
+}
+
+int cl_decode_init()
+{
+#if ENABLE_CL_IDCT_DEQUANT
+ int err;
+#endif
+
+ //Initialize the program handle to NULL so that later code can detect
+ //whether it has been set up.
+ cl_data.dequant_program = NULL;
+
+#if ENABLE_CL_IDCT_DEQUANT
+ err = cl_init_dequant();
+ if (err != CL_SUCCESS)
+ return err;
+#endif
+
+ return CL_SUCCESS;
+}
--- /dev/null
+/*
+ * Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_OPENCL_DECODE_H
+#define VP8_OPENCL_DECODE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int cl_decode_init();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VP8_OPENCL_DECODE_H */
if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV)
{
BLOCKD *b = &xd->block[24];
+ short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
DEQUANT_INVOKE(&pbi->dequant, block)(b);
/* do 2nd order transform on the dc block */
if (xd->eobs[24] > 1)
{
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
- ((int *)b->qcoeff)[1] = 0;
- ((int *)b->qcoeff)[2] = 0;
- ((int *)b->qcoeff)[3] = 0;
- ((int *)b->qcoeff)[4] = 0;
- ((int *)b->qcoeff)[5] = 0;
- ((int *)b->qcoeff)[6] = 0;
- ((int *)b->qcoeff)[7] = 0;
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
+ ((int *)qcoeff)[0] = 0;
+ ((int *)qcoeff)[1] = 0;
+ ((int *)qcoeff)[2] = 0;
+ ((int *)qcoeff)[3] = 0;
+ ((int *)qcoeff)[4] = 0;
+ ((int *)qcoeff)[5] = 0;
+ ((int *)qcoeff)[6] = 0;
+ ((int *)qcoeff)[7] = 0;
}
else
{
- IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
- ((int *)b->qcoeff)[0] = 0;
+ IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset]);
+ ((int *)qcoeff)[0] = 0;
}
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
(xd->qcoeff, xd->block[0].dequant,
xd->predictor, xd->dst.y_buffer,
- xd->dst.y_stride, xd->eobs, xd->block[24].diff);
+ xd->dst.y_stride, xd->eobs, &xd->block[24].diff_base[xd->block[24].diff_offset]);
}
else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED)
{
for (i = 0; i < 16; i++)
{
BLOCKD *b = &xd->block[i];
- vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor, mb_row, mb_col, i);
+ short *qcoeff = b->qcoeff_base + b->qcoeff_offset;
+ vp8mt_predict_intra4x4(pbi, xd, b->bmi.mode, b->predictor_base + b->predictor_offset, mb_row, mb_col, i);
if (xd->eobs[i] > 1)
{
DEQUANT_INVOKE(&pbi->dequant, idct_add)
- (b->qcoeff, b->dequant, b->predictor,
+ (qcoeff, b->dequant, b->predictor_base + b->predictor_offset,
*(b->base_dst) + b->dst, 16, b->dst_stride);
}
else
{
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
- (b->qcoeff[0] * b->dequant[0], b->predictor,
+ (qcoeff[0] * b->dequant[0], b->predictor_base + b->predictor_offset,
*(b->base_dst) + b->dst, 16, b->dst_stride);
- ((int *)b->qcoeff)[0] = 0;
+ ((int *)qcoeff)[0] = 0;
}
}
}
static void dequantize_b_mmx(BLOCKD *d)
{
- short *sq = (short *) d->qcoeff;
- short *dq = (short *) d->dqcoeff;
+ short *sq = (short *) d->qcoeff_base + d->qcoeff_offset;
+ short *dq = (short *) d->dqcoeff_base + d->dqcoeff_offset;
short *q = (short *) d->dequant;
vp8_dequantize_b_impl_mmx(sq, dq, q);
}
void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
{
- d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff, d->dqcoeff, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
+ d->eob = vp8_fast_quantize_b_neon_func(b->coeff, b->zbin, d->qcoeff_base + d->qcoeff_offset, d->dqcoeff_base + d->dqcoeff_offset, d->dequant, vp8_rvsplus1_default_zig_zag1d, b->round, b->quant_fast);
}
/*
short *zbin_ptr = &b->Zbin[0][0];
short *round_ptr = &b->Round[0][0];
short *quant_ptr = &b->Quant[0][0];
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr= d->dqcoeff;
+ short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ short *dqcoeff_ptr= d->dqcoeff_base + d->dqcoeff_offset;
short *dequant_ptr= &d->Dequant[0][0];
eob = 0;
DEFINE(vp8_block_zrun_zbin_boost, offsetof(BLOCK, zrun_zbin_boost));
DEFINE(vp8_block_quant_shift, offsetof(BLOCK, quant_shift));
-DEFINE(vp8_blockd_qcoeff, offsetof(BLOCKD, qcoeff));
+DEFINE(vp8_blockd_qcoeff_base, offsetof(BLOCKD, qcoeff_base));
+DEFINE(vp8_blockd_qcoeff_offset, offsetof(BLOCKD, qcoeff_offset));
DEFINE(vp8_blockd_dequant, offsetof(BLOCKD, dequant));
-DEFINE(vp8_blockd_dqcoeff, offsetof(BLOCKD, dqcoeff));
+DEFINE(vp8_blockd_dqcoeff_base, offsetof(BLOCKD, dqcoeff_base));
+DEFINE(vp8_blockd_dqcoeff_offset, offsetof(BLOCKD, dqcoeff_offset));
DEFINE(vp8_blockd_eob, offsetof(BLOCKD, eob));
// subtract
DEFINE(vp8_block_src_diff, offsetof(BLOCK, src_diff));
DEFINE(vp8_block_src_stride, offsetof(BLOCK, src_stride));
-DEFINE(vp8_blockd_predictor, offsetof(BLOCKD, predictor));
+DEFINE(vp8_blockd_predictor_base, offsetof(BLOCKD, predictor_base));
+DEFINE(vp8_blockd_predictor_offset, offsetof(BLOCKD, predictor_offset));
//pack tokens
DEFINE(vp8_writer_lowvalue, offsetof(vp8_writer, lowvalue));
#include "vp8/common/reconintra.h"
#include "vp8/common/reconintra4x4.h"
#include "encodemb.h"
-#include "vp8/common/invtrans.h"
+#include "invtrans.h"
#include "vp8/common/recon.h"
#include "dct.h"
#include "vp8/common/g_common.h"
#else
#define IF_RTCD(x) NULL
#endif
+
void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
{
- vp8_predict_intra4x4(b, best_mode, b->predictor);
+ vp8_predict_intra4x4(b, best_mode, b->predictor_base + b->predictor_offset);
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
- RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
#include "vp8/common/reconinter.h"
#include "quantize.h"
#include "tokenize.h"
-#include "vp8/common/invtrans.h"
+#include "invtrans.h"
#include "vp8/common/recon.h"
#include "vp8/common/reconintra.h"
#include "dct.h"
{
unsigned char *src_ptr = (*(be->base_src) + be->src);
short *diff_ptr = be->src_diff;
- unsigned char *pred_ptr = bd->predictor;
+ unsigned char *pred_ptr = bd->predictor_base + bd->predictor_offset;
int src_stride = be->src_stride;
int r, c;
// recon = copy from predictors to destination
{
BLOCKD *b = &x->e_mbd.block[0];
- unsigned char *pred_ptr = b->predictor;
+ unsigned char *pred_ptr = b->predictor_base + b->predictor_offset;
unsigned char *dst_ptr = *(b->base_dst) + b->dst;
int stride = b->dst_stride;
vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);
b = &x->e_mbd.block[16];
- pred_ptr = b->predictor;
+ pred_ptr = b->predictor_base + b->predictor_offset;
dst_ptr = *(b->base_dst) + b->dst;
stride = b->dst_stride;
vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
b = &x->e_mbd.block[20];
- pred_ptr = b->predictor;
+ pred_ptr = b->predictor_base + b->predictor_offset;
dst_ptr = *(b->base_dst) + b->dst;
stride = b->dst_stride;
dequant_ptr = d->dequant;
coeff_ptr = b->coeff;
- qcoeff_ptr = d->qcoeff;
- dqcoeff_ptr = d->dqcoeff;
+ qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset;
i0 = !type;
eob = d->eob;
#include "invtrans.h"
-
-
static void recon_dcblock(MACROBLOCKD *x)
{
BLOCKD *b = &x->block[24];
for (i = 0; i < 16; i++)
{
- x->block[i].dqcoeff[0] = b->diff[i];
+ *(x->block[i].dqcoeff_base+x->block[i].dqcoeff_offset) = b->diff_base[b->diff_offset+i];
}
}
void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
{
if (b->eob > 1)
- IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+ IDCT_INVOKE(rtcd, idct16)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset], pitch);
else
- IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+ IDCT_INVOKE(rtcd, idct1)(b->dqcoeff_base + b->dqcoeff_offset, &b->diff_base[b->diff_offset], pitch);
}
-
+/* Only used in the encoder */
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
/* do 2nd order transform on the dc block */
- IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff, x->block[24].diff);
+ IDCT_INVOKE(rtcd, iwalsh16)(x->block[24].dqcoeff_base + x->block[24].dqcoeff_offset, &x->block[24].diff_base[x->block[24].diff_offset]);
recon_dcblock(x);
}
}
+
+/* Only used in encoder */
void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
int i;
{
vp8_inverse_transform_b(rtcd, &x->block[i], 16);
}
-
}
x->mode_info_context->mbmi.mode != SPLITMV)
{
/* do 2nd order transform on the dc block */
- IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
+ BLOCKD b = x->block[24];
+
+ IDCT_INVOKE(rtcd, iwalsh16)(b.dqcoeff_base+b.dqcoeff_offset, &b.diff_base[b.diff_offset]);
recon_dcblock(x);
}
#define __INC_INVTRANS_H
#include "vpx_ports/config.h"
-#include "idct.h"
-#include "blockd.h"
+#include "vp8/common/idct.h"
+#include "vp8/common/blockd.h"
extern void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch);
extern void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
extern void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type;
- if (!cm->use_bilinear_mc_filter)
- cm->mcomp_filter_type = SIXTAP;
- else
- cm->mcomp_filter_type = BILINEAR;
-
cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
cm->Width = cpi->oxcf.Width ;
unsigned char *sptr;
unsigned char *dptr;
sptr = (*(be->base_src) + be->src);
- dptr = b->predictor;
+ dptr = b->predictor_base + b->predictor_offset;
return VARIANCE_INVOKE(rtcd, get4x4sse_cs)(sptr, be->src_stride, dptr, 16, 0x7fffffff);
int this_rd;
rate = mode_costs[mode];
- vp8_predict_intra4x4(b, mode, b->predictor);
+ vp8_predict_intra4x4(b, mode, b->predictor_base + b->predictor_offset);
distortion = get_prediction_error(be, b, &rtcd->variance);
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
short *round_ptr = b->round;
short *quant_ptr = b->quant_fast;
unsigned char *quant_shift_ptr = b->quant_shift;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
+ short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset;
short *dequant_ptr = d->dequant;
vpx_memset(qcoeff_ptr, 0, 32);
short *coeff_ptr = b->coeff;
short *round_ptr = b->round;
short *quant_ptr = b->quant_fast;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
+ short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset;
short *dequant_ptr = d->dequant;
eob = -1;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
unsigned char *quant_shift_ptr = b->quant_shift;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
+ short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset;
short *dequant_ptr = d->dequant;
short zbin_oq_value = b->zbin_extra;
coeff_ptr = b->coeff;
quant_ptr = b->quant;
quant_shift_ptr = b->quant_shift;
- qcoeff_ptr = d->qcoeff;
- dqcoeff_ptr = d->dqcoeff;
+ qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset;
dequant_ptr = d->dequant;
eob = - 1;
vpx_memset(qcoeff_ptr, 0, 32);
short *zbin_ptr = b->zbin;
short *round_ptr = b->round;
short *quant_ptr = b->quant;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
+ short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset;
short *dequant_ptr = d->dequant;
short zbin_oq_value = b->zbin_extra;
void vp8_auto_select_speed(VP8_COMP *cpi)
{
- int used = cpi->oxcf.cpu_used;
+ //int used = cpi->oxcf.cpu_used;
int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
for (j = dc; j < 16; j++)
{
- int this_diff = be->coeff[j] - bd->dqcoeff[j];
+ short *dqcoeff = bd->dqcoeff_base + bd->dqcoeff_offset;
+ int this_diff = be->coeff[j] - dqcoeff[j];
berror += this_diff * this_diff;
}
for (i = 16; i < 24; i++)
{
+ short *dqcoeff;
+
be = &mb->block[i];
bd = &mb->e_mbd.block[i];
- error += vp8_block_error_c(be->coeff, bd->dqcoeff);
+ dqcoeff = bd->dqcoeff_base + bd->dqcoeff_offset;
+ error += vp8_block_error_c(be->coeff, dqcoeff);
}
return error;
int eob = b->eob;
int pt ; /* surrounding block/prev coef predictor */
int cost = 0;
- short *qcoeff_ptr = b->qcoeff;
+ short *qcoeff_ptr = b->qcoeff_base + b->qcoeff_offset;
VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
// Distortion
d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 1) << 2;
- d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff);
+ d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff_base + x_y2->dqcoeff_offset);
*Distortion = (d >> 4);
rate = bmode_costs[mode];
- vp8_predict_intra4x4(b, mode, b->predictor);
+ vp8_predict_intra4x4(b, mode, b->predictor_base + b->predictor_offset);
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
x->quantize_b(be, b);
ratey = cost_coeffs(x, b, PLANE_TYPE_Y_WITH_DC, &tempa, &templ);
rate += ratey;
- distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff) >> 2;
+ distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff_base + b->dqcoeff_offset) >> 2;
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
*best_mode = mode;
*a = tempa;
*l = templ;
- copy_predictor(best_predictor, b->predictor);
- vpx_memcpy(best_dqcoeff, b->dqcoeff, 32);
+ copy_predictor(best_predictor, b->predictor_base + b->predictor_offset);
+ vpx_memcpy(best_dqcoeff, b->dqcoeff_base + b->dqcoeff_offset, 32);
}
}
b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
- IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32);
- RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+ IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff_base + b->diff_offset, 32);
+ RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff_base + b->diff_offset, *(b->base_dst) + b->dst, b->dst_stride);
return best_rd;
}
//be->coeff[0] = 0;
x->quantize_b(be, bd);
- distortion += ENCODEMB_INVOKE(rtcd, berr)(be->coeff, bd->dqcoeff);
+ distortion += ENCODEMB_INVOKE(rtcd, berr)(be->coeff, bd->dqcoeff_base + bd->dqcoeff_offset);
}
}
const int eob = b->eob; /* one beyond last nonzero coeff */
TOKENEXTRA *t = *tp; /* store tokens starting here */
int x;
- const short *qcoeff_ptr = b->qcoeff;
+ const short *qcoeff_ptr = b->qcoeff_base + b->qcoeff_offset;
VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
do
const int eob = b->eob; /* one beyond last nonzero coeff */
TOKENEXTRA *t = *tp; /* store tokens starting here */
int x;
- const short *qcoeff_ptr = b->qcoeff;
+ const short *qcoeff_ptr = b->qcoeff_base + b->qcoeff_offset;
VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
do
int plane_type;
int b;
- TOKENEXTRA *start = *t;
- TOKENEXTRA *tp = *t;
+ //TOKENEXTRA *start = *t;
+ //TOKENEXTRA *tp = *t;
x->mode_info_context->mbmi.dc_diff = 1;
movdqa xmm3, [rsp + qcoeff + 16]
mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
- mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
+ mov rdi, [rsi + vp8_blockd_dqcoeff_base + vp8_blockd_dqcoeff_offset] ; dqcoeff_ptr
; y ^ sz
pxor xmm2, xmm0
movdqa xmm0, [rcx]
movdqa xmm1, [rcx + 16]
- mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
+ mov rcx, [rsi + vp8_blockd_qcoeff_base + vp8_blockd_qcoeff_offset] ; qcoeff_ptr
pmullw xmm0, xmm2
pmullw xmm1, xmm3
paddw xmm1, [rcx]
paddw xmm5, [rcx + 16]
- mov rax, [rsi + vp8_blockd_qcoeff]
+ mov rax, [rsi + vp8_blockd_qcoeff_base + vp8_blockd_qcoeff_offset]
mov rcx, [rsi + vp8_blockd_dequant]
- mov rdi, [rsi + vp8_blockd_dqcoeff]
+ mov rdi, [rsi + vp8_blockd_dqcoeff_base + vp8_blockd_dqcoeff_offset]
; y = x * quant >> 16
pmulhw xmm1, [rdx]
pmulhw xmm1, [rdx]
pmulhw xmm5, [rdx + 16]
- mov rax, [rsi + vp8_blockd_qcoeff]
+ mov rax, [rsi + vp8_blockd_qcoeff_base + vp8_blockd_qcoeff_offset]
mov rdi, [rsi + vp8_blockd_dequant]
- mov rcx, [rsi + vp8_blockd_dqcoeff]
+ mov rcx, [rsi + vp8_blockd_dqcoeff_base + vp8_blockd_dqcoeff_offset]
pxor xmm1, xmm0
pxor xmm5, xmm4
short *zbin_ptr = b->zbin;
short *round_ptr = b->round;
short *quant_ptr = b->quant_fast;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
+ short *qcoeff_ptr = d->qcoeff_base + d->qcoeff_offset;
+ short *dqcoeff_ptr = d->dqcoeff_base + d->dqcoeff_offset;
short *dequant_ptr = d->dequant;
d->eob = vp8_fast_quantize_b_impl_mmx(
static int mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
short *coeff_ptr = mb->block[0].coeff;
- short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff_base + mb->e_mbd.block[0].dqcoeff_offset;
return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}
unsigned char *z = *(be->base_src) + be->src;
unsigned int src_stride = be->src_stride;
short *diff = &be->src_diff[0];
- unsigned char *predictor = &bd->predictor[0];
+ unsigned char *predictor = bd->predictor_base + bd->predictor_offset;
vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}
#endif
#if HAVE_SSE2
+
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
short *coeff_ptr = mb->block[0].coeff;
- short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
+ short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff_base + mb->e_mbd.block[0].dqcoeff_offset;
return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}
unsigned char *z = *(be->base_src) + be->src;
unsigned int src_stride = be->src_stride;
short *diff = &be->src_diff[0];
- unsigned char *predictor = &bd->predictor[0];
+ unsigned char *predictor = bd->predictor_base + bd->predictor_offset;
vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}
VP8_COMMON_SRCS-yes += common/g_common.h
VP8_COMMON_SRCS-yes += common/header.h
VP8_COMMON_SRCS-yes += common/idct.h
-VP8_COMMON_SRCS-yes += common/invtrans.h
VP8_COMMON_SRCS-yes += common/loopfilter.h
VP8_COMMON_SRCS-yes += common/modecont.h
VP8_COMMON_SRCS-yes += common/mv.h
VP8_COMMON_SRCS-yes += common/systemdependent.h
VP8_COMMON_SRCS-yes += common/threading.h
VP8_COMMON_SRCS-yes += common/treecoder.h
-VP8_COMMON_SRCS-yes += common/invtrans.c
VP8_COMMON_SRCS-yes += common/loopfilter.c
VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
VP8_COMMON_SRCS-yes += common/mbpitch.c
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c
+
+#Append OpenCL source files to source listing if needed
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/vp8_opencl.c
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/vp8_opencl.h
+
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/blockd_cl.h
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/blockd_cl.c
+
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/filter_cl.h
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/filter_cl.c
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/filter_cl.cl
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/subpixel_cl.h
+
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/reconinter_cl.h
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/reconinter_cl.c
+
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/idctllm_cl.h
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/idctllm_cl.c
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/idctllm_cl.cl
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/idct_cl.h
+
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter_cl.h
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter_cl.c
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter_cl.cl
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/loopfilter_filters_cl.c
+
+
+VP8_COMMON_SRCS-$(CONFIG_OPENCL) += common/opencl/opencl_systemdependent.c
+VP8_COMMON_SRCS-$(HAVE_DLOPEN) += common/opencl/dynamic_cl.c
+VP8_COMMON_SRCS-$(HAVE_DLOPEN) += common/opencl/dynamic_cl.h
#include "common/onyxd.h"
#include "decoder/onyxd_int.h"
+#if CONFIG_OPENCL
+#include "common/opencl/vp8_opencl.h"
+#endif
+
#define VP8_CAP_POSTPROC (CONFIG_POSTPROC ? VPX_CODEC_CAP_POSTPROC : 0)
typedef vpx_codec_stream_info_t vp8_stream_info_t;
ctx->mmaps[i].dtor(&ctx->mmaps[i]);
}
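+ /* Tear down OpenCL state when the decoder instance is destroyed; when the
+ * CL library was loaded at runtime via dlopen(), close_cl() presumably
+ * unloads it again (see dynamic_cl.c). */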
+#if CONFIG_OPENCL
+ if (cl_initialized == CL_SUCCESS){
+ cl_destroy(NULL, VP8_CL_NOT_INITIALIZED);
+#if HAVE_DLOPEN
+ close_cl();
+#endif
+ }
+#endif
+
return VPX_CODEC_OK;
}
VP8_CX_SRCS-yes += encoder/dct.c
VP8_CX_SRCS-yes += encoder/encodeframe.c
VP8_CX_SRCS-yes += encoder/encodeintra.c
+VP8_CX_SRCS-yes += encoder/invtrans.h
+VP8_CX_SRCS-yes += encoder/invtrans.c
VP8_CX_SRCS-yes += encoder/encodemb.c
VP8_CX_SRCS-yes += encoder/encodemv.c
VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/dequantize_mmx.asm
VP8_DX_SRCS-$(HAVE_MMX) += decoder/x86/idct_blk_mmx.c
VP8_DX_SRCS-$(HAVE_SSE2) += decoder/x86/idct_blk_sse2.c
+
+
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/vp8_decode_cl.c
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/vp8_decode_cl.h
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/opencl_systemdependent.c
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/dequantize_cl.c
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/dequantize_cl.h
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/dequantize_cl.cl
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/idct_blk_cl.c
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/decodframe_cl.c
+VP8_DX_SRCS-$(CONFIG_OPENCL) += decoder/opencl/decodframe_cl.h
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "../vpx_config.h"
#ifndef VPX_PORTS_MEM_H
#define VPX_PORTS_MEM_H
-#include "vpx_config.h"
+#include "../vpx_config.h"
#include "vpx/vpx_integer.h"
#if defined(__GNUC__) && __GNUC__
{
duck_free(ybf->buffer_alloc);
- ybf->buffer_alloc = 0;
+ ybf->buffer_alloc = NULL;
+ ybf->buffer_size = -1;
+
+ ybf->y_buffer = NULL;
+ ybf->u_buffer = NULL;
+ ybf->v_buffer = NULL;
+
+#if CONFIG_OPENCL
+ if (cl_initialized == CL_SUCCESS){
+ if (ybf->buffer_mem){
+ clReleaseMemObject(ybf->buffer_mem);
+ ybf->buffer_mem = NULL;
+ }
+ }
+#endif
+
}
else
{
* when we have a large motion vector in V on the last v block.
* Note : We never use these pixels anyway so this doesn't hurt.
*/
+
ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0);
if (ybf->buffer_alloc == NULL)
return -1;
+ ybf->buffer_size = ybf->frame_size + (ybf->y_stride * 2) + 32;
+
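+ /* With OpenCL enabled, also create a device-side buffer of the same size as
+ * the host allocation. If creation fails, cl_destroy() is called with
+ * VP8_CL_TRIED_BUT_FAILED, which presumably disables further CL use and
+ * lets the decoder continue on the CPU. */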
+#if CONFIG_OPENCL
+ ybf->buffer_mem = NULL;
+ if (cl_initialized == CL_SUCCESS){
+ ybf->buffer_mem = clCreateBuffer(cl_data.context, CL_MEM_READ_WRITE, ybf->buffer_size, NULL, NULL);
+ if (ybf->buffer_mem == NULL){
+ cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
+ }
+ }
+#endif
+
ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border;
if (yplane_size & 0xf)
yplane_size += 16 - (yplane_size & 0xf);
ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2;
- ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2;
+ ybf->v_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2 + uvplane_size;
ybf->corrupted = 0; /* assume not currupted by errors */
}
#define VP7BORDERINPIXELS 48
#define VP8BORDERINPIXELS 32
+#include "../vpx_config.h"
+
+#if CONFIG_OPENCL
+#include "../vp8/common/opencl/vp8_opencl.h"
+#endif
+
/*************************************
For INT_YUV:
unsigned char *v_buffer;
unsigned char *buffer_alloc;
+ int buffer_size; /* size of buffer_alloc, in bytes */
+#if CONFIG_OPENCL
+ cl_mem buffer_mem; /* device-side buffer of the same size, used when OpenCL is enabled */
+#endif
+
int border;
int frame_size;
YUV_TYPE clrtype;