\r
#ifdef USE_OPENCL\r
\r
-#include <math.h>\r
-#include "common.h"\r
-#include "openclwrapper.h"\r
-#define MaxFilterLength 16\r
-#define FILTER_LEN 4\r
-\r
-inline double hb_fit_gauss_kernel( double x )\r
-{\r
- double powNum = -1 * M_PI;\r
-\r
- powNum *= x;\r
-\r
- powNum *= x;\r
-\r
- return exp( powNum );\r
-}\r
-\r
-/**\r
- * Using gaussian algorithm to calculate the scale filter\r
- */\r
-static void hb_set_gauss_interpolation( float *pcoeff, int *pmappedindex, int targetdatalength, int srcdatalength, int filterLength, float bias )\r
-{\r
- int i, j;\r
-\r
- float gausskernel[MaxFilterLength];\r
-\r
- int half = filterLength / 2;\r
-\r
- float scalerate = (float)(srcdatalength) / targetdatalength;\r
-\r
- for( i = 0; i < targetdatalength; ++i )\r
- {\r
- float flindex = i * scalerate + bias;\r
-\r
- if( flindex > (srcdatalength - 1))\r
- {\r
- flindex -= (int)(flindex - (srcdatalength - 1));\r
- }\r
-\r
- int srcindex = (int)(flindex);\r
-\r
- float t = flindex - srcindex;\r
-\r
- for( j = 0; j < (int)half; j++ )\r
- {\r
- gausskernel[j] = (float)hb_fit_gauss_kernel((half - j) - 1 + t );\r
- }\r
-\r
- for( j = 0; j < (int)half; j++ )\r
- {\r
- gausskernel[half + j] = (float)hb_fit_gauss_kernel( j + 1 - t );\r
- }\r
-\r
- while( srcindex < (int)half - 1 )\r
- {\r
- /* -1 0 1 2\r
- * M1 S P1 P2\r
- *\r
- * if srcindex is 0, M1 and S will be the same sample. To keep the\r
- * convolution kernel from having to check for edge conditions, move\r
- * srcindex to 1, then slide down the coefficients\r
- */\r
- srcindex += 1;\r
-\r
- gausskernel[0] += gausskernel[1];\r
-\r
- for( j = 1; j < filterLength - 1; j++ )\r
- {\r
- gausskernel[j] = gausskernel[j + 1];\r
- }\r
-\r
- gausskernel[filterLength - 1] = 0;\r
- }\r
-\r
- while( srcindex >= srcdatalength - half )\r
- {\r
- /* If srcindex is near the edge, shift down srcindex and slide up\r
- * the coefficients\r
- */\r
- srcindex -= 1;\r
-\r
- gausskernel[3] += gausskernel[2];\r
-\r
- for( j = filterLength - 2; j > 0; j-- )\r
- {\r
- gausskernel[j] = gausskernel[j - 1];\r
- }\r
-\r
- gausskernel[0] = 0;\r
- }\r
-\r
- *pmappedindex++ = srcindex - half + 1;\r
-\r
- // Store normalized Gauss kernel\r
-\r
- float sumtemp = 0;\r
-\r
- for( j = 0; j < filterLength; ++j )\r
- {\r
- sumtemp += gausskernel[j];\r
- }\r
-\r
- for( j = 0; j < filterLength; ++j )\r
- {\r
- pcoeff[targetdatalength * j + i] = gausskernel[j] / sumtemp;\r
- }\r
- }\r
-}\r
-\r
-/**\r
-* executive scale using opencl\r
-* get filter args\r
-* create output buffer\r
-* create horizontal filter buffer\r
-* create vertical filter buffer\r
-* create kernels\r
-*/\r
-int hb_ocl_scale_func( void **data, KernelEnv *kenv )\r
-{\r
- cl_int status;\r
-\r
- uint8_t *in_frame = data[0];\r
- uint8_t *out_frame = data[1];\r
- int in_frame_w = (int)data[2];\r
- int in_frame_h = (int)data[3];\r
- int out_frame_w = (int)data[4];\r
- int out_frame_h = (int)data[5];\r
- hb_oclscale_t *os = data[6];\r
-\r
- if( os->use_ocl_mem )\r
- os->h_in_buf = data[0];\r
- int h_filter_len = FILTER_LEN;\r
- int v_filter_len = FILTER_LEN;\r
- //it will make the psnr lower when filter length is 4 in the condition that the video width is shorter than 960 and width is shorter than 544,so we set the filter length to 2\r
- if( out_frame_w <= 960 && out_frame_h <= 544 ) \r
- {\r
- h_filter_len>>=1;\r
- v_filter_len>>=1;\r
- }\r
- if( !os->h_out_buf )\r
- {\r
- hb_log( "OpenCL: Scaling With OpenCL" );\r
- //malloc filter args\r
- float *hf_y, *hf_uv, *vf_y, *vf_uv;\r
- int *hi_y, *hi_uv, *vi_y, *vi_uv;\r
- hf_y = (float*)malloc( sizeof(float)*out_frame_w * h_filter_len );\r
- hf_uv = (float*)malloc( sizeof(float)*(out_frame_w>>1) * h_filter_len );\r
- hi_y = (int*)malloc( sizeof(int)*out_frame_w );\r
- hi_uv = (int*)malloc( sizeof(int)*(out_frame_w>>1));\r
- vf_y = (float*)malloc( sizeof(float)*out_frame_h * v_filter_len );\r
- vf_uv = (float*)malloc( sizeof(float)*(out_frame_h>>1) * v_filter_len );\r
- vi_y = (int*)malloc( sizeof(int)*out_frame_h );\r
- vi_uv = (int*)malloc( sizeof(int)*(out_frame_h>>1) );\r
- //get filter args\r
- hb_set_gauss_interpolation( hf_y, hi_y, out_frame_w, in_frame_w, h_filter_len, 0 );\r
- hb_set_gauss_interpolation( hf_uv, hi_uv, out_frame_w>>1, in_frame_w>>1, h_filter_len, 0 );\r
- hb_set_gauss_interpolation( vf_y, vi_y, out_frame_h, in_frame_h, v_filter_len, 0 );\r
- hb_set_gauss_interpolation( vf_uv, vi_uv, out_frame_h>>1, in_frame_h>>1, v_filter_len, 0 );\r
- //create output buffer\r
- if( !os->use_ocl_mem )\r
- {\r
- CREATEBUF( os->h_in_buf, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, sizeof(uint8_t)* in_frame_w * in_frame_h*3/2 );\r
- }\r
- CREATEBUF( os->h_out_buf, CL_MEM_WRITE_ONLY, sizeof(uint8_t) * out_frame_w * in_frame_h*3/2 );\r
- CREATEBUF( os->v_out_buf, CL_MEM_WRITE_ONLY, sizeof(uint8_t) * out_frame_w * out_frame_h*3/2 );\r
- //create horizontal filter buffer\r
- CREATEBUF( os->h_coeff_y, CL_MEM_READ_ONLY, sizeof(float) * out_frame_w * h_filter_len );\r
- CREATEBUF( os->h_coeff_uv, CL_MEM_READ_ONLY, sizeof(float) * (out_frame_w>>1) * h_filter_len );\r
- CREATEBUF( os->h_index_y, CL_MEM_READ_ONLY, sizeof(int) * out_frame_w );\r
- CREATEBUF( os->h_index_uv, CL_MEM_READ_ONLY, sizeof(int) * (out_frame_w>>1) );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_coeff_y, CL_TRUE, 0, sizeof(float) * out_frame_w * h_filter_len, hf_y, 0, NULL, NULL );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_coeff_uv, CL_TRUE, 0, sizeof(float) * (out_frame_w>>1) * h_filter_len, hf_uv, 0, NULL, NULL );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_index_y, CL_TRUE, 0, sizeof(int) * out_frame_w, hi_y, 0, NULL, NULL );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_index_uv, CL_TRUE, 0, sizeof(int) * (out_frame_w>>1), hi_uv, 0, NULL, NULL );\r
- //create vertical filter buffer\r
- CREATEBUF( os->v_coeff_y, CL_MEM_READ_ONLY, sizeof(float) * out_frame_h * v_filter_len );\r
- CREATEBUF( os->v_coeff_uv, CL_MEM_READ_ONLY, sizeof(float) * (out_frame_h>>1) * v_filter_len );\r
- CREATEBUF( os->v_index_y, CL_MEM_READ_ONLY, sizeof(int) * out_frame_h );\r
- CREATEBUF( os->v_index_uv, CL_MEM_READ_ONLY, sizeof(int) * (out_frame_h>>1) );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_coeff_y, CL_TRUE, 0, sizeof(float) * out_frame_h * v_filter_len, vf_y, 0, NULL, NULL );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_coeff_uv, CL_TRUE, 0, sizeof(float) * (out_frame_h>>1) * v_filter_len, vf_uv, 0, NULL, NULL );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_index_y, CL_TRUE, 0, sizeof(int) * out_frame_h, vi_y, 0, NULL, NULL );\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_index_uv, CL_TRUE, 0, sizeof(int) * (out_frame_h>>1), vi_uv, 0, NULL, NULL );\r
- //create horizontal kernel\r
- os->h_kernel = clCreateKernel( kenv->program, "frame_h_scale", NULL );\r
- OCLCHECK( clSetKernelArg, os->h_kernel, 1, sizeof(cl_mem), &os->h_coeff_y );\r
- OCLCHECK( clSetKernelArg, os->h_kernel, 2, sizeof(cl_mem), &os->h_coeff_uv );\r
- OCLCHECK( clSetKernelArg, os->h_kernel, 3, sizeof(cl_mem), &os->h_index_y );\r
- OCLCHECK( clSetKernelArg, os->h_kernel, 4, sizeof(cl_mem), &os->h_index_uv );\r
- OCLCHECK( clSetKernelArg, os->h_kernel, 6, sizeof(int), &in_frame_w );\r
- OCLCHECK( clSetKernelArg, os->h_kernel, 7, sizeof(int), &h_filter_len );\r
- //create vertical kernel\r
- os->v_kernel = clCreateKernel( kenv->program, "frame_v_scale", NULL );\r
- OCLCHECK( clSetKernelArg, os->v_kernel, 1, sizeof(cl_mem), &os->v_coeff_y );\r
- OCLCHECK( clSetKernelArg, os->v_kernel, 2, sizeof(cl_mem), &os->v_coeff_uv );\r
- OCLCHECK( clSetKernelArg, os->v_kernel, 3, sizeof(cl_mem), &os->v_index_y );\r
- OCLCHECK( clSetKernelArg, os->v_kernel, 4, sizeof(cl_mem), &os->v_index_uv );\r
- OCLCHECK( clSetKernelArg, os->v_kernel, 6, sizeof(int), &in_frame_h );\r
- OCLCHECK( clSetKernelArg, os->v_kernel, 7, sizeof(int), &v_filter_len );\r
- free( hf_y );\r
- free( hf_uv );\r
- free( vf_y );\r
- free( vf_uv );\r
- free( hi_y );\r
- free( hi_uv );\r
- free( vi_y );\r
- free( vi_uv );\r
- }\r
- //start horizontal scaling kernel\r
-\r
- if( !os->use_ocl_mem )\r
- {\r
- if( kenv->isAMD )\r
- {\r
- char *mapped = clEnqueueMapBuffer( kenv->command_queue, os->h_in_buf, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, sizeof(uint8_t) * in_frame_w * in_frame_h*3/2, 0, NULL, NULL, NULL );\r
- memcpy( mapped, in_frame, sizeof(uint8_t) * in_frame_w * in_frame_h*3/2 );\r
- clEnqueueUnmapMemObject( kenv->command_queue, os->h_in_buf, mapped, 0, NULL, NULL );\r
- }\r
- else\r
- {\r
- OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_in_buf, CL_TRUE, 0, sizeof(uint8_t) * in_frame_w * in_frame_h * 3/2, in_frame, 0, NULL, NULL );\r
- }\r
- }\r
-\r
- kenv->kernel = os->h_kernel;\r
- size_t dims[2];\r
- dims[0] = out_frame_w;\r
- dims[1] = in_frame_h;\r
- OCLCHECK( clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), &os->h_in_buf );\r
- OCLCHECK( clSetKernelArg, kenv->kernel, 5, sizeof(cl_mem), &os->h_out_buf );\r
- OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, dims, NULL, 0, NULL, NULL );\r
- //start vertical scaling kernel\r
-\r
- kenv->kernel = os->v_kernel;\r
- dims[0] = out_frame_w;\r
- dims[1] = out_frame_h;\r
- OCLCHECK( clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), &os->h_out_buf );\r
- OCLCHECK( clSetKernelArg, kenv->kernel, 5, sizeof(cl_mem), &os->v_out_buf );\r
- OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, dims, NULL, 0, NULL, NULL );\r
- OCLCHECK( clEnqueueReadBuffer, kenv->command_queue, os->v_out_buf, CL_TRUE, 0, sizeof(uint8_t) * out_frame_w * out_frame_h * 3/2, out_frame, 0, NULL, NULL );\r
-\r
- return 1;\r
-}\r
-\r
-/**\r
-* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm\r
-* parameter:\r
+#include <math.h>
+#include "common.h"
+#include "openclwrapper.h"
+#define FILTER_LEN 4
+
+#define _A -0.5f
+
+cl_float cubic(cl_float x)
+{
+ if (x < 0)
+ x = -x;
+
+ if (x < 1)
+ return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1;
+ else if (x < 2)
+ return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A);
+ else
+ return 0;
+}
+
+
+cl_float *hb_bicubic_weights(cl_float scale, int length)
+{
+ cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4);
+
+ int i; // C rocks
+ cl_float *out = weights;
+ for (i = 0; i < length; ++i)
+ {
+ cl_float x = i / scale;
+ cl_float dx = x - (int)x;
+ *out++ = cubic(-dx - 1.0f);
+ *out++ = cubic(-dx);
+ *out++ = cubic(-dx + 1.0f);
+ *out++ = cubic(-dx + 2.0f);
+ }
+ return weights;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv);
+
+/**
+* executive scale using opencl
+* get filter args
+* create output buffer
+* create horizontal filter buffer
+* create vertical filter buffer
+* create kernels
+*/
+int hb_ocl_scale_func( void **data, KernelEnv *kenv )
+{
+ cl_int status;
+
+ cl_mem in_buf = data[0];
+ cl_mem out_buf = data[1];
+ int crop_top = data[2];
+ int crop_bottom = data[3];
+ int crop_left = data[4];
+ int crop_right = data[5];
+ int in_frame_w = (int)data[6];
+ int in_frame_h = (int)data[7];
+ int out_frame_w = (int)data[8];
+ int out_frame_h = (int)data[9];
+ hb_oclscale_t *os = data[10];
+ hb_buffer_t *in = data[11];
+ hb_buffer_t *out = data[12];
+
+ if (os->initialized == 0)
+ {
+ hb_log( "Scaling With OpenCL" );
+ if (kenv->isAMD != 0)
+ hb_log( "Using Zero Copy");
+ // create the block kernel
+ cl_int status;
+ os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status );
+
+ os->initialized = 1;
+ }
+
+ {
+ // Use the new kernel
+ cl_event events[5];
+ int eventCount = 0;
+
+ if (kenv->isAMD == 0) {
+ status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]);
+ status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]);
+ }
+
+ cl_int srcPlaneOffset0 = in->plane[0].data - in->data;
+ cl_int srcPlaneOffset1 = in->plane[1].data - in->data;
+ cl_int srcPlaneOffset2 = in->plane[2].data - in->data;
+ cl_int srcRowWords0 = in->plane[0].stride;
+ cl_int srcRowWords1 = in->plane[1].stride;
+ cl_int srcRowWords2 = in->plane[2].stride;
+ cl_int dstPlaneOffset0 = out->plane[0].data - out->data;
+ cl_int dstPlaneOffset1 = out->plane[1].data - out->data;
+ cl_int dstPlaneOffset2 = out->plane[2].data - out->data;
+ cl_int dstRowWords0 = out->plane[0].stride;
+ cl_int dstRowWords1 = out->plane[1].stride;
+ cl_int dstRowWords2 = out->plane[2].stride;
+
+ if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) {
+ srcPlaneOffset0 += crop_left + crop_top * srcRowWords0;
+ srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1;
+ srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2;
+ in_frame_w = in_frame_w - crop_right - crop_left;
+ in_frame_h = in_frame_h - crop_bottom - crop_top;
+ }
+
+ cl_float xscale = (out_frame_w * 1.0f) / in_frame_w;
+ cl_float yscale = (out_frame_h * 1.0f) / in_frame_h;
+ setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv);
+
+ OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w ); // FIXME: type mismatch
+ OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights );
+
+ size_t workOffset[] = { 0, 0, 0 };
+ size_t globalWorkSize[] = { 1, 1, 1 };
+ size_t localWorkSize[] = { 1, 1, 1 };
+
+ int xgroups = (out_frame_w + 63) / 64;
+ int ygroups = (out_frame_h + 15) / 16;
+
+ localWorkSize[0] = 64;
+ localWorkSize[1] = 1;
+ localWorkSize[2] = 1;
+ globalWorkSize[0] = xgroups * 64;
+ globalWorkSize[1] = ygroups;
+ globalWorkSize[2] = 3;
+
+ OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] );
+ ++eventCount;
+
+ if (kenv->isAMD == 0) {
+ in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status);
+ out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status);
+ eventCount += 2;
+ }
+
+ clFlush(kenv->command_queue);
+ clWaitForEvents(eventCount, &events[0]);
+ int i;
+ for (i = 0; i < eventCount; ++i)
+ clReleaseEvent(events[i]);
+ }
+
+ return 1;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) {
+ cl_int status;
+ if (os->xscale != xscale || os->width < width) {
+ cl_float *xweights = hb_bicubic_weights(xscale, width);
+ CL_FREE(os->bicubic_x_weights);
+ CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4);
+ OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL );
+ os->width = width;
+ os->xscale = xscale;
+ free(xweights);
+ }
+
+ if ((os->yscale != yscale) || (os->height < height)) {
+ cl_float *yweights = hb_bicubic_weights(yscale, height);
+ CL_FREE(os->bicubic_y_weights);
+ CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4);
+ OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL );
+ os->height = height;
+ os->yscale = yscale;
+ free(yweights);
+ }
+ return 0;
+}
+
+
+/**
+* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm
+* parameter:
* inputFrameBuffer: the source video frame opencl buffer\r
* outputdata: the destination video frame buffer\r
* inputWidth: the width of the source video frame\r
* inputHeight: the height of the source video frame\r
-* outputWidth: the width of destination video frame\r
-* outputHeight: the height of destination video frame\r
-*/\r
-int hb_ocl_scale( cl_mem in_buf, uint8_t *in_data, uint8_t *out_data, int in_w, int in_h, int out_w, int out_h, hb_oclscale_t *os )\r
-{\r
- void *data[7];\r
- static int init_flag = 0;\r
- if( init_flag == 0 )\r
- {\r
- int st = hb_register_kernel_wrapper( "frame_h_scale", hb_ocl_scale_func );\r
- if( !st )\r
- {\r
- hb_log( "OpenCL: Register kernel[%s] failed", "frame_h_scale" );\r
- return 0;\r
- }\r
- init_flag++;\r
- }\r
-\r
- if( in_data==NULL )\r
- {\r
- data[0] = in_buf;\r
- os->use_ocl_mem = 1;\r
- }\r
- else\r
- {\r
- data[0] = in_data;\r
- os->use_ocl_mem = 0;\r
- }\r
-\r
- data[1] = out_data;\r
- data[2] = (void*)in_w;\r
- data[3] = (void*)in_h;\r
- data[4] = (void*)out_w;\r
- data[5] = (void*)out_h;\r
- data[6] = os;\r
-\r
- if( !hb_run_kernel( "frame_h_scale", data ) )\r
- {\r
- hb_log( "OpenCL: Run kernel[%s] failed", "frame_scale" );\r
- }\r
-\r
- return 0;\r
-}\r
-#endif\r
+* outputWidth: the width of destination video frame
+* outputHeight: the height of destination video frame
+*/
+
+
+static int s_scale_init_flag = 0;
+
+int do_scale_init()
+{
+ if ( s_scale_init_flag==0 )
+ {
+ int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func );
+ if( !st )
+ {
+ hb_log( "register kernel[%s] failed", "frame_scale" );
+ return 0;
+ }
+ s_scale_init_flag++;
+ }
+ return 1;
+}
+
+
+int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os)
+{
+ void *data[13];
+
+ if (do_scale_init() == 0)
+ return 0;
+
+ data[0] = in->cl.buffer;
+ data[1] = out->cl.buffer;
+ data[2] = (void*)(crop[0]);
+ data[3] = (void*)(crop[1]);
+ data[4] = (void*)(crop[2]);
+ data[5] = (void*)(crop[3]);
+ data[6] = (void*)(in->f.width);
+ data[7] = (void*)(in->f.height);
+ data[8] = (void*)(out->f.width);
+ data[9] = (void*)(out->f.height);
+ data[10] = os;
+ data[11] = in;
+ data[12] = out;
+
+ if( !hb_run_kernel( "frame_scale", data ) )
+ hb_log( "run kernel[%s] failed", "frame_scale" );
+ return 0;
+}
+
+
+
+
+
+#endif
char kernelName[MAX_KERNEL_NAME_LEN+1];\r
char * kernelStr;\r
}hb_kernel_node;\r
-\r
-static GPUEnv gpu_env;\r
-static int isInited = 0;\r
-static hb_kernel_node gKernels[MAX_KERNEL_NUM];\r
-\r
-#define ADD_KERNEL_CFG( idx, s, p ){\\r
+
+static GPUEnv gpu_env;
+static int isInited = 0;
+static int useBuffers = 0;
+static hb_kernel_node gKernels[MAX_KERNEL_NUM];
+
+#define ADD_KERNEL_CFG( idx, s, p ){\
strcpy( gKernels[idx].kernelName, s );\\r
gKernels[idx].kernelStr = p;\\r
strcpy( gpu_env.kernel_names[idx], s );\\r
\r
/**\r
* hb_regist_opencl_kernel\r
- */\r
-int hb_regist_opencl_kernel()\r
-{\r
- if( !gpu_env.isUserCreated )\r
- memset( &gpu_env, 0, sizeof(gpu_env) );\r
-\r
- gpu_env.file_count = 0; //argc;\r
- gpu_env.kernel_count = 0UL;\r
-\r
- ADD_KERNEL_CFG( 0, "frame_h_scale", NULL )\r
- ADD_KERNEL_CFG( 1, "frame_v_scale", NULL )\r
- ADD_KERNEL_CFG( 2, "nv12toyuv", NULL )\r
- ADD_KERNEL_CFG( 3, "scale_opencl", NULL )\r
- ADD_KERNEL_CFG( 4, "yadif_filter", NULL )\r
-\r
- return 0;\r
-}\r
+ */
+int hb_regist_opencl_kernel()
+{
+ //if( !gpu_env.isUserCreated )
+ // memset( &gpu_env, 0, sizeof(gpu_env) );
+ //Comment for posterity: When in doubt just zero out a structure full of pointers to allocated resources.
+
+ gpu_env.file_count = 0; //argc;
+ gpu_env.kernel_count = 0UL;
+
+ ADD_KERNEL_CFG( 0, "frame_scale", NULL )
+ ADD_KERNEL_CFG( 1, "yadif_filter", NULL )
+
+ return 0;
+}
\r
/**\r
* hb_regist_opencl_kernel\r
}\r
\r
/**\r
- * hb_init_opencl_env\r
- * @param gpu_info -\r
- */\r
-int hb_init_opencl_env( GPUEnv *gpu_info )\r
-{\r
- size_t length;\r
+ * hb_init_opencl_env
+ * @param gpu_info -
+ */
+
+static int init_once = 0;
+int hb_init_opencl_env( GPUEnv *gpu_info )
+{
+ size_t length;
cl_int status;\r
cl_uint numPlatforms, numDevices;\r
cl_platform_id *platforms;\r
cl_context_properties cps[3];\r
char platformName[100];\r
- unsigned int i;\r
- void *handle = INVALID_HANDLE_VALUE;\r
-\r
- /*\r
- * Have a look at the available platforms.\r
- */\r
+ unsigned int i;
+ void *handle = INVALID_HANDLE_VALUE;
+
+
+ if (init_once != 0)
+ return 0;
+ else
+ init_once = 1;
+ /*
+ * Have a look at the available platforms.
+ */
if( !gpu_info->isUserCreated )\r
{\r
status = clGetPlatformIDs( 0, NULL, &numPlatforms );\r
\r
#ifdef USE_EXTERNAL_KERNEL\r
status = hb_convert_to_string( filename, &source_str, gpu_info, idx );\r
- if( status == 0 )\r
- return(0);\r
-#else\r
- int kernel_src_size = strlen( kernel_src_hscale ) + strlen( kernel_src_vscale ) \r
- + strlen( kernel_src_nvtoyuv ) + strlen( kernel_src_hscaleall ) \r
- + strlen( kernel_src_hscalefast ) + strlen( kernel_src_vscalealldither ) \r
- + strlen( kernel_src_vscaleallnodither ) + strlen( kernel_src_vscalefast )\r
- + strlen( kernel_src_yadif_filter );\r
- source_str = (char*)malloc( kernel_src_size + 2 );\r
- strcpy( source_str, kernel_src_hscale );\r
- strcat( source_str, kernel_src_vscale );\r
- strcat( source_str, kernel_src_nvtoyuv );\r
- strcat( source_str, kernel_src_hscaleall );\r
- strcat( source_str, kernel_src_hscalefast );\r
- strcat( source_str, kernel_src_vscalealldither );\r
- strcat( source_str, kernel_src_vscaleallnodither );\r
- strcat( source_str, kernel_src_vscalefast );\r
- strcat( source_str, kernel_src_yadif_filter );\r
-#endif\r
-\r
+ if( status == 0 )
+ return(0);
+#else
+ int kernel_src_size = strlen(kernel_src_scale) + strlen(kernel_src_yadif_filter);
+
+// char *scale_src;
+// status = hb_convert_to_string("./scale_kernels.cl", &scale_src, gpu_info, idx);
+// if (status != 0)
+// kernel_src_size += strlen(scale_src);
+
+ source_str = (char*)malloc( kernel_src_size + 2 );
+ strcpy( source_str, kernel_src_scale );
+// strcat( source_str, scale_src ); //
+ strcat( source_str, kernel_src_yadif_filter );
+#endif
+
source = source_str;\r
source_size[0] = strlen( source );\r
\r
}\r
\r
strcpy( gpu_env.kernelSrcFile[idx], filename );\r
-\r
- if (binaryExisted != 1)\r
- {\r
- hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename);\r
- }\r
-\r
- gpu_info->file_count += 1;\r
+
+ if (binaryExisted != 1)
+ {
+ //hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename);
+ }
+
+ gpu_info->file_count += 1;
\r
return(1);\r
}\r
if( status == 0 || gpu_env.kernel_count == 0 )\r
{\r
return(1);\r
-\r
- }\r
-\r
- isInited = 1;\r
- }\r
-\r
+
+ }
+
+ useBuffers = 1;
+ isInited = 1;
+ }
+
return(0);\r
}\r
\r
return 0; \r
}\r
\r
- return 1;\r
-}\r
-\r
-int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)\r
-{\r
- int status = clEnqueueCopyBuffer(gpu_env.command_queue,\r
+ return 1;
+}
+
+int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size)
+{
+ int status;
+ int flags = CL_MEM_ALLOC_HOST_PTR;
+ //cl_event event;
+ *mem = clCreateBuffer(gpu_env.context, flags, size, NULL, &status);
+ *addr = clEnqueueMapBuffer(gpu_env.command_queue, *mem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL/*&event*/, &status);
+
+ //hb_log("\t **** context: %.8x cmdqueue: %.8x cl_mem: %.8x mapaddr: %.8x size: %d status: %d", gpu_env.context, gpu_env.command_queue, mem, addr, size, status);
+
+ return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr)
+{
+ cl_event event;
+ int status = clEnqueueUnmapMemObject(gpu_env.command_queue, mem, addr, 0, NULL, &event);
+ if (status == CL_SUCCESS)
+ clWaitForEvents(1, &event);
+ else
+ hb_log("hb_free_mapped_buffer: error %d", status);
+ return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+void hb_opencl_init()
+{
+ hb_get_opencl_env();
+}
+
+int hb_use_buffers()
+{
+ return useBuffers;
+}
+
+int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)
+{
+ int status = clEnqueueCopyBuffer(gpu_env.command_queue,
src_buffer,\r
dst_buffer,\r
src_offset, dst_offset, cb,\r