{
// Decomb parameters
int mode;
+ int use_opencl;
int filter_mode;
int spatial_metric;
int motion_threshold;
taskset_t mask_dilate_taskset; // Threads for decomb mask dilate
taskset_t eedi2_taskset; // Threads for eedi2 - one per plane
+
+ void* cl_mem_dst;
+ void* cl_mem_prev;
+ void* cl_mem_cur;
+ void* cl_mem_next;
};
static int hb_decomb_init( hb_filter_object_t * filter,
build_gamma_lut( pv );
+ pv->use_opencl = init->job->use_opencl;
+
pv->deinterlaced_frames = 0;
pv->blended_frames = 0;
pv->unfiltered_frames = 0;
memset(pv->mask_filtered->data, 0, pv->mask_filtered->size);
memset(pv->mask_temp->data, 0, pv->mask_temp->size);
+#ifdef USE_OPENCL
+ if(pv->use_opencl){
+ hb_log("decomb with OpenCL");
+ if( !hb_create_buffer( &(pv->cl_mem_dst), CL_MEM_READ_WRITE, 3133440 ) )
+ {
+ hb_log("hb_create_buffer cl_outbuf Error\n");
+ return -1;
+ }
+ if( !hb_create_buffer( &(pv->cl_mem_prev), CL_MEM_READ_WRITE, 3133440 ) )
+ {
+ hb_log("hb_create_buffer cl_outbuf Error\n");
+ return -1;
+ }
+ if( !hb_create_buffer( &(pv->cl_mem_cur), CL_MEM_READ_WRITE, 3133440 ) )
+ {
+ hb_log("hb_create_buffer cl_outbuf Error\n");
+ return -1;
+ }
+ if( !hb_create_buffer( &(pv->cl_mem_next), CL_MEM_READ_WRITE, 3133440 ) )
+ {
+ hb_log("hb_create_buffer cl_outbuf Error\n");
+ return -1;
+ }
+ }
+#endif
+
int ii;
if( pv->mode & MODE_EEDI2 )
{
hb_buffer_close(&pv->mask_filtered);
hb_buffer_close(&pv->mask_temp);
+ clReleaseMemObject( pv->cl_mem_dst );
+ clReleaseMemObject( pv->cl_mem_prev );
+ clReleaseMemObject( pv->cl_mem_cur );
+ clReleaseMemObject( pv->cl_mem_next );
+
if( pv->mode & MODE_EEDI2 )
{
/* Cleanup eedi-half buffers */
/* deinterlace both fields if mcdeint is enabled without bob */
int frame, num_frames = 1;
- if (pv->mode & (MODE_MCDEINT | MODE_BOB))
- {
- num_frames = 2;
- }
- // Will need up to 2 buffers simultaneously
- int idx = 0;
- hb_buffer_t * o_buf[2] = {NULL,};
-
- /* Perform yadif filtering */
for( frame = 0; frame < num_frames; frame++ )
{
int parity = frame ^ tff ^ 1;
// tff for eedi2
pv->tff = !parity;
- if (o_buf[idx] == NULL)
- {
- o_buf[idx] = hb_video_buffer_init(in->f.width, in->f.height);
+ if(out == NULL){
+ out = hb_video_buffer_init(in->f.width, in->f.height);
}
if (frame)
else
pv->skip_comb_check = 0;
- yadif_filter(pv, o_buf[idx], parity, tff);
-
- // Unfortunately, all frames must be fed to mcdeint combed or
- // not since it maintains state that is updated by each frame.
- if (pv->mcdeint_mode >= 0)
- {
- if (o_buf[idx^1] == NULL)
+#ifdef USE_OPENCL
+ if(pv->use_opencl){
+ int is_combed;
+ if (!pv->skip_comb_check)
{
- o_buf[idx^1] = hb_video_buffer_init(in->f.width, in->f.height);
+ is_combed = pv->spatial_metric >= 0 ? comb_segmenter( pv ) : 1;
+ }
+ else
+ {
+ is_combed = pv->is_combed;
}
- /* Perform mcdeint filtering */
- mcdeint_filter(o_buf[idx^1], o_buf[idx], parity, &pv->mcdeint);
-
- // If frame was combed, we will use results from mcdeint
- // else we will use yadif result
- if (pv->is_combed)
- idx ^= 1;
- }
- // Add to list of output buffers (should be at most 2)
- if ((pv->mode & MODE_BOB) ||
- pv->is_combed == 0 ||
- frame == num_frames - 1)
- {
- if ( out == NULL )
+ if( is_combed == 1 )
{
- last = out = o_buf[idx];
+ pv->deinterlaced_frames++;
+ }
+ else if( is_combed == 2 )
+ {
+ pv->blended_frames++;
}
else
{
- last->next = o_buf[idx];
- last = last->next;
+ pv->unfiltered_frames++;
}
- last->next = NULL;
- // Indicate that buffer was consumed
- o_buf[idx] = NULL;
+ pv->is_combed = is_combed;
+ out->s = in->s;
+ out->f = in->f;
+
+ if(is_combed){
+ hb_write_opencl_frame_buffer(pv->cl_mem_prev, pv->ref[0]->plane[0].data, pv->ref[0]->plane[1].data, pv->ref[0]->plane[2].data, pv->ref[0]->plane[0].stride, pv->ref[0]->plane[1].stride, pv->ref[0]->plane[2].stride, in->f.height, 0);
- /* Copy buffered settings to output buffer settings */
- last->s = pv->ref[1]->s;
- idx ^= 1;
+ hb_write_opencl_frame_buffer(pv->cl_mem_cur, pv->ref[1]->plane[0].data, pv->ref[1]->plane[1].data, pv->ref[1]->plane[2].data, pv->ref[1]->plane[0].stride, pv->ref[1]->plane[1].stride, pv->ref[1]->plane[2].stride, in->f.height, 0);
- if ((pv->mode & MODE_MASK) && pv->spatial_metric >= 0 )
+ hb_write_opencl_frame_buffer(pv->cl_mem_next, pv->ref[2]->plane[0].data, pv->ref[2]->plane[1].data, pv->ref[2]->plane[2].data, pv->ref[2]->plane[0].stride, pv->ref[2]->plane[1].stride, pv->ref[2]->plane[2].stride, in->f.height, 0);
+
+ cl_yadif_filter(pv->cl_mem_dst,
+ pv->cl_mem_prev,
+ pv->cl_mem_cur,
+ pv->cl_mem_next,
+ parity,
+ tff,
+ pv->ref[1]->plane[0].stride,
+ pv->ref[1]->plane[1].stride,
+ pv->ref[1]->plane[0].stride,
+ pv->ref[1]->plane[1].stride,
+ pv->mode,
+ in->f.width,
+ in->f.height);
+
+ hb_read_opencl_frame_buffer(pv->cl_mem_dst, out->plane[0].data, out->plane[1].data, out->plane[2].data, pv->ref[1]->plane[0].stride, pv->ref[1]->plane[1].stride, pv->ref[1]->plane[2].stride, in->f.height);
+
+ hb_buffer_move_subs( out, pv->ref[1] );
+ }
+ else
{
- if (pv->mode == MODE_MASK ||
- ((pv->mode & MODE_MASK) && (pv->mode & MODE_FILTER)) ||
- ((pv->mode & MODE_MASK) && (pv->mode & MODE_GAMMA)) ||
- pv->is_combed)
- {
- apply_mask(pv, last);
- }
+ hb_buffer_copy(out, pv->ref[1]);
}
+ }else{
+ yadif_filter(pv, out, parity, tff);
}
- }
- // Copy subs only to first output buffer
- hb_buffer_move_subs( out, pv->ref[1] );
+#else
+ yadif_filter(pv, out, parity, tff);
+#endif
- hb_buffer_close(&o_buf[0]);
- hb_buffer_close(&o_buf[1]);
+ hb_buffer_t* mcdeint_out = NULL;
+ if (pv->mcdeint_mode >= 0)
+ {
+ mcdeint_out = hb_video_buffer_init(in->f.width, in->f.height);
+ mcdeint_filter(mcdeint_out, out, parity, &pv->mcdeint);
+ }
- /* if this frame was deinterlaced and bob mode is engaged, halve
- the duration of the saved timestamps. */
- if ((pv->mode & MODE_BOB) && pv->is_combed)
- {
- out->s.stop -= (out->s.stop - out->s.start) / 2LL;
- last->s.start = out->s.stop;
- last->s.new_chap = 0;
+ if(mcdeint_out != NULL){
+ last = mcdeint_out;
+ }else{
+ last = out;
+ }
+
+ last->s = pv->ref[1]->s;
+ if ((pv->mode & MODE_MASK) && pv->spatial_metric >= 0 )
+ {
+ if (pv->mode == MODE_MASK ||
+ ((pv->mode & MODE_MASK) && (pv->mode & MODE_FILTER)) ||
+ ((pv->mode & MODE_MASK) && (pv->mode & MODE_GAMMA)) ||
+ pv->is_combed)
+ {
+ apply_mask(pv, last);
+ }
+ }
}
*buf_out = out;
}\r
);\r
\r
+char *kernel_src_yadif_filter = KERNEL(\r
+ void filter_v6(\r
+ global unsigned char *dst,\r
+ global unsigned char *prev, \r
+ global unsigned char *cur, \r
+ global unsigned char *next,\r
+ int x,\r
+ int y,\r
+ int width,\r
+ int height,\r
+ int parity,\r
+ int inlinesize,\r
+ int outlinesize,\r
+ int inmode,\r
+ int uvflag\r
+ )\r
+ {\r
+\r
+ int flag = uvflag * (y >=height) * height; \r
+ int prefs = select(-(inlinesize), inlinesize,((y+1) - flag) <height);\r
+ int mrefs = select(inlinesize, -(inlinesize),y - flag);\r
+ int mode = select(inmode,2,(y - flag==1) || (y - flag + 2==height));\r
+ int score;\r
+ \r
+ global unsigned char *prev2 = parity ? prev : cur ;\r
+ global unsigned char *next2 = parity ? cur : next;\r
+ int index = x + y * inlinesize;\r
+ int outindex = x + y * outlinesize;\r
+ int c = cur[index + mrefs]; \r
+ int d = (prev2[index] + next2[index])>>1; \r
+ int e = cur[index + prefs]; \r
+ int temporal_diff0 = abs((prev2[index]) - (next2[index])); \r
+ int temporal_diff1 =(abs(prev[index + mrefs] - c) + abs(prev[index + prefs] - e) )>>1; \r
+ int temporal_diff2 =(abs(next[index + mrefs] - c) + abs(next[index + prefs] - e) )>>1; \r
+ int diff = max(max(temporal_diff0>>1, temporal_diff1), temporal_diff2); \r
+ int spatial_pred = (c+e)>>1; \r
+ int spatial_score = abs(cur[index + mrefs-1] - cur[index + prefs-1]) + abs(c-e) + abs(cur[index + mrefs+1] - cur[index + prefs+1]) - 1; \r
+ //check -1\r
+ score = abs(cur[index + mrefs-2] - cur[index + prefs])\r
+ + abs(cur[index + mrefs-1] - cur[index + prefs+1])\r
+ + abs(cur[index + mrefs] - cur[index + prefs+2]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs-1] + cur[index + prefs+1])>>1;\r
+ }\r
+ //check -2\r
+ score = abs(cur[index + mrefs-3] - cur[index + prefs+1])\r
+ + abs(cur[index + mrefs-2] - cur[index + prefs+2])\r
+ + abs(cur[index + mrefs-1] - cur[index + prefs+3]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs-2] + cur[index + prefs+2])>>1;\r
+ }\r
+ //check 1\r
+ score = abs(cur[index + mrefs] - cur[index + prefs-2])\r
+ + abs(cur[index + mrefs+1] - cur[index + prefs-1])\r
+ + abs(cur[index + mrefs+2] - cur[index + prefs]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs+1] + cur[index + prefs-1])>>1;\r
+ }\r
+ //check 2\r
+ score = abs(cur[index + mrefs+1] - cur[index + prefs-3])\r
+ + abs(cur[index + mrefs+2] - cur[index + prefs-2])\r
+ + abs(cur[index + mrefs+3] - cur[index + prefs-1]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs+2] + cur[index + prefs-2])>>1;\r
+ }\r
+ if (mode < 2)\r
+ { \r
+ int b = (prev2[index + (mrefs<<1)] + next2[index + (mrefs<<1)])>>1; \r
+ int f = (prev2[index + (prefs<<1)] + next2[index + (prefs<<1)])>>1; \r
+ int diffmax = max(max(d-e, d-c), min(b-c, f-e)); \r
+ int diffmin = min(min(d-e, d-c), max(b-c, f-e)); \r
+\r
+ diff = max(max(diff, diffmin), -diffmax); \r
+ } \r
+ if (spatial_pred > d + diff) \r
+ {\r
+ spatial_pred = d + diff; \r
+ }\r
+ else if (spatial_pred < d - diff) \r
+ {\r
+ spatial_pred = d - diff; \r
+ }\r
+\r
+ dst[outindex] = spatial_pred; \r
+ }\r
+\r
+ kernel void yadif_filter(\r
+ global unsigned char *dst,\r
+ global unsigned char *prev,\r
+ global unsigned char *cur,\r
+ global unsigned char *next,\r
+ int parity,\r
+ int inlinesizeY,\r
+ int inlinesizeUV,\r
+ int outlinesizeY,\r
+ int outlinesizeUV,\r
+ int mode)\r
+ {\r
+ int x=get_global_id(0);\r
+ int y=(get_global_id(1)<<1) + (!parity);\r
+ int width=(get_global_size(0)<<1)/3;\r
+ int height=get_global_size(1)<<1;\r
+ \r
+\r
+ global unsigned char *dst_Y=dst;\r
+ global unsigned char *dst_U=dst_Y+height*outlinesizeY;\r
+\r
+ global unsigned char *prev_Y=prev;\r
+ global unsigned char *prev_U=prev_Y+height*inlinesizeY;\r
+\r
+ global unsigned char *cur_Y=cur;\r
+ global unsigned char *cur_U=cur_Y+height*inlinesizeY;\r
+\r
+ global unsigned char *next_Y=next;\r
+ global unsigned char *next_U=next_Y+height*inlinesizeY;\r
+\r
+ if(x < width)\r
+ {\r
+ filter_v6(dst_Y,prev_Y,cur_Y,next_Y,x,y,width,height,parity,inlinesizeY,outlinesizeY,mode,0);\r
+ }\r
+ else\r
+ {\r
+ x = x - width;\r
+ filter_v6(dst_U,prev_U,cur_U,next_U,x,y,width>>1,height>>1,parity,inlinesizeUV,outlinesizeUV,mode,1);\r
+ }\r
+ }\r
+ );\r
+\r
#endif\r
ADD_KERNEL_CFG( 1, "frame_v_scale", NULL )\r
ADD_KERNEL_CFG( 2, "nv12toyuv", NULL )\r
ADD_KERNEL_CFG( 3, "scale_opencl", NULL )\r
+ ADD_KERNEL_CFG( 4, "yadif_filter", NULL )\r
\r
return 0;\r
}\r
deviceName,\r
NULL);\r
\r
- str = (char*)strstr(cl_file_name, ".cl");\r
+ str = (char*)strstr( cl_file_name, (char*)".cl" );\r
memcpy(cl_name, cl_file_name, str - cl_file_name);\r
cl_name[str - cl_file_name] = '\0';\r
sprintf(fileName, "./%s - %s.bin", cl_name, deviceName);\r
#else\r
int kernel_src_size = strlen( kernel_src_hscale ) + strlen( kernel_src_vscale ) \r
+ strlen( kernel_src_nvtoyuv ) + strlen( kernel_src_hscaleall ) \r
- + strlen( kernel_src_hscalefast ) + strlen( kernel_src_vscalealldither ) \r
- + strlen( kernel_src_vscaleallnodither ) + strlen( kernel_src_vscalefast );\r
+ + strlen( kernel_src_hscalefast ) + strlen( kernel_src_vscalealldither ) \r
+ + strlen( kernel_src_vscaleallnodither ) + strlen( kernel_src_vscalefast )\r
+ + strlen( kernel_src_yadif_filter );\r
source_str = (char*)malloc( kernel_src_size + 2 );\r
strcpy( source_str, kernel_src_hscale );\r
strcat( source_str, kernel_src_vscale );\r
strcat( source_str, kernel_src_vscalealldither );\r
strcat( source_str, kernel_src_vscaleallnodither );\r
strcat( source_str, kernel_src_vscalefast );\r
+ strcat( source_str, kernel_src_yadif_filter );\r
#endif\r
\r
source = source_str;\r
\r
/* create a cl program executable for all the devices specified */\r
if( !gpu_info->isUserCreated ) \r
- {\r
+ {\r
status = clBuildProgram( gpu_info->programs[idx], 1, gpu_info->devices,\r
build_option, NULL, NULL );\r
- }\r
+ }\r
else\r
- {\r
+ {\r
status = clBuildProgram( gpu_info->programs[idx], 1, &(gpu_info->dev),\r
build_option, NULL, NULL );\r
- }\r
+ }\r
\r
if( status != CL_SUCCESS )\r
{\r
if( !gpu_info->isUserCreated ) \r
- {\r
+ {\r
status = clGetProgramBuildInfo( gpu_info->programs[idx],\r
gpu_info->devices[0],\r
CL_PROGRAM_BUILD_LOG, 0, NULL, &length );\r
- }\r
- else\r
- {\r
- status = clGetProgramBuildInfo( gpu_info->programs[idx],\r
+ }\r
+ else\r
+ {\r
+ status = clGetProgramBuildInfo( gpu_info->programs[idx],\r
gpu_info->dev,\r
CL_PROGRAM_BUILD_LOG, 0, NULL, &length );\r
- }\r
+ }\r
\r
if( status != CL_SUCCESS )\r
{\r
}\r
\r
if( !gpu_info->isUserCreated )\r
- {\r
+ {\r
status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->devices[0],\r
CL_PROGRAM_BUILD_LOG, length, buildLog, &length );\r
- }\r
+ }\r
else\r
- {\r
+ {\r
status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->dev,\r
CL_PROGRAM_BUILD_LOG, length, buildLog, &length );\r
- }\r
+ }\r
\r
fd1 = fopen( "kernel-build.log", "w+" );\r
if( fd1 != NULL ) {\r
status = hb_get_kernel_env_and_func( kernel_name, &env, &function );\r
strcpy( env.kernel_name, kernel_name );\r
if( status == 1 ) \r
- {\r
+ {\r
return(function( userdata, &env ));\r
- }\r
+ }\r
\r
return(0);\r
}\r
{\r
int status = 0;\r
if( MAX_CLKERNEL_NUM <= 0 )\r
- {\r
+ {\r
return 1;\r
- }\r
+ }\r
\r
if((argc > MAX_CLFILE_NUM) || (argc<0))\r
- {\r
+ {\r
return 1;\r
- }\r
+ }\r
\r
if( !isInited )\r
{\r
\r
return 1;\r
}\r
+\r
+int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)\r
+{\r
+ int status = clEnqueueCopyBuffer(gpu_env.command_queue,\r
+ src_buffer,\r
+ dst_buffer,\r
+ src_offset, dst_offset, cb,\r
+ 0, 0, 0);\r
+ if( status != CL_SUCCESS )\r
+ { \r
+ av_log(NULL,AV_LOG_ERROR, "hb_read_opencl_buffer error '%d'\n", status ); \r
+ return 0; \r
+ }\r
+ return 1;\r
+}\r
+\r
+int hb_read_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height)\r
+{\r
+\r
+ int chrH = -(-height >> 1);\r
+ unsigned char *temp = (unsigned char *)av_malloc(sizeof(uint8_t) * (linesize0 * height + linesize1 * chrH * 2));\r
+ if(hb_read_opencl_buffer(cl_inBuf,temp,sizeof(uint8_t)*(linesize0 + linesize1)*height))\r
+ {\r
+ memcpy(Ybuf,temp,linesize0 * height);\r
+ memcpy(Ubuf,temp + linesize0 * height,linesize1 *chrH);\r
+ memcpy(Vbuf,temp + linesize0 * height + linesize1 * chrH,linesize2 * chrH);\r
+ \r
+ }\r
+ av_free(temp);\r
+\r
+ return 1;\r
+}\r
+\r
+int hb_write_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height,int offset)\r
+{\r
+ int status;\r
+ void *mapped = clEnqueueMapBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE,CL_MAP_WRITE, 0, sizeof(uint8_t) * (linesize0 + linesize1)*height + offset, 0, NULL, NULL, NULL );\r
+ uint8_t *temp = (uint8_t *)mapped;\r
+ temp += offset;\r
+ memcpy(temp,Ybuf,sizeof(uint8_t) * linesize0 * height);\r
+ memcpy(temp + sizeof(uint8_t) * linesize0 * height,Ubuf,sizeof(uint8_t) * linesize1 * height/2);\r
+ memcpy(temp + sizeof(uint8_t) * (linesize0 * height + linesize1 * height/2),Vbuf,sizeof(uint8_t) * linesize2 * height/2);\r
+ clEnqueueUnmapMemObject(gpu_env.command_queue, cl_inBuf, mapped, 0, NULL, NULL );\r
+ return 1;\r
+}\r
+\r
+cl_command_queue hb_get_command_queue()\r
+{\r
+ return gpu_env.command_queue;\r
+}\r
+\r
+cl_context hb_get_context()\r
+{\r
+ return gpu_env.context;\r
+}\r
#endif\r