From: sr55 Date: Fri, 6 Sep 2013 16:34:09 +0000 (+0000) Subject: Changes to OpenCL scaling to reduce memory copies and improve performance. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=f35cf60ca4a4b94f352afc22cc2252ffe7ff67d1;p=handbrake Changes to OpenCL scaling to reduce memory copies and improve performance. Video frames are stored in mapped OpenCL buffers rather than (directly) malloced memory. Changed scaling to use a single kernel rather than two. Changed scaling method to bicubic. Disabled scaling during dxva video extraction. Scaling can not be done that early in the pipeline. Patch by Michael Wootton git-svn-id: svn://svn.handbrake.fr/HandBrake/branches/opencl@5769 b64f7644-9d1e-0410-96f1-a4d463321fa5 --- diff --git a/libhb/common.h b/libhb/common.h index 231cdb777..7e5512514 100644 --- a/libhb/common.h +++ b/libhb/common.h @@ -361,7 +361,7 @@ struct hb_job_s int chapter_start; int chapter_end; - /* Include chapter marker track in mp4? */ + /* Include chapter marker track in mp4? */ int chapter_markers; /* Picture settings: @@ -1031,29 +1031,22 @@ extern hb_work_object_t hb_reader; typedef struct hb_oclscale_s { #ifdef USE_OPENCL - // input buffer for running horizontal kernel. output buffer of running horizontal kernel. outpuf buffer of running vertiacla kernel - cl_mem h_in_buf; - cl_mem h_out_buf; - cl_mem v_out_buf; - // horizontal coefficent buffer for Y U and V plane, hroizontal source index for Y,U and V plane - cl_mem h_coeff_y; - cl_mem h_coeff_uv; - cl_mem h_index_y; - cl_mem h_index_uv; - // vertical coefficent buffer for Y U and V plane, vertical source index for Y,U and V plane - cl_mem v_coeff_y; - cl_mem v_coeff_uv; - cl_mem v_index_y; - cl_mem v_index_uv; + int initialized; + // bicubic scale weights + cl_mem bicubic_x_weights; + cl_mem bicubic_y_weights; + cl_float xscale; + cl_float yscale; + int width; + int height; // horizontal scaling and vertical scaling kernel handle - cl_kernel h_kernel; - cl_kernel v_kernel; + cl_kernel m_kernel; int use_ocl_mem; // 0 use host memory. 1 use gpu oclmem #endif } hb_oclscale_t; #ifdef USE_OPENCL -int hb_ocl_scale( cl_mem in_buf, uint8_t *in_data, uint8_t *out_data, int in_w, int in_h, int out_w, int out_h, hb_oclscale_t *os ); +int hb_ocl_scale( hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os ); #endif #ifdef USE_OPENCL diff --git a/libhb/cropscale.c b/libhb/cropscale.c index 0e791f3a6..c7d7d9948 100644 --- a/libhb/cropscale.c +++ b/libhb/cropscale.c @@ -140,17 +140,8 @@ static void hb_crop_scale_close( hb_filter_object_t * filter ) if( pv->job->use_opencl && pv->os ) { - CL_FREE( pv->os->h_in_buf ); - CL_FREE( pv->os->h_out_buf ); - CL_FREE( pv->os->v_out_buf ); - CL_FREE( pv->os->h_coeff_y ); - CL_FREE( pv->os->h_coeff_uv ); - CL_FREE( pv->os->h_index_y ); - CL_FREE( pv->os->h_index_uv ); - CL_FREE( pv->os->v_coeff_y ); - CL_FREE( pv->os->v_coeff_uv ); - CL_FREE( pv->os->v_index_y ); - CL_FREE( pv->os->v_index_uv ); + CL_FREE( pv->os->bicubic_x_weights ); + CL_FREE( pv->os->bicubic_y_weights ); free( pv->os ); } #endif @@ -199,76 +190,14 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) pv->crop[0], pv->crop[2] ); #ifdef USE_OPENCL - - if( pv->job->use_opencl ) + // Use bicubic OpenCL scaling when selected and when downsampling < 4:1; + if ((pv->job->use_opencl) && (pv->width_out * 4 > pv->width_in) && (in->cl.buffer != NULL) && (out->cl.buffer != NULL)) { - int w = in->f.width - ( pv->crop[2] + pv->crop[3] ); - int h = in->f.height - ( pv->crop[0] + pv->crop[1] ); - uint8_t *tmp_in = malloc( w * h * 3 / 2 ); - uint8_t *tmp_out = malloc( pv->width_out * pv->height_out * 3 / 2 ); - if( pic_crop.data[0] || pic_crop.data[1] || pic_crop.data[2] || pic_crop.data[3] ) - { - int i; - for( i = 0; i < h >> 1; i++ ) - { - memcpy( tmp_in + ( ( i << 1 ) + 0 ) * w, pic_crop.data[0] + ( ( i << 1 ) + 0 ) * pic_crop.linesize[0], w ); - memcpy( tmp_in + ( ( i << 1 ) + 1 ) * w, pic_crop.data[0] + ( ( i << 1 ) + 1 ) * pic_crop.linesize[0], w ); - memcpy( tmp_in + ( w * h ) + i * ( w >> 1 ), pic_crop.data[1] + i * pic_crop.linesize[1], w >> 1 ); - memcpy( tmp_in + ( w * h ) + ( ( w * h ) >> 2 ) + i * ( w >> 1 ), pic_crop.data[2] + i * pic_crop.linesize[2], w >> 1 ); - } - } - else - { - memcpy( tmp_in, pic_crop.data[0], w * h ); - memcpy( tmp_in + w * h, pic_crop.data[1], (w * h) >> 2 ); - memcpy( tmp_in + w * h + ((w * h) >> 2), pic_crop.data[2], (w * h) >> 2 ); - } - hb_ocl_scale( NULL, tmp_in, tmp_out, w, h, out->f.width, out->f.height, pv->os ); - w = out->plane[0].stride; - h = out->plane[0].height; - uint8_t *dst = out->plane[0].data; - copy_plane( dst, tmp_out, w, pv->width_out, h ); - w = out->plane[1].stride; - h = out->plane[1].height; - dst = out->plane[1].data; - copy_plane( dst, tmp_out + pv->width_out * pv->height_out, w, pv->width_out >> 1, h ); - w = out->plane[2].stride; - h = out->plane[2].height; - dst = out->plane[2].data; - copy_plane( dst, tmp_out + pv->width_out * pv->height_out +( ( pv->width_out * pv->height_out ) >> 2 ), w, pv->width_out >> 1, h ); - free( tmp_out ); - free( tmp_in ); + hb_ocl_scale(in, out, pv->crop, pv->os); } else { - if( !pv->context || - pv->width_in != in->f.width || - pv->height_in != in->f.height || - pv->pix_fmt != in->f.fmt ) - { - // Something changed, need a new scaling context. - if( pv->context ) - sws_freeContext( pv->context ); - pv->context = hb_sws_get_context( - in->f.width - (pv->crop[2] + pv->crop[3]), - in->f.height - (pv->crop[0] + pv->crop[1]), - in->f.fmt, - out->f.width, out->f.height, out->f.fmt, - SWS_LANCZOS | SWS_ACCURATE_RND ); - pv->width_in = in->f.width; - pv->height_in = in->f.height; - pv->pix_fmt = in->f.fmt; - } - - // Scale pic_crop into pic_render according to the - // context set up above - sws_scale(pv->context, - (const uint8_t* const*)pic_crop.data, - pic_crop.linesize, - 0, in->f.height - (pv->crop[0] + pv->crop[1]), - pic_out.data, pic_out.linesize); - } -#else +#endif if ( !pv->context || pv->width_in != in->f.width || pv->height_in != in->f.height || @@ -296,6 +225,8 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in ) pic_crop.linesize, 0, in->f.height - (pv->crop[0] + pv->crop[1]), pic_out.data, pic_out.linesize); +#ifdef USE_OPENCL + } #endif out->s = in->s; hb_buffer_move_subs( out, in ); diff --git a/libhb/decavcodec.c b/libhb/decavcodec.c index 409039e81..461f87b66 100644 --- a/libhb/decavcodec.c +++ b/libhb/decavcodec.c @@ -106,7 +106,7 @@ struct hb_work_private_s #ifdef USE_HWD hb_va_dxva2_t *dxva2; uint8_t *dst_frame; - hb_oclscale_t *os; + hb_oclscale_t *opencl_scale; #endif hb_audio_resample_t *resample; }; @@ -306,22 +306,9 @@ static void closePrivData( hb_work_private_t ** ppv ) } hb_audio_resample_free(pv->resample); #ifdef USE_HWD - if ( pv->os ) + if ( pv->opencl_scale ) { -#ifdef USE_OPENCL - CL_FREE( pv->os->h_in_buf ); - CL_FREE( pv->os->h_out_buf ); - CL_FREE( pv->os->v_out_buf ); - CL_FREE( pv->os->h_coeff_y ); - CL_FREE( pv->os->h_coeff_uv ); - CL_FREE( pv->os->h_index_y ); - CL_FREE( pv->os->h_index_uv ); - CL_FREE( pv->os->v_coeff_y ); - CL_FREE( pv->os->v_coeff_uv ); - CL_FREE( pv->os->v_index_y ); - CL_FREE( pv->os->v_index_uv ); -#endif - free( pv->os ); + free( pv->opencl_scale ); } if ( pv->dxva2 ) { @@ -634,24 +621,15 @@ static hb_buffer_t *copy_frame( hb_work_private_t *pv, AVFrame *frame ) hb_buffer_t *buf; int ww, hh; - if( (w > pv->job->width || h > pv->job->height) && (pv->job->use_opencl) - && (pv->job->use_decomb == 0) && (pv->job->use_detelecine == 0) ) - { - buf = hb_video_buffer_init( pv->job->width, pv->job->height ); - ww = pv->job->width; - hh = pv->job->height; - } - else - { - buf = hb_video_buffer_init( w, h ); - ww = w; - hh = h; - } + buf = hb_video_buffer_init( w, h ); + ww = w; + hh = h; + if( !pv->dst_frame ) { pv->dst_frame = malloc( ww * hh * 3 / 2 ); } - if( hb_va_extract( pv->dxva2, pv->dst_frame, frame, pv->job->width, pv->job->height, pv->job->title->crop, pv->os, pv->job->use_opencl, pv->job->use_decomb, pv->job->use_detelecine ) == HB_WORK_ERROR ) + if( hb_va_extract( pv->dxva2, pv->dst_frame, frame, pv->job->width, pv->job->height, pv->job->title->crop, pv->opencl_scale, pv->job->use_opencl, pv->job->use_decomb, pv->job->use_detelecine ) == HB_WORK_ERROR ) { hb_log( "hb_va_Extract failed!!!!!!" ); } @@ -1205,8 +1183,8 @@ static int decavcodecvInit( hb_work_object_t * w, hb_job_t * job ) pv->context->get_buffer = get_frame_buf_hwd; pv->context->release_buffer = hb_ffmpeg_release_frame_buf; pv->context->get_format = hb_ffmpeg_get_format; - pv->os = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) ); - memset( pv->os, 0, sizeof( hb_oclscale_t ) ); + pv->opencl_scale = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) ); + memset( pv->opencl_scale, 0, sizeof( hb_oclscale_t ) ); pv->threads = 1; } diff --git a/libhb/fifo.c b/libhb/fifo.c index df83340fd..2d792199e 100644 --- a/libhb/fifo.c +++ b/libhb/fifo.c @@ -242,7 +242,14 @@ void hb_buffer_pool_free( void ) if( b->data ) { freed += b->alloc; - free( b->data ); +#ifdef USE_OPENCL + if (b->cl.buffer != NULL) { + if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0) + hb_log("bad free: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data); + } + else +#endif + free( b->data ); } free( b ); count++; @@ -273,7 +280,7 @@ static hb_fifo_t *size_to_pool( int size ) return NULL; } -hb_buffer_t * hb_buffer_init( int size ) +hb_buffer_t * hb_buffer_init_internal( int size , int needsMapped ) { hb_buffer_t * b; // Certain libraries (hrm ffmpeg) expect buffers passed to them to @@ -288,6 +295,18 @@ hb_buffer_t * hb_buffer_init( int size ) { b = hb_fifo_get( buffer_pool ); +#ifdef USE_OPENCL + if (b && (needsMapped != 0) && (b->cl.buffer == NULL)) + { + // We need a mapped OpenCL buffer and that is not what we got out of the pool. + // Ditch it. It will get replaced with what we need. + if (b->data) + free(b->data); + free(b); + b = NULL; + } +#endif + if( b ) { /* @@ -295,10 +314,21 @@ hb_buffer_t * hb_buffer_init( int size ) * didn't have to do this. */ uint8_t *data = b->data; +#ifdef USE_OPENCL + cl_mem buffer = b->cl.buffer; + cl_event last_event = b->cl.last_event; + int loc = b->cl.buffer_location; +#endif + memset( b, 0, sizeof(hb_buffer_t) ); b->alloc = buffer_pool->buffer_size; b->size = size; b->data = data; +#ifdef USE_OPENCL + b->cl.buffer = buffer; + b->cl.last_event = last_event; + b->cl.buffer_location = loc; +#endif return( b ); } } @@ -317,6 +347,20 @@ hb_buffer_t * hb_buffer_init( int size ) if (size) { +#ifdef USE_OPENCL + b->cl.last_event = NULL; + b->cl.buffer_location = HOST; + + if (needsMapped != 0) + { + int status; + status = hb_cl_create_mapped_buffer(&b->cl.buffer, &b->data, b->alloc); + //hb_log("buf: %.16x -> buffer %.16x map %.16x size %d", b, b->cl.buffer, b->data, size); + } + else { + b->cl.buffer = NULL; +#endif + #if defined( SYS_DARWIN ) || defined( SYS_FREEBSD ) || defined( SYS_MINGW ) b->data = malloc( b->alloc ); #elif defined( SYS_CYGWIN ) @@ -325,6 +369,10 @@ hb_buffer_t * hb_buffer_init( int size ) #else b->data = memalign( 16, b->alloc ); #endif +#ifdef USE_OPENCL + } +#endif + if( !b->data ) { hb_log( "out of memory" ); @@ -338,6 +386,11 @@ hb_buffer_t * hb_buffer_init( int size ) return b; } +hb_buffer_t * hb_buffer_init( int size ) +{ + return hb_buffer_init_internal(size, 0); +} + void hb_buffer_realloc( hb_buffer_t * b, int size ) { if ( size > b->alloc || b->data == NULL ) @@ -355,6 +408,7 @@ void hb_buffer_realloc( hb_buffer_t * b, int size ) void hb_buffer_reduce( hb_buffer_t * b, int size ) { + if ( size < b->alloc / 8 || b->data == NULL ) { hb_buffer_t * tmp = hb_buffer_init( size ); @@ -368,6 +422,7 @@ void hb_buffer_reduce( hb_buffer_t * b, int size ) hb_buffer_t * hb_buffer_dup( const hb_buffer_t * src ) { + hb_buffer_t * buf; if ( src == NULL ) @@ -461,8 +516,11 @@ hb_buffer_t * hb_frame_buffer_init( int pix_fmt, int width, int height ) hb_image_height_stride( pix_fmt, height, p ); } } - +#ifdef USE_OPENCL + buf = hb_buffer_init_internal( size , hb_use_buffers() ); +#else buf = hb_buffer_init( size ); +#endif if( buf == NULL ) return NULL; @@ -514,12 +572,22 @@ void hb_buffer_swap_copy( hb_buffer_t *src, hb_buffer_t *dst ) uint8_t *data = dst->data; int size = dst->size; int alloc = dst->alloc; +#ifdef USE_OPENCL + cl_mem buffer = dst->cl.buffer; + cl_event last_event = dst->cl.last_event; + int loc = dst->cl.buffer_location; +#endif *dst = *src; src->data = data; src->size = size; src->alloc = alloc; +#ifdef USE_OPENCL + src->cl.buffer = buffer; + src->cl.last_event = last_event; + src->cl.buffer_location = loc; +#endif } // Frees the specified buffer list. @@ -547,7 +615,14 @@ void hb_buffer_close( hb_buffer_t ** _b ) // free the buf if( b->data ) { - free( b->data ); +#ifdef USE_OPENCL + if (b->cl.buffer != NULL) { + if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0) + hb_log("bad free2: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data); + } + else +#endif + free( b->data ); hb_lock(buffers.lock); buffers.allocated -= b->alloc; hb_unlock(buffers.lock); diff --git a/libhb/hb.c b/libhb/hb.c index a55575f8e..f36c95311 100644 --- a/libhb/hb.c +++ b/libhb/hb.c @@ -436,6 +436,11 @@ hb_handle_t * hb_init( int verbose, int update_check ) h->interjob = calloc( sizeof( hb_interjob_t ), 1 ); + /* opencl */ +#ifdef USE_OPENCL + //hb_opencl_init(); // FIXME: Ensure gui instances call this or hb_get_opencl_env() during startup if needed. +#endif + /* Start library thread */ hb_log( "hb_init: starting libhb thread" ); h->die = 0; diff --git a/libhb/internal.h b/libhb/internal.h index 4a2f775a9..adddbe58b 100644 --- a/libhb/internal.h +++ b/libhb/internal.h @@ -114,6 +114,15 @@ struct hb_buffer_s int size; } plane[4]; // 3 Color components + alpha +#ifdef USE_OPENCL + struct cl_data + { + cl_mem buffer; + cl_event last_event; + enum { HOST, DEVICE } buffer_location; + } cl; +#endif + // PICTURESUB subtitle packets: // Video packets (after processing by the hb_sync_video work-object): diff --git a/libhb/oclscale.c b/libhb/oclscale.c index ca916c166..904183340 100644 --- a/libhb/oclscale.c +++ b/libhb/oclscale.c @@ -13,298 +13,259 @@ #ifdef USE_OPENCL -#include -#include "common.h" -#include "openclwrapper.h" -#define MaxFilterLength 16 -#define FILTER_LEN 4 - -inline double hb_fit_gauss_kernel( double x ) -{ - double powNum = -1 * M_PI; - - powNum *= x; - - powNum *= x; - - return exp( powNum ); -} - -/** - * Using gaussian algorithm to calculate the scale filter - */ -static void hb_set_gauss_interpolation( float *pcoeff, int *pmappedindex, int targetdatalength, int srcdatalength, int filterLength, float bias ) -{ - int i, j; - - float gausskernel[MaxFilterLength]; - - int half = filterLength / 2; - - float scalerate = (float)(srcdatalength) / targetdatalength; - - for( i = 0; i < targetdatalength; ++i ) - { - float flindex = i * scalerate + bias; - - if( flindex > (srcdatalength - 1)) - { - flindex -= (int)(flindex - (srcdatalength - 1)); - } - - int srcindex = (int)(flindex); - - float t = flindex - srcindex; - - for( j = 0; j < (int)half; j++ ) - { - gausskernel[j] = (float)hb_fit_gauss_kernel((half - j) - 1 + t ); - } - - for( j = 0; j < (int)half; j++ ) - { - gausskernel[half + j] = (float)hb_fit_gauss_kernel( j + 1 - t ); - } - - while( srcindex < (int)half - 1 ) - { - /* -1 0 1 2 - * M1 S P1 P2 - * - * if srcindex is 0, M1 and S will be the same sample. To keep the - * convolution kernel from having to check for edge conditions, move - * srcindex to 1, then slide down the coefficients - */ - srcindex += 1; - - gausskernel[0] += gausskernel[1]; - - for( j = 1; j < filterLength - 1; j++ ) - { - gausskernel[j] = gausskernel[j + 1]; - } - - gausskernel[filterLength - 1] = 0; - } - - while( srcindex >= srcdatalength - half ) - { - /* If srcindex is near the edge, shift down srcindex and slide up - * the coefficients - */ - srcindex -= 1; - - gausskernel[3] += gausskernel[2]; - - for( j = filterLength - 2; j > 0; j-- ) - { - gausskernel[j] = gausskernel[j - 1]; - } - - gausskernel[0] = 0; - } - - *pmappedindex++ = srcindex - half + 1; - - // Store normalized Gauss kernel - - float sumtemp = 0; - - for( j = 0; j < filterLength; ++j ) - { - sumtemp += gausskernel[j]; - } - - for( j = 0; j < filterLength; ++j ) - { - pcoeff[targetdatalength * j + i] = gausskernel[j] / sumtemp; - } - } -} - -/** -* executive scale using opencl -* get filter args -* create output buffer -* create horizontal filter buffer -* create vertical filter buffer -* create kernels -*/ -int hb_ocl_scale_func( void **data, KernelEnv *kenv ) -{ - cl_int status; - - uint8_t *in_frame = data[0]; - uint8_t *out_frame = data[1]; - int in_frame_w = (int)data[2]; - int in_frame_h = (int)data[3]; - int out_frame_w = (int)data[4]; - int out_frame_h = (int)data[5]; - hb_oclscale_t *os = data[6]; - - if( os->use_ocl_mem ) - os->h_in_buf = data[0]; - int h_filter_len = FILTER_LEN; - int v_filter_len = FILTER_LEN; - //it will make the psnr lower when filter length is 4 in the condition that the video width is shorter than 960 and width is shorter than 544,so we set the filter length to 2 - if( out_frame_w <= 960 && out_frame_h <= 544 ) - { - h_filter_len>>=1; - v_filter_len>>=1; - } - if( !os->h_out_buf ) - { - hb_log( "OpenCL: Scaling With OpenCL" ); - //malloc filter args - float *hf_y, *hf_uv, *vf_y, *vf_uv; - int *hi_y, *hi_uv, *vi_y, *vi_uv; - hf_y = (float*)malloc( sizeof(float)*out_frame_w * h_filter_len ); - hf_uv = (float*)malloc( sizeof(float)*(out_frame_w>>1) * h_filter_len ); - hi_y = (int*)malloc( sizeof(int)*out_frame_w ); - hi_uv = (int*)malloc( sizeof(int)*(out_frame_w>>1)); - vf_y = (float*)malloc( sizeof(float)*out_frame_h * v_filter_len ); - vf_uv = (float*)malloc( sizeof(float)*(out_frame_h>>1) * v_filter_len ); - vi_y = (int*)malloc( sizeof(int)*out_frame_h ); - vi_uv = (int*)malloc( sizeof(int)*(out_frame_h>>1) ); - //get filter args - hb_set_gauss_interpolation( hf_y, hi_y, out_frame_w, in_frame_w, h_filter_len, 0 ); - hb_set_gauss_interpolation( hf_uv, hi_uv, out_frame_w>>1, in_frame_w>>1, h_filter_len, 0 ); - hb_set_gauss_interpolation( vf_y, vi_y, out_frame_h, in_frame_h, v_filter_len, 0 ); - hb_set_gauss_interpolation( vf_uv, vi_uv, out_frame_h>>1, in_frame_h>>1, v_filter_len, 0 ); - //create output buffer - if( !os->use_ocl_mem ) - { - CREATEBUF( os->h_in_buf, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, sizeof(uint8_t)* in_frame_w * in_frame_h*3/2 ); - } - CREATEBUF( os->h_out_buf, CL_MEM_WRITE_ONLY, sizeof(uint8_t) * out_frame_w * in_frame_h*3/2 ); - CREATEBUF( os->v_out_buf, CL_MEM_WRITE_ONLY, sizeof(uint8_t) * out_frame_w * out_frame_h*3/2 ); - //create horizontal filter buffer - CREATEBUF( os->h_coeff_y, CL_MEM_READ_ONLY, sizeof(float) * out_frame_w * h_filter_len ); - CREATEBUF( os->h_coeff_uv, CL_MEM_READ_ONLY, sizeof(float) * (out_frame_w>>1) * h_filter_len ); - CREATEBUF( os->h_index_y, CL_MEM_READ_ONLY, sizeof(int) * out_frame_w ); - CREATEBUF( os->h_index_uv, CL_MEM_READ_ONLY, sizeof(int) * (out_frame_w>>1) ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_coeff_y, CL_TRUE, 0, sizeof(float) * out_frame_w * h_filter_len, hf_y, 0, NULL, NULL ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_coeff_uv, CL_TRUE, 0, sizeof(float) * (out_frame_w>>1) * h_filter_len, hf_uv, 0, NULL, NULL ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_index_y, CL_TRUE, 0, sizeof(int) * out_frame_w, hi_y, 0, NULL, NULL ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_index_uv, CL_TRUE, 0, sizeof(int) * (out_frame_w>>1), hi_uv, 0, NULL, NULL ); - //create vertical filter buffer - CREATEBUF( os->v_coeff_y, CL_MEM_READ_ONLY, sizeof(float) * out_frame_h * v_filter_len ); - CREATEBUF( os->v_coeff_uv, CL_MEM_READ_ONLY, sizeof(float) * (out_frame_h>>1) * v_filter_len ); - CREATEBUF( os->v_index_y, CL_MEM_READ_ONLY, sizeof(int) * out_frame_h ); - CREATEBUF( os->v_index_uv, CL_MEM_READ_ONLY, sizeof(int) * (out_frame_h>>1) ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_coeff_y, CL_TRUE, 0, sizeof(float) * out_frame_h * v_filter_len, vf_y, 0, NULL, NULL ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_coeff_uv, CL_TRUE, 0, sizeof(float) * (out_frame_h>>1) * v_filter_len, vf_uv, 0, NULL, NULL ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_index_y, CL_TRUE, 0, sizeof(int) * out_frame_h, vi_y, 0, NULL, NULL ); - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_index_uv, CL_TRUE, 0, sizeof(int) * (out_frame_h>>1), vi_uv, 0, NULL, NULL ); - //create horizontal kernel - os->h_kernel = clCreateKernel( kenv->program, "frame_h_scale", NULL ); - OCLCHECK( clSetKernelArg, os->h_kernel, 1, sizeof(cl_mem), &os->h_coeff_y ); - OCLCHECK( clSetKernelArg, os->h_kernel, 2, sizeof(cl_mem), &os->h_coeff_uv ); - OCLCHECK( clSetKernelArg, os->h_kernel, 3, sizeof(cl_mem), &os->h_index_y ); - OCLCHECK( clSetKernelArg, os->h_kernel, 4, sizeof(cl_mem), &os->h_index_uv ); - OCLCHECK( clSetKernelArg, os->h_kernel, 6, sizeof(int), &in_frame_w ); - OCLCHECK( clSetKernelArg, os->h_kernel, 7, sizeof(int), &h_filter_len ); - //create vertical kernel - os->v_kernel = clCreateKernel( kenv->program, "frame_v_scale", NULL ); - OCLCHECK( clSetKernelArg, os->v_kernel, 1, sizeof(cl_mem), &os->v_coeff_y ); - OCLCHECK( clSetKernelArg, os->v_kernel, 2, sizeof(cl_mem), &os->v_coeff_uv ); - OCLCHECK( clSetKernelArg, os->v_kernel, 3, sizeof(cl_mem), &os->v_index_y ); - OCLCHECK( clSetKernelArg, os->v_kernel, 4, sizeof(cl_mem), &os->v_index_uv ); - OCLCHECK( clSetKernelArg, os->v_kernel, 6, sizeof(int), &in_frame_h ); - OCLCHECK( clSetKernelArg, os->v_kernel, 7, sizeof(int), &v_filter_len ); - free( hf_y ); - free( hf_uv ); - free( vf_y ); - free( vf_uv ); - free( hi_y ); - free( hi_uv ); - free( vi_y ); - free( vi_uv ); - } - //start horizontal scaling kernel - - if( !os->use_ocl_mem ) - { - if( kenv->isAMD ) - { - char *mapped = clEnqueueMapBuffer( kenv->command_queue, os->h_in_buf, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, sizeof(uint8_t) * in_frame_w * in_frame_h*3/2, 0, NULL, NULL, NULL ); - memcpy( mapped, in_frame, sizeof(uint8_t) * in_frame_w * in_frame_h*3/2 ); - clEnqueueUnmapMemObject( kenv->command_queue, os->h_in_buf, mapped, 0, NULL, NULL ); - } - else - { - OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_in_buf, CL_TRUE, 0, sizeof(uint8_t) * in_frame_w * in_frame_h * 3/2, in_frame, 0, NULL, NULL ); - } - } - - kenv->kernel = os->h_kernel; - size_t dims[2]; - dims[0] = out_frame_w; - dims[1] = in_frame_h; - OCLCHECK( clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), &os->h_in_buf ); - OCLCHECK( clSetKernelArg, kenv->kernel, 5, sizeof(cl_mem), &os->h_out_buf ); - OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, dims, NULL, 0, NULL, NULL ); - //start vertical scaling kernel - - kenv->kernel = os->v_kernel; - dims[0] = out_frame_w; - dims[1] = out_frame_h; - OCLCHECK( clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), &os->h_out_buf ); - OCLCHECK( clSetKernelArg, kenv->kernel, 5, sizeof(cl_mem), &os->v_out_buf ); - OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, dims, NULL, 0, NULL, NULL ); - OCLCHECK( clEnqueueReadBuffer, kenv->command_queue, os->v_out_buf, CL_TRUE, 0, sizeof(uint8_t) * out_frame_w * out_frame_h * 3/2, out_frame, 0, NULL, NULL ); - - return 1; -} - -/** -* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm -* parameter: +#include +#include "common.h" +#include "openclwrapper.h" +#define FILTER_LEN 4 + +#define _A -0.5f + +cl_float cubic(cl_float x) +{ + if (x < 0) + x = -x; + + if (x < 1) + return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1; + else if (x < 2) + return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A); + else + return 0; +} + + +cl_float *hb_bicubic_weights(cl_float scale, int length) +{ + cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4); + + int i; // C rocks + cl_float *out = weights; + for (i = 0; i < length; ++i) + { + cl_float x = i / scale; + cl_float dx = x - (int)x; + *out++ = cubic(-dx - 1.0f); + *out++ = cubic(-dx); + *out++ = cubic(-dx + 1.0f); + *out++ = cubic(-dx + 2.0f); + } + return weights; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv); + +/** +* executive scale using opencl +* get filter args +* create output buffer +* create horizontal filter buffer +* create vertical filter buffer +* create kernels +*/ +int hb_ocl_scale_func( void **data, KernelEnv *kenv ) +{ + cl_int status; + + cl_mem in_buf = data[0]; + cl_mem out_buf = data[1]; + int crop_top = data[2]; + int crop_bottom = data[3]; + int crop_left = data[4]; + int crop_right = data[5]; + int in_frame_w = (int)data[6]; + int in_frame_h = (int)data[7]; + int out_frame_w = (int)data[8]; + int out_frame_h = (int)data[9]; + hb_oclscale_t *os = data[10]; + hb_buffer_t *in = data[11]; + hb_buffer_t *out = data[12]; + + if (os->initialized == 0) + { + hb_log( "Scaling With OpenCL" ); + if (kenv->isAMD != 0) + hb_log( "Using Zero Copy"); + // create the block kernel + cl_int status; + os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status ); + + os->initialized = 1; + } + + { + // Use the new kernel + cl_event events[5]; + int eventCount = 0; + + if (kenv->isAMD == 0) { + status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]); + status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]); + } + + cl_int srcPlaneOffset0 = in->plane[0].data - in->data; + cl_int srcPlaneOffset1 = in->plane[1].data - in->data; + cl_int srcPlaneOffset2 = in->plane[2].data - in->data; + cl_int srcRowWords0 = in->plane[0].stride; + cl_int srcRowWords1 = in->plane[1].stride; + cl_int srcRowWords2 = in->plane[2].stride; + cl_int dstPlaneOffset0 = out->plane[0].data - out->data; + cl_int dstPlaneOffset1 = out->plane[1].data - out->data; + cl_int dstPlaneOffset2 = out->plane[2].data - out->data; + cl_int dstRowWords0 = out->plane[0].stride; + cl_int dstRowWords1 = out->plane[1].stride; + cl_int dstRowWords2 = out->plane[2].stride; + + if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) { + srcPlaneOffset0 += crop_left + crop_top * srcRowWords0; + srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1; + srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2; + in_frame_w = in_frame_w - crop_right - crop_left; + in_frame_h = in_frame_h - crop_bottom - crop_top; + } + + cl_float xscale = (out_frame_w * 1.0f) / in_frame_w; + cl_float yscale = (out_frame_h * 1.0f) / in_frame_h; + setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv); + + OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf ); + OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale ); + OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 ); + OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w ); // FIXME: type mismatch + OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h ); // + OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights ); + OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights ); + + size_t workOffset[] = { 0, 0, 0 }; + size_t globalWorkSize[] = { 1, 1, 1 }; + size_t localWorkSize[] = { 1, 1, 1 }; + + int xgroups = (out_frame_w + 63) / 64; + int ygroups = (out_frame_h + 15) / 16; + + localWorkSize[0] = 64; + localWorkSize[1] = 1; + localWorkSize[2] = 1; + globalWorkSize[0] = xgroups * 64; + globalWorkSize[1] = ygroups; + globalWorkSize[2] = 3; + + OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] ); + ++eventCount; + + if (kenv->isAMD == 0) { + in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status); + out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status); + eventCount += 2; + } + + clFlush(kenv->command_queue); + clWaitForEvents(eventCount, &events[0]); + int i; + for (i = 0; i < eventCount; ++i) + clReleaseEvent(events[i]); + } + + return 1; +} + +int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) { + cl_int status; + if (os->xscale != xscale || os->width < width) { + cl_float *xweights = hb_bicubic_weights(xscale, width); + CL_FREE(os->bicubic_x_weights); + CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL ); + os->width = width; + os->xscale = xscale; + free(xweights); + } + + if ((os->yscale != yscale) || (os->height < height)) { + cl_float *yweights = hb_bicubic_weights(yscale, height); + CL_FREE(os->bicubic_y_weights); + CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4); + OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL ); + os->height = height; + os->yscale = yscale; + free(yweights); + } + return 0; +} + + +/** +* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm +* parameter: * inputFrameBuffer: the source video frame opencl buffer * outputdata: the destination video frame buffer * inputWidth: the width of the source video frame * inputHeight: the height of the source video frame -* outputWidth: the width of destination video frame -* outputHeight: the height of destination video frame -*/ -int hb_ocl_scale( cl_mem in_buf, uint8_t *in_data, uint8_t *out_data, int in_w, int in_h, int out_w, int out_h, hb_oclscale_t *os ) -{ - void *data[7]; - static int init_flag = 0; - if( init_flag == 0 ) - { - int st = hb_register_kernel_wrapper( "frame_h_scale", hb_ocl_scale_func ); - if( !st ) - { - hb_log( "OpenCL: Register kernel[%s] failed", "frame_h_scale" ); - return 0; - } - init_flag++; - } - - if( in_data==NULL ) - { - data[0] = in_buf; - os->use_ocl_mem = 1; - } - else - { - data[0] = in_data; - os->use_ocl_mem = 0; - } - - data[1] = out_data; - data[2] = (void*)in_w; - data[3] = (void*)in_h; - data[4] = (void*)out_w; - data[5] = (void*)out_h; - data[6] = os; - - if( !hb_run_kernel( "frame_h_scale", data ) ) - { - hb_log( "OpenCL: Run kernel[%s] failed", "frame_scale" ); - } - - return 0; -} -#endif +* outputWidth: the width of destination video frame +* outputHeight: the height of destination video frame +*/ + + +static int s_scale_init_flag = 0; + +int do_scale_init() +{ + if ( s_scale_init_flag==0 ) + { + int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func ); + if( !st ) + { + hb_log( "register kernel[%s] failed", "frame_scale" ); + return 0; + } + s_scale_init_flag++; + } + return 1; +} + + +int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os) +{ + void *data[13]; + + if (do_scale_init() == 0) + return 0; + + data[0] = in->cl.buffer; + data[1] = out->cl.buffer; + data[2] = (void*)(crop[0]); + data[3] = (void*)(crop[1]); + data[4] = (void*)(crop[2]); + data[5] = (void*)(crop[3]); + data[6] = (void*)(in->f.width); + data[7] = (void*)(in->f.height); + data[8] = (void*)(out->f.width); + data[9] = (void*)(out->f.height); + data[10] = os; + data[11] = in; + data[12] = out; + + if( !hb_run_kernel( "frame_scale", data ) ) + hb_log( "run kernel[%s] failed", "frame_scale" ); + return 0; +} + + + + + +#endif diff --git a/libhb/openclkernels.h b/libhb/openclkernels.h index e34748614..f324aafdd 100644 --- a/libhb/openclkernels.h +++ b/libhb/openclkernels.h @@ -541,12 +541,100 @@ char *kernel_src_vscalefast = KERNEL ( val1 = (src[cfilterPos[h] * dstChrStride + w] + local_up_dither[ (w + 3) & 7] ) >> 7; dst[h * dstChrStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1)); - } - ); - -char *kernel_src_yadif_filter = KERNEL( - void filter_v6( - global unsigned char *dst, + } + ); + +char *kernel_src_scale = KERNEL ( + +__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void frame_scale(__global uchar *dst, + __global const uchar *src, + const float xscale, + const float yscale, + const int srcPlaneOffset0, + const int srcPlaneOffset1, + const int srcPlaneOffset2, + const int dstPlaneOffset0, + const int dstPlaneOffset1, + const int dstPlaneOffset2, + const int srcRowWords0, + const int srcRowWords1, + const int srcRowWords2, + const int dstRowWords0, + const int dstRowWords1, + const int dstRowWords2, + const int srcWidth, + const int srcHeight, + const int dstWidth, + const int dstHeight, + __global const float4* restrict xweights, + __global const float4* restrict yweights + ) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Abort work items outside the dst image bounds. + + if ((get_group_id(0) * 64 >= (dstWidth >> ((z == 0) ? 0 : 1))) || (get_group_id(1) * 16 >= (dstHeight >> ((z == 0) ? 0 : 1)))) + return; + + const int srcPlaneOffset = (z == 0) ? srcPlaneOffset0 : ((z == 1) ? srcPlaneOffset1 : srcPlaneOffset2); + const int dstPlaneOffset = (z == 0) ? dstPlaneOffset0 : ((z == 1) ? dstPlaneOffset1 : dstPlaneOffset2); + const int srcRowWords = (z == 0) ? srcRowWords0: ((z == 1) ? srcRowWords1 : srcRowWords2); + const int dstRowWords = (z == 0) ? dstRowWords0: ((z == 1) ? dstRowWords1 : dstRowWords2); + + __local uchar pixels[64 * 36]; + const int localRowPixels = 64; + const int groupHeight = 16; // src pixel height output by the workgroup + const int ypad = 2; + const int localx = get_local_id(0); + + const int globalStartRow = floor((get_group_id(1) * groupHeight) / yscale); + const int globalRowCount = ceil(groupHeight / yscale) + 2 * ypad; + + float4 weights = xweights[x]; + int4 woffs = floor(x / xscale); + woffs += (int4)(-1, 0, 1, 2); + woffs = clamp(woffs, 0, (srcWidth >> ((z == 0) ? 0 : 1)) - 1); + const int maxy = (srcHeight >> ((z == 0) ? 0 : 1)) - 1; + + // Scale x from global into LDS + + for (int i = 0; i <= globalRowCount; ++i) { + int4 offs = srcPlaneOffset + clamp(globalStartRow - ypad + i, 0, maxy) * srcRowWords; + offs += woffs; + pixels[localx + i * localRowPixels] = convert_uchar(clamp(round(dot(weights, + (float4)(src[offs.x], src[offs.y], src[offs.z], src[offs.w]))), 0.0f, 255.0f)); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + // Scale y from LDS into global + + if (x >= dstWidth >> ((z == 0) ? 0 : 1)) + return; + + int off = dstPlaneOffset + x + (get_group_id(1) * groupHeight) * dstRowWords; + + for (int i = 0; i < groupHeight; ++i) { + if (y >= dstHeight >> ((z == 0) ? 0 : 1)) + break; + int localy = floor((get_group_id(1) * groupHeight + i) / yscale); + localy = localy - globalStartRow + ypad; + int loff = localx + localy * localRowPixels; + dst[off] = convert_uchar(clamp(round(dot(yweights[get_group_id(1) * groupHeight + i], + (float4)(pixels[loff - localRowPixels], pixels[loff], pixels[loff + localRowPixels] + , pixels[loff + localRowPixels * 2]))), 0.0f, 255.0f)); + off += dstRowWords; + } +} +); + + +char *kernel_src_yadif_filter = KERNEL( + void filter_v6( + global unsigned char *dst, global unsigned char *prev, global unsigned char *cur, global unsigned char *next, diff --git a/libhb/openclwrapper.c b/libhb/openclwrapper.c index 01aef90d1..9a7e9888d 100644 --- a/libhb/openclwrapper.c +++ b/libhb/openclwrapper.c @@ -79,12 +79,13 @@ typedef struct char kernelName[MAX_KERNEL_NAME_LEN+1]; char * kernelStr; }hb_kernel_node; - -static GPUEnv gpu_env; -static int isInited = 0; -static hb_kernel_node gKernels[MAX_KERNEL_NUM]; - -#define ADD_KERNEL_CFG( idx, s, p ){\ + +static GPUEnv gpu_env; +static int isInited = 0; +static int useBuffers = 0; +static hb_kernel_node gKernels[MAX_KERNEL_NUM]; + +#define ADD_KERNEL_CFG( idx, s, p ){\ strcpy( gKernels[idx].kernelName, s );\ gKernels[idx].kernelStr = p;\ strcpy( gpu_env.kernel_names[idx], s );\ @@ -162,23 +163,21 @@ int hb_confirm_gpu_type() /** * hb_regist_opencl_kernel - */ -int hb_regist_opencl_kernel() -{ - if( !gpu_env.isUserCreated ) - memset( &gpu_env, 0, sizeof(gpu_env) ); - - gpu_env.file_count = 0; //argc; - gpu_env.kernel_count = 0UL; - - ADD_KERNEL_CFG( 0, "frame_h_scale", NULL ) - ADD_KERNEL_CFG( 1, "frame_v_scale", NULL ) - ADD_KERNEL_CFG( 2, "nv12toyuv", NULL ) - ADD_KERNEL_CFG( 3, "scale_opencl", NULL ) - ADD_KERNEL_CFG( 4, "yadif_filter", NULL ) - - return 0; -} + */ +int hb_regist_opencl_kernel() +{ + //if( !gpu_env.isUserCreated ) + // memset( &gpu_env, 0, sizeof(gpu_env) ); + //Comment for posterity: When in doubt just zero out a structure full of pointers to allocated resources. + + gpu_env.file_count = 0; //argc; + gpu_env.kernel_count = 0UL; + + ADD_KERNEL_CFG( 0, "frame_scale", NULL ) + ADD_KERNEL_CFG( 1, "yadif_filter", NULL ) + + return 0; +} /** * hb_regist_opencl_kernel @@ -512,23 +511,30 @@ int hb_release_kernel( KernelEnv * env ) } /** - * hb_init_opencl_env - * @param gpu_info - - */ -int hb_init_opencl_env( GPUEnv *gpu_info ) -{ - size_t length; + * hb_init_opencl_env + * @param gpu_info - + */ + +static int init_once = 0; +int hb_init_opencl_env( GPUEnv *gpu_info ) +{ + size_t length; cl_int status; cl_uint numPlatforms, numDevices; cl_platform_id *platforms; cl_context_properties cps[3]; char platformName[100]; - unsigned int i; - void *handle = INVALID_HANDLE_VALUE; - - /* - * Have a look at the available platforms. - */ + unsigned int i; + void *handle = INVALID_HANDLE_VALUE; + + + if (init_once != 0) + return 0; + else + init_once = 1; + /* + * Have a look at the available platforms. + */ if( !gpu_info->isUserCreated ) { status = clGetPlatformIDs( 0, NULL, &numPlatforms ); @@ -793,26 +799,22 @@ int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info, #ifdef USE_EXTERNAL_KERNEL status = hb_convert_to_string( filename, &source_str, gpu_info, idx ); - if( status == 0 ) - return(0); -#else - int kernel_src_size = strlen( kernel_src_hscale ) + strlen( kernel_src_vscale ) - + strlen( kernel_src_nvtoyuv ) + strlen( kernel_src_hscaleall ) - + strlen( kernel_src_hscalefast ) + strlen( kernel_src_vscalealldither ) - + strlen( kernel_src_vscaleallnodither ) + strlen( kernel_src_vscalefast ) - + strlen( kernel_src_yadif_filter ); - source_str = (char*)malloc( kernel_src_size + 2 ); - strcpy( source_str, kernel_src_hscale ); - strcat( source_str, kernel_src_vscale ); - strcat( source_str, kernel_src_nvtoyuv ); - strcat( source_str, kernel_src_hscaleall ); - strcat( source_str, kernel_src_hscalefast ); - strcat( source_str, kernel_src_vscalealldither ); - strcat( source_str, kernel_src_vscaleallnodither ); - strcat( source_str, kernel_src_vscalefast ); - strcat( source_str, kernel_src_yadif_filter ); -#endif - + if( status == 0 ) + return(0); +#else + int kernel_src_size = strlen(kernel_src_scale) + strlen(kernel_src_yadif_filter); + +// char *scale_src; +// status = hb_convert_to_string("./scale_kernels.cl", &scale_src, gpu_info, idx); +// if (status != 0) +// kernel_src_size += strlen(scale_src); + + source_str = (char*)malloc( kernel_src_size + 2 ); + strcpy( source_str, kernel_src_scale ); +// strcat( source_str, scale_src ); // + strcat( source_str, kernel_src_yadif_filter ); +#endif + source = source_str; source_size[0] = strlen( source ); @@ -947,13 +949,13 @@ int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info, } strcpy( gpu_env.kernelSrcFile[idx], filename ); - - if (binaryExisted != 1) - { - hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename); - } - - gpu_info->file_count += 1; + + if (binaryExisted != 1) + { + //hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename); + } + + gpu_info->file_count += 1; return(1); } @@ -1042,12 +1044,13 @@ int hb_init_opencl_run_env( int argc, char **argv, const char *build_option ) if( status == 0 || gpu_env.kernel_count == 0 ) { return(1); - - } - - isInited = 1; - } - + + } + + useBuffers = 1; + isInited = 1; + } + return(0); } @@ -1164,12 +1167,46 @@ int hb_read_opencl_buffer( cl_mem cl_inBuf, unsigned char *outbuf, int size ) return 0; } - return 1; -} - -int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb) -{ - int status = clEnqueueCopyBuffer(gpu_env.command_queue, + return 1; +} + +int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size) +{ + int status; + int flags = CL_MEM_ALLOC_HOST_PTR; + //cl_event event; + *mem = clCreateBuffer(gpu_env.context, flags, size, NULL, &status); + *addr = clEnqueueMapBuffer(gpu_env.command_queue, *mem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL/*&event*/, &status); + + //hb_log("\t **** context: %.8x cmdqueue: %.8x cl_mem: %.8x mapaddr: %.8x size: %d status: %d", gpu_env.context, gpu_env.command_queue, mem, addr, size, status); + + return (status == CL_SUCCESS) ? 1 : 0; +} + +int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr) +{ + cl_event event; + int status = clEnqueueUnmapMemObject(gpu_env.command_queue, mem, addr, 0, NULL, &event); + if (status == CL_SUCCESS) + clWaitForEvents(1, &event); + else + hb_log("hb_free_mapped_buffer: error %d", status); + return (status == CL_SUCCESS) ? 1 : 0; +} + +void hb_opencl_init() +{ + hb_get_opencl_env(); +} + +int hb_use_buffers() +{ + return useBuffers; +} + +int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb) +{ + int status = clEnqueueCopyBuffer(gpu_env.command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb, diff --git a/libhb/openclwrapper.h b/libhb/openclwrapper.h index 4244c4454..8436c3a9c 100644 --- a/libhb/openclwrapper.h +++ b/libhb/openclwrapper.h @@ -67,15 +67,23 @@ int hb_init_opencl_attr( OpenCLEnv * env ); // create kernel object by a kernel name on the specified opencl run time indicated by env parameter int hb_create_kernel( char * kernelname, KernelEnv * env ); -// release kernel object which is generated by calling the hb_create_kernel api -int hb_release_kernel( KernelEnv * env ); - -int hb_get_opencl_env(); - -int hb_create_buffer(cl_mem *cl_Buf,int flags,int size); - -int hb_read_opencl_buffer(cl_mem cl_inBuf,unsigned char *outbuf,int size); - -int hb_confirm_gpu_type(); -#endif -#endif +// release kernel object which is generated by calling the hb_create_kernel api +int hb_release_kernel( KernelEnv * env ); + +void hb_opencl_init(); + +int hb_get_opencl_env(); + +int hb_create_buffer(cl_mem *cl_Buf,int flags,int size); + +int hb_read_opencl_buffer(cl_mem cl_inBuf,unsigned char *outbuf,int size); + +int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size); + +int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr); + +int hb_use_buffers(); + +int hb_confirm_gpu_type(); +#endif +#endif diff --git a/libhb/vadxva2.c b/libhb/vadxva2.c index f148f197e..4137b63c6 100644 --- a/libhb/vadxva2.c +++ b/libhb/vadxva2.c @@ -633,32 +633,10 @@ int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w lock.Pitch, lock.Pitch, }; -#ifdef USE_OPENCL - if( ( dxva2->width > job_w || dxva2->height > job_h ) && (use_opencl) ) - { - hb_ocl_nv12toyuv( plane, lock.Pitch, dxva2->width, dxva2->height, crop, dxva2, use_decomb, use_detelecine ); - if ( use_decomb || use_detelecine ) - hb_read_opencl_buffer( dxva2->cl_mem_yuv, dst, dxva2->width * dxva2->height * 3 / 2 ); - else - { - static int init_flag = 0; - if(init_flag == 0) - { - scale_init( dxva2->width - crop[2] - crop[3], dxva2->height - crop[0] - crop[1], job_w, job_h ); - init_flag = 1; - } - - hb_init_filter( dxva2->cl_mem_yuv, dxva2->width - crop[2] - crop[3], dxva2->height - crop[0] - crop[1], dst, job_w, job_h, crop ); - } - } - else -#endif - { - hb_copy_from_nv12( dst, plane, pitch, dxva2->width, dxva2->height ); - } + + hb_copy_from_nv12( dst, plane, pitch, dxva2->width, dxva2->height ); } IDirect3DSurface9_UnlockRect( d3d ); - return HB_WORK_OK; }