Changes to OpenCL scaling to reduce memory copies and improve performance.

author sr55 <sr55.hb@outlook.com>

Fri, 6 Sep 2013 16:34:09 +0000 (16:34 +0000)

committer sr55 <sr55.hb@outlook.com>

Fri, 6 Sep 2013 16:34:09 +0000 (16:34 +0000)
author sr55 <sr55.hb@outlook.com>
Fri, 6 Sep 2013 16:34:09 +0000 (16:34 +0000)
committer sr55 <sr55.hb@outlook.com>
Fri, 6 Sep 2013 16:34:09 +0000 (16:34 +0000)
diff --git a/libhb/common.h b/libhb/common.h

index 231cdb777e629b1acb47d0e7b0e8345e5c2e87f4..7e5512514dddd0a982cf2476ca582da5c3a3181e 100644 (file)
--- a/libhb/common.h
+++ b/libhb/common.h
@@ -361,7 +361,7 @@ struct hb_job_s
      int             chapter_start;
      int             chapter_end;
  
-       /* Include chapter marker track in mp4? */
+    /* Include chapter marker track in mp4? */
      int             chapter_markers;
  
      /* Picture settings:
@@ -1031,29 +1031,22 @@ extern hb_work_object_t hb_reader;
  typedef struct hb_oclscale_s
  {
  #ifdef USE_OPENCL
-    // input buffer for running horizontal kernel. output buffer of running horizontal kernel. outpuf buffer of running vertiacla kernel
-    cl_mem h_in_buf;
-    cl_mem h_out_buf;
-    cl_mem v_out_buf;
-    // horizontal coefficent buffer for Y U and V plane, hroizontal source index for Y,U and V plane
-    cl_mem h_coeff_y;
-    cl_mem h_coeff_uv;
-    cl_mem h_index_y;
-    cl_mem h_index_uv;
-    // vertical coefficent buffer for Y U and V plane, vertical source index for Y,U and V plane
-    cl_mem v_coeff_y;
-    cl_mem v_coeff_uv;
-    cl_mem v_index_y;
-    cl_mem v_index_uv;
+    int initialized;
+    // bicubic scale weights
+    cl_mem bicubic_x_weights;
+    cl_mem bicubic_y_weights;
+    cl_float xscale;
+    cl_float yscale;
+    int width;
+    int height;
      // horizontal scaling and vertical scaling kernel handle
-    cl_kernel h_kernel;
-    cl_kernel v_kernel;
+    cl_kernel m_kernel;
      int use_ocl_mem; // 0 use host memory. 1 use gpu oclmem
  #endif
  } hb_oclscale_t;
  
  #ifdef USE_OPENCL
-int hb_ocl_scale( cl_mem in_buf, uint8_t *in_data, uint8_t *out_data, int in_w, int in_h, int out_w, int out_h, hb_oclscale_t *os );
+int hb_ocl_scale( hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os );
  #endif
  
  #ifdef USE_OPENCL
diff --git a/libhb/cropscale.c b/libhb/cropscale.c

index 0e791f3a6546af7a97c0522db10714cdb972d687..c7d7d994824a0e4900d9d29d9739862d68520413 100644 (file)
--- a/libhb/cropscale.c
+++ b/libhb/cropscale.c
@@ -140,17 +140,8 @@ static void hb_crop_scale_close( hb_filter_object_t * filter )
  
      if( pv->job->use_opencl && pv->os )
      {
-        CL_FREE( pv->os->h_in_buf );
-        CL_FREE( pv->os->h_out_buf );
-        CL_FREE( pv->os->v_out_buf );
-        CL_FREE( pv->os->h_coeff_y );
-        CL_FREE( pv->os->h_coeff_uv );
-        CL_FREE( pv->os->h_index_y );
-        CL_FREE( pv->os->h_index_uv );
-        CL_FREE( pv->os->v_coeff_y );
-        CL_FREE( pv->os->v_coeff_uv );
-        CL_FREE( pv->os->v_index_y );
-        CL_FREE( pv->os->v_index_uv );
+        CL_FREE( pv->os->bicubic_x_weights );
+        CL_FREE( pv->os->bicubic_y_weights );
          free( pv->os );
      }
  #endif
@@ -199,76 +190,14 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in )
                       pv->crop[0], pv->crop[2] );
  
  #ifdef USE_OPENCL
-
-    if( pv->job->use_opencl )
+    // Use bicubic OpenCL scaling when selected and when downsampling < 4:1;
+    if ((pv->job->use_opencl) && (pv->width_out * 4 > pv->width_in) && (in->cl.buffer != NULL) && (out->cl.buffer != NULL))
      {
-        int w = in->f.width - ( pv->crop[2] + pv->crop[3] );
-        int h = in->f.height - ( pv->crop[0] + pv->crop[1] );
-        uint8_t *tmp_in = malloc( w * h * 3 / 2 );
-        uint8_t *tmp_out = malloc( pv->width_out * pv->height_out * 3 / 2 );
-        if( pic_crop.data[0] || pic_crop.data[1] || pic_crop.data[2] || pic_crop.data[3] )
-        {
-            int i;
-            for( i = 0; i < h >> 1; i++ )
-            {
-                memcpy( tmp_in + ( ( i << 1 ) + 0 ) * w, pic_crop.data[0] + ( ( i << 1 ) + 0 ) * pic_crop.linesize[0], w );
-                memcpy( tmp_in + ( ( i << 1 ) + 1 ) * w, pic_crop.data[0] + ( ( i << 1 ) + 1 ) * pic_crop.linesize[0], w );
-                memcpy( tmp_in + ( w * h ) + i * ( w >> 1 ), pic_crop.data[1] + i * pic_crop.linesize[1], w >> 1 );
-                memcpy( tmp_in + ( w * h ) + ( ( w * h ) >> 2 ) + i * ( w >> 1 ), pic_crop.data[2] + i * pic_crop.linesize[2], w >> 1 );
-            }
-        }
-        else
-        {
-            memcpy( tmp_in, pic_crop.data[0], w * h );
-            memcpy( tmp_in + w * h, pic_crop.data[1], (w * h) >> 2 );
-            memcpy( tmp_in + w * h + ((w * h) >> 2), pic_crop.data[2], (w * h) >> 2 );
-        }
-        hb_ocl_scale( NULL, tmp_in, tmp_out, w, h, out->f.width, out->f.height, pv->os );
-        w = out->plane[0].stride;
-        h = out->plane[0].height;
-        uint8_t *dst = out->plane[0].data;
-        copy_plane( dst, tmp_out, w, pv->width_out, h );
-        w = out->plane[1].stride;
-        h = out->plane[1].height;
-        dst = out->plane[1].data;
-        copy_plane( dst, tmp_out + pv->width_out * pv->height_out, w, pv->width_out >> 1, h );
-        w = out->plane[2].stride;
-        h = out->plane[2].height;
-        dst = out->plane[2].data;
-        copy_plane( dst, tmp_out + pv->width_out * pv->height_out +( ( pv->width_out * pv->height_out ) >> 2 ), w, pv->width_out >> 1, h );
-        free( tmp_out );
-        free( tmp_in );
+        hb_ocl_scale(in, out, pv->crop, pv->os);
      }
      else
      {
-        if( !pv->context ||
-            pv->width_in   != in->f.width  ||
-            pv->height_in  != in->f.height ||
-            pv->pix_fmt != in->f.fmt )
-        {
-            // Something changed, need a new scaling context.
-            if( pv->context )
-                sws_freeContext( pv->context );
-                pv->context = hb_sws_get_context(
-                                        in->f.width  - (pv->crop[2] + pv->crop[3]),
-                                        in->f.height - (pv->crop[0] + pv->crop[1]),
-                                        in->f.fmt,
-                                        out->f.width, out->f.height, out->f.fmt,
-                                        SWS_LANCZOS | SWS_ACCURATE_RND );
-                pv->width_in = in->f.width;
-                pv->height_in = in->f.height;
-                pv->pix_fmt = in->f.fmt;
-        }
-
-        // Scale pic_crop into pic_render according to the
-        // context set up above
-        sws_scale(pv->context,
-                  (const uint8_t* const*)pic_crop.data,
-                  pic_crop.linesize,
-                  0, in->f.height - (pv->crop[0] + pv->crop[1]),
-                  pic_out.data,  pic_out.linesize);
-    }
-#else
+#endif
      if ( !pv->context ||
           pv->width_in   != in->f.width  ||
           pv->height_in  != in->f.height ||
@@ -296,6 +225,8 @@ static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in )
                pic_crop.linesize,
                0, in->f.height - (pv->crop[0] + pv->crop[1]),
                pic_out.data,  pic_out.linesize);
+#ifdef USE_OPENCL
+    }
  #endif
      out->s = in->s;
      hb_buffer_move_subs( out, in );
diff --git a/libhb/decavcodec.c b/libhb/decavcodec.c

index 409039e81bb51aa622935ae06ad8028c652fa04d..461f87b667e54df5d03380fafbd9931d4af5273b 100644 (file)
--- a/libhb/decavcodec.c
+++ b/libhb/decavcodec.c
@@ -106,7 +106,7 @@ struct hb_work_private_s
  #ifdef USE_HWD
      hb_va_dxva2_t   *dxva2;
      uint8_t         *dst_frame;
-    hb_oclscale_t   *os;
+    hb_oclscale_t   *opencl_scale;
  #endif
      hb_audio_resample_t *resample;
  };
@@ -306,22 +306,9 @@ static void closePrivData( hb_work_private_t ** ppv )
          }
          hb_audio_resample_free(pv->resample);
  #ifdef USE_HWD
-        if ( pv->os )
+        if ( pv->opencl_scale )
          {
-#ifdef USE_OPENCL
-            CL_FREE( pv->os->h_in_buf );
-            CL_FREE( pv->os->h_out_buf );
-            CL_FREE( pv->os->v_out_buf );
-            CL_FREE( pv->os->h_coeff_y );
-            CL_FREE( pv->os->h_coeff_uv );
-            CL_FREE( pv->os->h_index_y );
-            CL_FREE( pv->os->h_index_uv );
-            CL_FREE( pv->os->v_coeff_y );
-            CL_FREE( pv->os->v_coeff_uv );
-            CL_FREE( pv->os->v_index_y );
-            CL_FREE( pv->os->v_index_uv );
-#endif
-            free( pv->os );
+            free( pv->opencl_scale );
          }
          if ( pv->dxva2 )
          {
@@ -634,24 +621,15 @@ static hb_buffer_t *copy_frame( hb_work_private_t *pv, AVFrame *frame )
          hb_buffer_t *buf;
          int ww, hh;
  
-        if( (w > pv->job->width || h > pv->job->height) && (pv->job->use_opencl) 
-            && (pv->job->use_decomb == 0) && (pv->job->use_detelecine == 0) )
-        {
-            buf = hb_video_buffer_init( pv->job->width, pv->job->height );
-            ww = pv->job->width;
-            hh = pv->job->height;
-        }
-        else
-        {
-            buf = hb_video_buffer_init( w, h );
-            ww = w;
-            hh = h;
-        }
+        buf = hb_video_buffer_init( w, h );
+        ww = w;
+        hh = h;
+
          if( !pv->dst_frame )
          {
              pv->dst_frame = malloc( ww * hh * 3 / 2 );
          }
-        if( hb_va_extract( pv->dxva2, pv->dst_frame, frame, pv->job->width, pv->job->height, pv->job->title->crop, pv->os, pv->job->use_opencl, pv->job->use_decomb, pv->job->use_detelecine ) == HB_WORK_ERROR )
+        if( hb_va_extract( pv->dxva2, pv->dst_frame, frame, pv->job->width, pv->job->height, pv->job->title->crop, pv->opencl_scale, pv->job->use_opencl, pv->job->use_decomb, pv->job->use_detelecine ) == HB_WORK_ERROR )
          {
              hb_log( "hb_va_Extract failed!!!!!!" );
          }
@@ -1205,8 +1183,8 @@ static int decavcodecvInit( hb_work_object_t * w, hb_job_t * job )
                  pv->context->get_buffer = get_frame_buf_hwd;
                  pv->context->release_buffer = hb_ffmpeg_release_frame_buf;
                  pv->context->get_format = hb_ffmpeg_get_format;
-                pv->os = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) );
-                memset( pv->os, 0, sizeof( hb_oclscale_t ) );
+                pv->opencl_scale = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) );
+                memset( pv->opencl_scale, 0, sizeof( hb_oclscale_t ) );
                  pv->threads = 1;
  
              }
diff --git a/libhb/fifo.c b/libhb/fifo.c

index df83340fd3f3aef4bb32dadb11bb9fefb1509d98..2d792199ef7dba5f6c8d4f85b9fdd02d97da10a2 100644 (file)
--- a/libhb/fifo.c
+++ b/libhb/fifo.c
@@ -242,7 +242,14 @@ void hb_buffer_pool_free( void )
              if( b->data )
              {
                  freed += b->alloc;
-                free( b->data );
+#ifdef USE_OPENCL
+                if (b->cl.buffer != NULL) {
+                    if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0)
+                        hb_log("bad free: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data);
+                }
+                else
+#endif
+                    free( b->data );
              }
              free( b );
              count++;
@@ -273,7 +280,7 @@ static hb_fifo_t *size_to_pool( int size )
      return NULL;
  }
  
-hb_buffer_t * hb_buffer_init( int size )
+hb_buffer_t * hb_buffer_init_internal( int size , int needsMapped )
  {
      hb_buffer_t * b;
      // Certain libraries (hrm ffmpeg) expect buffers passed to them to
@@ -288,6 +295,18 @@ hb_buffer_t * hb_buffer_init( int size )
      {
          b = hb_fifo_get( buffer_pool );
  
+#ifdef USE_OPENCL
+        if (b && (needsMapped != 0) && (b->cl.buffer == NULL))
+        {
+            // We need a mapped OpenCL buffer and that is not what we got out of the pool.
+            // Ditch it.  It will get replaced with what we need.
+            if (b->data)
+                free(b->data);
+            free(b);
+            b = NULL;
+        }
+#endif
+
          if( b )
          {
              /*
@@ -295,10 +314,21 @@ hb_buffer_t * hb_buffer_init( int size )
               * didn't have to do this.
               */
              uint8_t *data = b->data;
+#ifdef USE_OPENCL
+            cl_mem buffer = b->cl.buffer;
+            cl_event last_event = b->cl.last_event;
+            int loc = b->cl.buffer_location;
+#endif
+
              memset( b, 0, sizeof(hb_buffer_t) );
              b->alloc = buffer_pool->buffer_size;
              b->size = size;
              b->data = data;
+#ifdef USE_OPENCL
+            b->cl.buffer = buffer;
+            b->cl.last_event = last_event;
+            b->cl.buffer_location = loc;
+#endif
              return( b );
          }
      }
@@ -317,6 +347,20 @@ hb_buffer_t * hb_buffer_init( int size )
  
      if (size)
      {
+#ifdef USE_OPENCL
+        b->cl.last_event = NULL;
+        b->cl.buffer_location = HOST;
+
+        if (needsMapped != 0)
+        {
+            int status;
+            status  = hb_cl_create_mapped_buffer(&b->cl.buffer, &b->data, b->alloc);
+            //hb_log("buf: %.16x -> buffer %.16x map %.16x size %d", b, b->cl.buffer, b->data, size);
+        }
+        else {
+            b->cl.buffer = NULL;
+#endif
+
  #if defined( SYS_DARWIN ) || defined( SYS_FREEBSD ) || defined( SYS_MINGW )
          b->data  = malloc( b->alloc );
  #elif defined( SYS_CYGWIN )
@@ -325,6 +369,10 @@ hb_buffer_t * hb_buffer_init( int size )
  #else
          b->data  = memalign( 16, b->alloc );
  #endif
+#ifdef USE_OPENCL
+        }
+#endif
+
          if( !b->data )
          {
              hb_log( "out of memory" );
@@ -338,6 +386,11 @@ hb_buffer_t * hb_buffer_init( int size )
      return b;
  }
  
+hb_buffer_t * hb_buffer_init( int size )
+{
+    return hb_buffer_init_internal(size, 0);
+}
+
  void hb_buffer_realloc( hb_buffer_t * b, int size )
  {
      if ( size > b->alloc || b->data == NULL )
@@ -355,6 +408,7 @@ void hb_buffer_realloc( hb_buffer_t * b, int size )
  
  void hb_buffer_reduce( hb_buffer_t * b, int size )
  {
+
      if ( size < b->alloc / 8 || b->data == NULL )
      {
          hb_buffer_t * tmp = hb_buffer_init( size );
@@ -368,6 +422,7 @@ void hb_buffer_reduce( hb_buffer_t * b, int size )
  
  hb_buffer_t * hb_buffer_dup( const hb_buffer_t * src )
  {
+
      hb_buffer_t * buf;
  
      if ( src == NULL )
@@ -461,8 +516,11 @@ hb_buffer_t * hb_frame_buffer_init( int pix_fmt, int width, int height )
                      hb_image_height_stride( pix_fmt, height, p );
          }
      }
-
+#ifdef USE_OPENCL
+    buf = hb_buffer_init_internal( size , hb_use_buffers() );
+#else
      buf = hb_buffer_init( size );
+#endif
      if( buf == NULL )
          return NULL;
  
@@ -514,12 +572,22 @@ void hb_buffer_swap_copy( hb_buffer_t *src, hb_buffer_t *dst )
      uint8_t *data  = dst->data;
      int      size  = dst->size;
      int      alloc = dst->alloc;
+#ifdef USE_OPENCL
+    cl_mem buffer = dst->cl.buffer;
+    cl_event last_event = dst->cl.last_event;
+    int loc = dst->cl.buffer_location;
+#endif
  
      *dst = *src;
  
      src->data  = data;
      src->size  = size;
      src->alloc = alloc;
+#ifdef USE_OPENCL
+    src->cl.buffer = buffer;
+    src->cl.last_event = last_event;
+    src->cl.buffer_location = loc;
+#endif
  }
  
  // Frees the specified buffer list.
@@ -547,7 +615,14 @@ void hb_buffer_close( hb_buffer_t ** _b )
          // free the buf 
          if( b->data )
          {
-            free( b->data );
+#ifdef USE_OPENCL
+            if (b->cl.buffer != NULL) {
+                if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0)
+                    hb_log("bad free2: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data);
+                }
+            else
+#endif
+                free( b->data );
              hb_lock(buffers.lock);
              buffers.allocated -= b->alloc;
              hb_unlock(buffers.lock);
diff --git a/libhb/hb.c b/libhb/hb.c

index a55575f8e19829f198bd0903e2ef6706b584ad34..f36c95311d99930d9e42028f44248e8e6cf1b8e9 100644 (file)
--- a/libhb/hb.c
+++ b/libhb/hb.c
@@ -436,6 +436,11 @@ hb_handle_t * hb_init( int verbose, int update_check )
  
      h->interjob = calloc( sizeof( hb_interjob_t ), 1 );
  
+    /* opencl */
+#ifdef USE_OPENCL
+    //hb_opencl_init();        // FIXME: Ensure gui instances call this or hb_get_opencl_env() during startup if needed.
+#endif
+
      /* Start library thread */
      hb_log( "hb_init: starting libhb thread" );
      h->die         = 0;
diff --git a/libhb/internal.h b/libhb/internal.h

index 4a2f775a988f5458856a3ba10e0f5bee666d6c76..adddbe58b26917e907be33aec52f8bdc9c6d262c 100644 (file)
--- a/libhb/internal.h
+++ b/libhb/internal.h
@@ -114,6 +114,15 @@ struct hb_buffer_s
          int           size;
      } plane[4]; // 3 Color components + alpha
  
+#ifdef USE_OPENCL
+    struct cl_data
+    {
+        cl_mem buffer;
+        cl_event last_event;
+        enum { HOST, DEVICE } buffer_location;
+    } cl;
+#endif
+
      // PICTURESUB subtitle packets:
  
      // Video packets (after processing by the hb_sync_video work-object):
diff --git a/libhb/oclscale.c b/libhb/oclscale.c

index ca916c166bafa9cfeb4dbec0969b3b16a0a2011a..904183340fb7e6b0c55c226e73c9a9de48fc2234 100644 (file)
--- a/libhb/oclscale.c
+++ b/libhb/oclscale.c
@@ -13,298 +13,259 @@
  \r
  #ifdef USE_OPENCL\r
  \r
-#include <math.h>\r
-#include "common.h"\r
-#include "openclwrapper.h"\r
-#define MaxFilterLength 16\r
-#define FILTER_LEN 4\r
-\r
-inline double hb_fit_gauss_kernel( double x )\r
-{\r
-    double powNum = -1 * M_PI;\r
-\r
-    powNum *= x;\r
-\r
-    powNum *= x;\r
-\r
-    return exp( powNum );\r
-}\r
-\r
-/**\r
- * Using gaussian algorithm to calculate the scale filter\r
- */\r
-static void hb_set_gauss_interpolation( float *pcoeff, int *pmappedindex, int targetdatalength, int srcdatalength, int filterLength, float bias )\r
-{\r
-    int i, j;\r
-\r
-    float gausskernel[MaxFilterLength];\r
-\r
-    int half = filterLength / 2;\r
-\r
-    float scalerate = (float)(srcdatalength) / targetdatalength;\r
-\r
-    for( i = 0; i < targetdatalength; ++i )\r
-    {\r
-        float flindex = i * scalerate + bias;\r
-\r
-        if( flindex > (srcdatalength - 1))\r
-        {\r
-            flindex -= (int)(flindex - (srcdatalength - 1));\r
-        }\r
-\r
-        int srcindex = (int)(flindex);\r
-\r
-        float t = flindex - srcindex;\r
-\r
-        for( j = 0; j < (int)half; j++ )\r
-        {\r
-            gausskernel[j] = (float)hb_fit_gauss_kernel((half - j) - 1 + t );\r
-        }\r
-\r
-        for( j = 0; j < (int)half; j++ )\r
-        {\r
-            gausskernel[half + j] = (float)hb_fit_gauss_kernel( j + 1 - t );\r
-        }\r
-\r
-        while( srcindex < (int)half - 1 )\r
-        {\r
-            /* -1 0 1  2\r
-            * M1 S P1 P2\r
-            *\r
-            * if srcindex is 0, M1 and S will be the same sample.  To keep the\r
-            * convolution kernel from having to check for edge conditions, move\r
-            * srcindex to 1, then slide down the coefficients\r
-            */\r
-            srcindex += 1;\r
-\r
-            gausskernel[0] += gausskernel[1];\r
-\r
-            for( j = 1; j < filterLength - 1; j++ )\r
-            {\r
-                gausskernel[j] = gausskernel[j + 1];\r
-            }\r
-\r
-            gausskernel[filterLength - 1] = 0;\r
-        }\r
-\r
-        while( srcindex >= srcdatalength - half )\r
-        {\r
-            /* If srcindex is near the edge, shift down srcindex and slide up\r
-            * the coefficients\r
-            */\r
-            srcindex -= 1;\r
-\r
-            gausskernel[3] += gausskernel[2];\r
-\r
-            for( j = filterLength - 2; j > 0; j-- )\r
-            {\r
-                gausskernel[j] = gausskernel[j - 1];\r
-            }\r
-\r
-            gausskernel[0] = 0;\r
-        }\r
-\r
-        *pmappedindex++ = srcindex - half + 1;\r
-\r
-        // Store normalized Gauss kernel\r
-\r
-        float sumtemp = 0;\r
-\r
-        for( j = 0; j < filterLength; ++j )\r
-        {\r
-            sumtemp += gausskernel[j];\r
-        }\r
-\r
-        for( j = 0; j < filterLength; ++j )\r
-        {\r
-            pcoeff[targetdatalength * j + i] = gausskernel[j] / sumtemp;\r
-        }\r
-    }\r
-}\r
-\r
-/**\r
-* executive scale using opencl\r
-* get filter args\r
-* create output buffer\r
-* create horizontal filter buffer\r
-* create vertical filter buffer\r
-* create  kernels\r
-*/\r
-int hb_ocl_scale_func( void **data, KernelEnv *kenv )\r
-{\r
-    cl_int status;\r
-\r
-    uint8_t *in_frame = data[0];\r
-    uint8_t *out_frame = data[1];\r
-    int in_frame_w = (int)data[2];\r
-    int in_frame_h = (int)data[3];\r
-    int out_frame_w = (int)data[4];\r
-    int out_frame_h = (int)data[5];\r
-    hb_oclscale_t *os = data[6];\r
-\r
-    if( os->use_ocl_mem )\r
-        os->h_in_buf = data[0];\r
-    int h_filter_len = FILTER_LEN;\r
-    int v_filter_len = FILTER_LEN;\r
-    //it will make the psnr lower when filter length is 4 in the condition that the video width is shorter than 960 and width is shorter than 544,so we set the filter length to 2\r
-    if( out_frame_w <= 960 && out_frame_h <= 544 ) \r
-    {\r
-        h_filter_len>>=1;\r
-        v_filter_len>>=1;\r
-    }\r
-    if( !os->h_out_buf )\r
-    {\r
-        hb_log( "OpenCL: Scaling With OpenCL" );\r
-        //malloc filter args\r
-        float *hf_y, *hf_uv, *vf_y, *vf_uv;\r
-        int   *hi_y, *hi_uv, *vi_y, *vi_uv;\r
-        hf_y =  (float*)malloc( sizeof(float)*out_frame_w * h_filter_len );\r
-        hf_uv = (float*)malloc( sizeof(float)*(out_frame_w>>1) * h_filter_len );\r
-        hi_y =  (int*)malloc( sizeof(int)*out_frame_w );\r
-        hi_uv = (int*)malloc( sizeof(int)*(out_frame_w>>1));\r
-        vf_y =  (float*)malloc( sizeof(float)*out_frame_h * v_filter_len );\r
-        vf_uv = (float*)malloc( sizeof(float)*(out_frame_h>>1) * v_filter_len );\r
-        vi_y =  (int*)malloc( sizeof(int)*out_frame_h );\r
-        vi_uv = (int*)malloc( sizeof(int)*(out_frame_h>>1) );\r
-        //get filter args\r
-        hb_set_gauss_interpolation( hf_y, hi_y, out_frame_w, in_frame_w, h_filter_len, 0 );\r
-        hb_set_gauss_interpolation( hf_uv, hi_uv, out_frame_w>>1, in_frame_w>>1, h_filter_len, 0 );\r
-        hb_set_gauss_interpolation( vf_y, vi_y, out_frame_h, in_frame_h, v_filter_len, 0 );\r
-        hb_set_gauss_interpolation( vf_uv, vi_uv, out_frame_h>>1, in_frame_h>>1, v_filter_len, 0 );\r
-        //create output buffer\r
-        if( !os->use_ocl_mem )\r
-        {\r
-            CREATEBUF( os->h_in_buf, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, sizeof(uint8_t)* in_frame_w * in_frame_h*3/2 );\r
-        }\r
-        CREATEBUF( os->h_out_buf, CL_MEM_WRITE_ONLY, sizeof(uint8_t) * out_frame_w * in_frame_h*3/2 );\r
-        CREATEBUF( os->v_out_buf, CL_MEM_WRITE_ONLY, sizeof(uint8_t) * out_frame_w * out_frame_h*3/2 );\r
-        //create horizontal filter buffer\r
-        CREATEBUF( os->h_coeff_y, CL_MEM_READ_ONLY, sizeof(float) * out_frame_w * h_filter_len );\r
-        CREATEBUF( os->h_coeff_uv, CL_MEM_READ_ONLY, sizeof(float) * (out_frame_w>>1) * h_filter_len );\r
-        CREATEBUF( os->h_index_y, CL_MEM_READ_ONLY, sizeof(int) * out_frame_w );\r
-        CREATEBUF( os->h_index_uv, CL_MEM_READ_ONLY, sizeof(int) * (out_frame_w>>1) );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_coeff_y, CL_TRUE, 0, sizeof(float) * out_frame_w * h_filter_len, hf_y, 0, NULL, NULL );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_coeff_uv, CL_TRUE, 0, sizeof(float) * (out_frame_w>>1) * h_filter_len, hf_uv, 0, NULL, NULL );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_index_y, CL_TRUE, 0, sizeof(int) * out_frame_w, hi_y, 0, NULL, NULL );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_index_uv, CL_TRUE, 0, sizeof(int) * (out_frame_w>>1), hi_uv, 0, NULL, NULL );\r
-        //create vertical filter buffer\r
-        CREATEBUF( os->v_coeff_y, CL_MEM_READ_ONLY, sizeof(float) * out_frame_h * v_filter_len );\r
-        CREATEBUF( os->v_coeff_uv, CL_MEM_READ_ONLY, sizeof(float) * (out_frame_h>>1) * v_filter_len );\r
-        CREATEBUF( os->v_index_y, CL_MEM_READ_ONLY, sizeof(int) * out_frame_h );\r
-        CREATEBUF( os->v_index_uv, CL_MEM_READ_ONLY, sizeof(int) * (out_frame_h>>1) );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_coeff_y, CL_TRUE, 0, sizeof(float) * out_frame_h * v_filter_len, vf_y, 0, NULL, NULL );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_coeff_uv, CL_TRUE, 0, sizeof(float) * (out_frame_h>>1) * v_filter_len, vf_uv, 0, NULL, NULL );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_index_y, CL_TRUE, 0, sizeof(int) * out_frame_h, vi_y, 0, NULL, NULL );\r
-        OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->v_index_uv, CL_TRUE, 0, sizeof(int) * (out_frame_h>>1), vi_uv, 0, NULL, NULL );\r
-        //create horizontal kernel\r
-        os->h_kernel = clCreateKernel( kenv->program, "frame_h_scale", NULL );\r
-        OCLCHECK( clSetKernelArg, os->h_kernel, 1, sizeof(cl_mem), &os->h_coeff_y );\r
-        OCLCHECK( clSetKernelArg, os->h_kernel, 2, sizeof(cl_mem), &os->h_coeff_uv );\r
-        OCLCHECK( clSetKernelArg, os->h_kernel, 3, sizeof(cl_mem), &os->h_index_y );\r
-        OCLCHECK( clSetKernelArg, os->h_kernel, 4, sizeof(cl_mem), &os->h_index_uv );\r
-        OCLCHECK( clSetKernelArg, os->h_kernel, 6, sizeof(int), &in_frame_w );\r
-        OCLCHECK( clSetKernelArg, os->h_kernel, 7, sizeof(int), &h_filter_len );\r
-        //create vertical kernel\r
-        os->v_kernel = clCreateKernel( kenv->program, "frame_v_scale", NULL );\r
-        OCLCHECK( clSetKernelArg, os->v_kernel, 1, sizeof(cl_mem), &os->v_coeff_y );\r
-        OCLCHECK( clSetKernelArg, os->v_kernel, 2, sizeof(cl_mem), &os->v_coeff_uv );\r
-        OCLCHECK( clSetKernelArg, os->v_kernel, 3, sizeof(cl_mem), &os->v_index_y );\r
-        OCLCHECK( clSetKernelArg, os->v_kernel, 4, sizeof(cl_mem), &os->v_index_uv );\r
-        OCLCHECK( clSetKernelArg, os->v_kernel, 6, sizeof(int), &in_frame_h );\r
-        OCLCHECK( clSetKernelArg, os->v_kernel, 7, sizeof(int), &v_filter_len );\r
-        free( hf_y );\r
-        free( hf_uv );\r
-        free( vf_y );\r
-        free( vf_uv );\r
-        free( hi_y );\r
-        free( hi_uv );\r
-        free( vi_y );\r
-        free( vi_uv );\r
-    }\r
-    //start horizontal scaling kernel\r
-\r
-    if( !os->use_ocl_mem )\r
-    {\r
-        if( kenv->isAMD )\r
-        {\r
-            char *mapped = clEnqueueMapBuffer( kenv->command_queue, os->h_in_buf, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, sizeof(uint8_t) * in_frame_w * in_frame_h*3/2, 0, NULL, NULL, NULL );\r
-            memcpy( mapped, in_frame, sizeof(uint8_t) * in_frame_w * in_frame_h*3/2 );\r
-            clEnqueueUnmapMemObject( kenv->command_queue, os->h_in_buf, mapped, 0, NULL, NULL );\r
-        }\r
-        else\r
-        {\r
-            OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, os->h_in_buf, CL_TRUE, 0, sizeof(uint8_t) * in_frame_w * in_frame_h * 3/2, in_frame, 0, NULL, NULL );\r
-        }\r
-    }\r
-\r
-    kenv->kernel = os->h_kernel;\r
-    size_t dims[2];\r
-    dims[0] = out_frame_w;\r
-    dims[1] = in_frame_h;\r
-    OCLCHECK( clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), &os->h_in_buf );\r
-    OCLCHECK( clSetKernelArg, kenv->kernel, 5, sizeof(cl_mem), &os->h_out_buf );\r
-    OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, dims, NULL, 0, NULL, NULL );\r
-    //start vertical scaling kernel\r
-\r
-    kenv->kernel = os->v_kernel;\r
-    dims[0] = out_frame_w;\r
-    dims[1] = out_frame_h;\r
-    OCLCHECK( clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), &os->h_out_buf );\r
-    OCLCHECK( clSetKernelArg, kenv->kernel, 5, sizeof(cl_mem), &os->v_out_buf );\r
-    OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, dims, NULL, 0, NULL, NULL );\r
-    OCLCHECK( clEnqueueReadBuffer, kenv->command_queue, os->v_out_buf, CL_TRUE, 0, sizeof(uint8_t) * out_frame_w * out_frame_h * 3/2, out_frame, 0, NULL, NULL );\r
-\r
-    return 1;\r
-}\r
-\r
-/**\r
-* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm\r
-*  parameter:\r
+#include <math.h>
+#include "common.h"
+#include "openclwrapper.h"
+#define FILTER_LEN 4
+
+#define _A -0.5f
+
+cl_float cubic(cl_float x)
+{
+    if (x < 0)
+        x = -x;
+
+    if (x < 1)
+        return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1;
+    else if (x < 2)
+        return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A);
+    else
+        return 0;
+}
+
+
+cl_float *hb_bicubic_weights(cl_float scale, int length)
+{
+    cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4);
+
+    int i;    // C rocks
+    cl_float *out = weights;
+    for (i = 0; i < length; ++i)
+    {
+        cl_float x = i / scale;
+        cl_float dx = x - (int)x;
+        *out++ = cubic(-dx - 1.0f);
+        *out++ = cubic(-dx);
+        *out++ = cubic(-dx + 1.0f);
+        *out++ = cubic(-dx + 2.0f);
+    }
+    return weights;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv);
+
+/**
+* executive scale using opencl
+* get filter args
+* create output buffer
+* create horizontal filter buffer
+* create vertical filter buffer
+* create  kernels
+*/
+int hb_ocl_scale_func( void **data, KernelEnv *kenv )
+{
+    cl_int status;
+
+    cl_mem in_buf = data[0];
+    cl_mem out_buf = data[1];
+    int crop_top = data[2];
+    int crop_bottom = data[3];
+    int crop_left = data[4];
+    int crop_right = data[5];
+    int in_frame_w = (int)data[6];
+    int in_frame_h = (int)data[7];
+    int out_frame_w = (int)data[8];
+    int out_frame_h = (int)data[9];
+    hb_oclscale_t  *os = data[10];
+    hb_buffer_t *in = data[11];
+    hb_buffer_t *out = data[12];
+
+    if (os->initialized == 0)
+    {
+        hb_log( "Scaling With OpenCL" );
+        if (kenv->isAMD != 0)
+            hb_log( "Using Zero Copy");
+        // create the block kernel
+        cl_int status;
+        os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status );
+
+        os->initialized = 1;
+    }
+
+    {
+        // Use the new kernel
+        cl_event events[5];
+        int eventCount = 0;
+
+        if (kenv->isAMD == 0) {
+            status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]);
+            status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]);
+        }
+
+        cl_int srcPlaneOffset0 = in->plane[0].data - in->data;
+        cl_int srcPlaneOffset1 = in->plane[1].data - in->data;
+        cl_int srcPlaneOffset2 = in->plane[2].data - in->data;
+        cl_int srcRowWords0 = in->plane[0].stride;
+        cl_int srcRowWords1 = in->plane[1].stride;
+        cl_int srcRowWords2 = in->plane[2].stride;
+        cl_int dstPlaneOffset0 = out->plane[0].data - out->data;
+        cl_int dstPlaneOffset1 = out->plane[1].data - out->data;
+        cl_int dstPlaneOffset2 = out->plane[2].data - out->data;
+        cl_int dstRowWords0 = out->plane[0].stride;
+        cl_int dstRowWords1 = out->plane[1].stride;
+        cl_int dstRowWords2 = out->plane[2].stride;
+
+        if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) {
+            srcPlaneOffset0 += crop_left + crop_top * srcRowWords0;
+            srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1;
+            srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2;
+            in_frame_w = in_frame_w - crop_right - crop_left;
+            in_frame_h = in_frame_h - crop_bottom - crop_top;
+        }
+
+        cl_float xscale = (out_frame_w * 1.0f) / in_frame_w;
+        cl_float yscale = (out_frame_h * 1.0f) / in_frame_h;
+        setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv);
+
+        OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w );        // FIXME: type mismatch
+        OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h );        //
+        OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w );        //
+        OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h );        //
+        OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights );
+        OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights );
+
+        size_t workOffset[] = { 0, 0, 0 };
+        size_t globalWorkSize[] = { 1, 1, 1 };
+        size_t localWorkSize[] = { 1, 1, 1 };
+
+        int xgroups = (out_frame_w + 63) / 64;
+        int ygroups = (out_frame_h + 15) / 16;
+
+        localWorkSize[0] = 64;
+        localWorkSize[1] = 1;
+        localWorkSize[2] = 1;
+        globalWorkSize[0] = xgroups * 64;
+        globalWorkSize[1] = ygroups;
+        globalWorkSize[2] = 3;
+
+        OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] );
+        ++eventCount;
+
+        if (kenv->isAMD == 0) {
+            in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status);
+            out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status);
+            eventCount += 2;
+        }
+
+        clFlush(kenv->command_queue);
+        clWaitForEvents(eventCount, &events[0]);
+        int i;
+        for (i = 0; i < eventCount; ++i)
+            clReleaseEvent(events[i]);
+    }
+
+    return 1;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) {
+    cl_int status;
+    if (os->xscale != xscale || os->width < width) {
+        cl_float *xweights = hb_bicubic_weights(xscale, width);
+        CL_FREE(os->bicubic_x_weights);
+        CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4);
+        OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL );
+        os->width = width;
+        os->xscale = xscale;
+        free(xweights);
+    }
+
+    if ((os->yscale != yscale) || (os->height < height)) {
+        cl_float *yweights = hb_bicubic_weights(yscale, height);
+        CL_FREE(os->bicubic_y_weights);
+        CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4);
+        OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL );
+        os->height = height;
+        os->yscale = yscale;
+        free(yweights);
+    }
+    return 0;
+}
+
+
+/**
+* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm
+*  parameter:
  *           inputFrameBuffer: the source video frame opencl buffer\r
  *           outputdata: the destination video frame buffer\r
  *           inputWidth: the width of the source video frame\r
  *           inputHeight: the height of the source video frame\r
-*           outputWidth: the width of destination video frame\r
-*           outputHeight: the height of destination video frame\r
-*/\r
-int hb_ocl_scale( cl_mem in_buf, uint8_t *in_data, uint8_t *out_data, int in_w, int in_h, int out_w, int out_h, hb_oclscale_t  *os )\r
-{\r
-    void *data[7];\r
-    static int init_flag = 0;\r
-    if( init_flag == 0 )\r
-    {\r
-        int st = hb_register_kernel_wrapper( "frame_h_scale", hb_ocl_scale_func );\r
-        if( !st )\r
-        {\r
-            hb_log( "OpenCL: Register kernel[%s] failed", "frame_h_scale" );\r
-            return 0;\r
-        }\r
-        init_flag++;\r
-    }\r
-\r
-    if( in_data==NULL )\r
-    {\r
-        data[0] = in_buf;\r
-        os->use_ocl_mem = 1;\r
-    }\r
-    else\r
-    {\r
-        data[0] = in_data;\r
-        os->use_ocl_mem = 0;\r
-    }\r
-\r
-    data[1] = out_data;\r
-    data[2] = (void*)in_w;\r
-    data[3] = (void*)in_h;\r
-    data[4] = (void*)out_w;\r
-    data[5] = (void*)out_h;\r
-    data[6] = os;\r
-\r
-    if( !hb_run_kernel( "frame_h_scale", data ) )\r
-       {\r
-        hb_log( "OpenCL: Run kernel[%s] failed", "frame_scale" );\r
-       }\r
-\r
-    return 0;\r
-}\r
-#endif\r
+*           outputWidth: the width of destination video frame
+*           outputHeight: the height of destination video frame
+*/
+
+
+static int s_scale_init_flag = 0;
+
+int do_scale_init()
+{
+    if ( s_scale_init_flag==0 )
+    {
+        int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func );
+        if( !st )
+        {
+            hb_log( "register kernel[%s] failed", "frame_scale" );
+            return 0;
+        }
+        s_scale_init_flag++;
+    }
+    return 1;
+}
+
+
+int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os)
+{
+    void *data[13];
+
+    if (do_scale_init() == 0)
+        return 0;
+
+    data[0] = in->cl.buffer;
+    data[1] = out->cl.buffer;
+    data[2] = (void*)(crop[0]);
+    data[3] = (void*)(crop[1]);
+    data[4] = (void*)(crop[2]);
+    data[5] = (void*)(crop[3]);
+    data[6] = (void*)(in->f.width);
+    data[7] = (void*)(in->f.height);
+    data[8] = (void*)(out->f.width);
+    data[9] = (void*)(out->f.height);
+    data[10] = os;
+    data[11] = in;
+    data[12] = out;
+
+    if( !hb_run_kernel( "frame_scale", data ) )
+        hb_log( "run kernel[%s] failed", "frame_scale" );
+    return 0;
+}
+
+
+
+
+
+#endif
diff --git a/libhb/openclkernels.h b/libhb/openclkernels.h

index e347486140753133ed2afb6c0f2c26daf23b1233..f324aafdd9ed228ea26b44faa9ce2f62daa0b349 100644 (file)
--- a/libhb/openclkernels.h
+++ b/libhb/openclkernels.h
@@ -541,12 +541,100 @@ char *kernel_src_vscalefast = KERNEL (
          val1 = (src[cfilterPos[h] * dstChrStride + w] + local_up_dither[ (w + 3) & 7] ) >> 7;\r
          dst[h * dstChrStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));\r
  \r
-    }\r
-    );\r
-\r
-char *kernel_src_yadif_filter = KERNEL(\r
-    void filter_v6(\r
-        global unsigned char *dst,\r
+    }
+    );
+
+char *kernel_src_scale = KERNEL (
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void frame_scale(__global uchar *dst,
+                          __global const uchar *src,
+                          const float xscale,
+                          const float yscale,
+                          const int srcPlaneOffset0,
+                          const int srcPlaneOffset1,
+                          const int srcPlaneOffset2,
+                          const int dstPlaneOffset0,
+                          const int dstPlaneOffset1,
+                          const int dstPlaneOffset2,
+                          const int srcRowWords0,
+                          const int srcRowWords1,
+                          const int srcRowWords2,
+                          const int dstRowWords0,
+                          const int dstRowWords1,
+                          const int dstRowWords2,
+                          const int srcWidth,
+                          const int srcHeight,
+                          const int dstWidth,
+                          const int dstHeight,
+                          __global const float4* restrict xweights,
+                          __global const float4* restrict yweights
+                          )
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Abort work items outside the dst image bounds.
+
+    if ((get_group_id(0) * 64 >= (dstWidth >> ((z == 0) ? 0 : 1))) || (get_group_id(1) * 16 >= (dstHeight >> ((z == 0) ? 0 : 1))))
+      return;
+
+    const int srcPlaneOffset = (z == 0) ? srcPlaneOffset0 : ((z == 1) ? srcPlaneOffset1 : srcPlaneOffset2);
+    const int dstPlaneOffset = (z == 0) ? dstPlaneOffset0 : ((z == 1) ? dstPlaneOffset1 : dstPlaneOffset2);
+    const int srcRowWords = (z == 0) ? srcRowWords0: ((z == 1) ? srcRowWords1 : srcRowWords2);
+    const int dstRowWords = (z == 0) ? dstRowWords0: ((z == 1) ? dstRowWords1 : dstRowWords2);
+
+    __local uchar pixels[64 * 36];
+    const int localRowPixels = 64;
+    const int groupHeight = 16;     // src pixel height output by the workgroup
+    const int ypad = 2;
+    const int localx = get_local_id(0);
+
+    const int globalStartRow = floor((get_group_id(1) * groupHeight) / yscale);
+    const int globalRowCount = ceil(groupHeight / yscale) + 2 * ypad;
+
+    float4 weights = xweights[x];
+    int4 woffs = floor(x / xscale);
+    woffs += (int4)(-1, 0, 1, 2);
+    woffs = clamp(woffs, 0, (srcWidth >> ((z == 0) ? 0 : 1)) - 1);
+    const int maxy = (srcHeight >> ((z == 0) ? 0 : 1)) - 1;
+
+    // Scale x from global into LDS
+
+    for (int i = 0; i <= globalRowCount; ++i) {
+        int4 offs = srcPlaneOffset + clamp(globalStartRow - ypad + i, 0, maxy) * srcRowWords;
+        offs += woffs;
+        pixels[localx + i * localRowPixels] = convert_uchar(clamp(round(dot(weights,
+                       (float4)(src[offs.x], src[offs.y], src[offs.z], src[offs.w]))), 0.0f, 255.0f));
+    }
+ 
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Scale y from LDS into global
+
+    if (x >= dstWidth >> ((z == 0) ? 0 : 1))
+        return;
+
+    int off = dstPlaneOffset + x + (get_group_id(1) * groupHeight) * dstRowWords;
+
+    for (int i = 0; i < groupHeight; ++i) {
+        if (y >= dstHeight >> ((z == 0) ? 0 : 1))
+          break;
+        int localy = floor((get_group_id(1) * groupHeight + i) / yscale);
+        localy = localy - globalStartRow + ypad;
+        int loff = localx + localy * localRowPixels;
+        dst[off] = convert_uchar(clamp(round(dot(yweights[get_group_id(1) * groupHeight + i],
+                                (float4)(pixels[loff - localRowPixels], pixels[loff], pixels[loff + localRowPixels]
+                                , pixels[loff + localRowPixels * 2]))), 0.0f, 255.0f));
+        off += dstRowWords;
+    }
+}
+);
+
+
+char *kernel_src_yadif_filter = KERNEL(
+    void filter_v6(
+        global unsigned char *dst,
          global unsigned char *prev, \r
          global unsigned char *cur, \r
          global unsigned char *next,\r
diff --git a/libhb/openclwrapper.c b/libhb/openclwrapper.c

index 01aef90d102c7b71537ee0ae12c699cf7192a828..9a7e9888d866562aef3452fdc80608e61ef7775c 100644 (file)
--- a/libhb/openclwrapper.c
+++ b/libhb/openclwrapper.c
@@ -79,12 +79,13 @@ typedef struct
      char kernelName[MAX_KERNEL_NAME_LEN+1];\r
      char * kernelStr;\r
  }hb_kernel_node;\r
-\r
-static GPUEnv gpu_env;\r
-static int isInited = 0;\r
-static hb_kernel_node gKernels[MAX_KERNEL_NUM];\r
-\r
-#define ADD_KERNEL_CFG( idx, s, p ){\\r
+
+static GPUEnv gpu_env;
+static int isInited = 0;
+static int useBuffers = 0;
+static hb_kernel_node gKernels[MAX_KERNEL_NUM];
+
+#define ADD_KERNEL_CFG( idx, s, p ){\
          strcpy( gKernels[idx].kernelName, s );\\r
          gKernels[idx].kernelStr = p;\\r
          strcpy( gpu_env.kernel_names[idx], s );\\r
@@ -162,23 +163,21 @@ int hb_confirm_gpu_type()
  \r
  /**\r
   * hb_regist_opencl_kernel\r
- */\r
-int hb_regist_opencl_kernel()\r
-{\r
-    if( !gpu_env.isUserCreated )\r
-        memset( &gpu_env, 0, sizeof(gpu_env) );\r
-\r
-    gpu_env.file_count = 0; //argc;\r
-    gpu_env.kernel_count = 0UL;\r
-\r
-    ADD_KERNEL_CFG( 0, "frame_h_scale", NULL )\r
-    ADD_KERNEL_CFG( 1, "frame_v_scale", NULL )\r
-    ADD_KERNEL_CFG( 2, "nv12toyuv", NULL )\r
-    ADD_KERNEL_CFG( 3, "scale_opencl", NULL )\r
-    ADD_KERNEL_CFG( 4, "yadif_filter", NULL )\r
-\r
-    return 0;\r
-}\r
+ */
+int hb_regist_opencl_kernel()
+{
+    //if( !gpu_env.isUserCreated )
+    //    memset( &gpu_env, 0, sizeof(gpu_env) );
+    //Comment for posterity: When in doubt just zero out a structure full of pointers to allocated resources.
+
+    gpu_env.file_count = 0; //argc;
+    gpu_env.kernel_count = 0UL;
+
+    ADD_KERNEL_CFG( 0, "frame_scale", NULL )
+    ADD_KERNEL_CFG( 1, "yadif_filter", NULL )
+
+    return 0;
+}
  \r
  /**\r
   * hb_regist_opencl_kernel\r
@@ -512,23 +511,30 @@ int hb_release_kernel( KernelEnv * env )
  }\r
  \r
  /**\r
- * hb_init_opencl_env\r
- * @param gpu_info -\r
- */\r
-int hb_init_opencl_env( GPUEnv *gpu_info )\r
-{\r
-    size_t length;\r
+ * hb_init_opencl_env
+ * @param gpu_info -
+ */
+
+static int init_once = 0;
+int hb_init_opencl_env( GPUEnv *gpu_info )
+{
+    size_t length;
      cl_int status;\r
      cl_uint numPlatforms, numDevices;\r
      cl_platform_id *platforms;\r
      cl_context_properties cps[3];\r
      char platformName[100];\r
-    unsigned int i;\r
-    void *handle = INVALID_HANDLE_VALUE;\r
-\r
-    /*\r
-     * Have a look at the available platforms.\r
-     */\r
+    unsigned int i;
+    void *handle = INVALID_HANDLE_VALUE;
+
+
+    if (init_once != 0)
+        return 0;
+    else
+        init_once = 1;
+    /*
+     * Have a look at the available platforms.
+     */
      if( !gpu_info->isUserCreated )\r
      {\r
          status = clGetPlatformIDs( 0, NULL, &numPlatforms );\r
@@ -793,26 +799,22 @@ int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info,
  \r
  #ifdef USE_EXTERNAL_KERNEL\r
      status = hb_convert_to_string( filename, &source_str, gpu_info, idx );\r
-    if( status == 0 )\r
-        return(0);\r
-#else\r
-    int kernel_src_size = strlen( kernel_src_hscale ) + strlen( kernel_src_vscale ) \r
-                                  + strlen( kernel_src_nvtoyuv ) + strlen( kernel_src_hscaleall ) \r
-                                  + strlen( kernel_src_hscalefast ) + strlen( kernel_src_vscalealldither ) \r
-                                  + strlen( kernel_src_vscaleallnodither ) + strlen( kernel_src_vscalefast )\r
-                                  + strlen( kernel_src_yadif_filter );\r
-    source_str = (char*)malloc( kernel_src_size + 2 );\r
-    strcpy( source_str, kernel_src_hscale );\r
-    strcat( source_str, kernel_src_vscale );\r
-    strcat( source_str, kernel_src_nvtoyuv );\r
-    strcat( source_str, kernel_src_hscaleall );\r
-    strcat( source_str, kernel_src_hscalefast );\r
-    strcat( source_str, kernel_src_vscalealldither );\r
-    strcat( source_str, kernel_src_vscaleallnodither );\r
-    strcat( source_str, kernel_src_vscalefast );\r
-    strcat( source_str, kernel_src_yadif_filter );\r
-#endif\r
-\r
+    if( status == 0 )
+        return(0);
+#else
+    int kernel_src_size = strlen(kernel_src_scale) + strlen(kernel_src_yadif_filter);
+
+//    char *scale_src;
+//    status = hb_convert_to_string("./scale_kernels.cl", &scale_src, gpu_info, idx);
+//    if (status != 0)
+//        kernel_src_size += strlen(scale_src);
+
+    source_str = (char*)malloc( kernel_src_size + 2 );
+    strcpy( source_str, kernel_src_scale );
+//    strcat( source_str, scale_src );    //
+    strcat( source_str, kernel_src_yadif_filter );
+#endif
+
      source = source_str;\r
      source_size[0] = strlen( source );\r
  \r
@@ -947,13 +949,13 @@ int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info,
      }\r
  \r
      strcpy( gpu_env.kernelSrcFile[idx], filename );\r
-\r
-    if (binaryExisted != 1)\r
-    {\r
-        hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename);\r
-    }\r
-\r
-    gpu_info->file_count += 1;\r
+
+    if (binaryExisted != 1)
+    {
+        //hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename);
+    }
+
+    gpu_info->file_count += 1;
  \r
      return(1);\r
  }\r
@@ -1042,12 +1044,13 @@ int hb_init_opencl_run_env( int argc, char **argv, const char *build_option )
          if( status == 0 || gpu_env.kernel_count == 0 )\r
          {\r
              return(1);\r
-\r
-        }\r
-\r
-        isInited = 1;\r
-    }\r
-\r
+
+        }
+
+        useBuffers = 1;
+        isInited = 1;
+    }
+
      return(0);\r
  }\r
  \r
@@ -1164,12 +1167,46 @@ int hb_read_opencl_buffer( cl_mem cl_inBuf, unsigned char *outbuf, int size )
          return 0; \r
      }\r
  \r
-    return 1;\r
-}\r
-\r
-int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)\r
-{\r
-    int status = clEnqueueCopyBuffer(gpu_env.command_queue,\r
+    return 1;
+}
+
+int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size)
+{
+    int status;
+    int flags = CL_MEM_ALLOC_HOST_PTR;
+    //cl_event event;
+    *mem = clCreateBuffer(gpu_env.context, flags, size, NULL, &status);
+    *addr = clEnqueueMapBuffer(gpu_env.command_queue, *mem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL/*&event*/, &status);
+
+    //hb_log("\t **** context: %.8x   cmdqueue: %.8x   cl_mem: %.8x   mapaddr: %.8x   size: %d    status: %d", gpu_env.context, gpu_env.command_queue, mem, addr, size, status);
+
+    return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr)
+{
+    cl_event event;
+    int status = clEnqueueUnmapMemObject(gpu_env.command_queue, mem, addr, 0, NULL, &event);
+    if (status == CL_SUCCESS)
+        clWaitForEvents(1, &event);
+    else
+        hb_log("hb_free_mapped_buffer: error %d", status);
+    return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+void hb_opencl_init()
+{
+    hb_get_opencl_env();
+}
+
+int hb_use_buffers()
+{
+    return useBuffers;
+}
+
+int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)
+{
+    int status = clEnqueueCopyBuffer(gpu_env.command_queue,
          src_buffer,\r
          dst_buffer,\r
          src_offset, dst_offset, cb,\r
diff --git a/libhb/openclwrapper.h b/libhb/openclwrapper.h

index 4244c4454a5b54fccd097cb1c60885dc3321b952..8436c3a9c5f76d16cf79b7641e93a4755fcb118a 100644 (file)
--- a/libhb/openclwrapper.h
+++ b/libhb/openclwrapper.h
@@ -67,15 +67,23 @@ int hb_init_opencl_attr( OpenCLEnv * env );
  // create kernel object  by a kernel name on the specified opencl run time indicated by env parameter\r
  int hb_create_kernel( char * kernelname, KernelEnv * env );\r
  \r
-// release kernel object which is generated by calling the hb_create_kernel api\r
-int hb_release_kernel( KernelEnv * env );\r
-\r
-int hb_get_opencl_env();\r
-\r
-int hb_create_buffer(cl_mem *cl_Buf,int flags,int size);\r
-\r
-int hb_read_opencl_buffer(cl_mem cl_inBuf,unsigned char *outbuf,int size);\r
-\r
-int hb_confirm_gpu_type();\r
-#endif\r
-#endif\r
+// release kernel object which is generated by calling the hb_create_kernel api
+int hb_release_kernel( KernelEnv * env );
+
+void hb_opencl_init();
+
+int hb_get_opencl_env();
+
+int hb_create_buffer(cl_mem *cl_Buf,int flags,int size);
+
+int hb_read_opencl_buffer(cl_mem cl_inBuf,unsigned char *outbuf,int size);
+
+int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size);
+
+int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr);
+
+int hb_use_buffers();
+
+int hb_confirm_gpu_type();
+#endif
+#endif
diff --git a/libhb/vadxva2.c b/libhb/vadxva2.c

index f148f197e88b144f363b4895382e24025fe71e0a..4137b63c611e2f575d73c1d32c6e847dbd96f9a6 100644 (file)
--- a/libhb/vadxva2.c
+++ b/libhb/vadxva2.c
@@ -633,32 +633,10 @@ int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w
              lock.Pitch,
              lock.Pitch,
          };
-#ifdef USE_OPENCL
-        if( ( dxva2->width > job_w || dxva2->height > job_h ) && (use_opencl) )
-        {
-            hb_ocl_nv12toyuv( plane, lock.Pitch,  dxva2->width, dxva2->height, crop, dxva2, use_decomb, use_detelecine );
-            if ( use_decomb || use_detelecine )
-                hb_read_opencl_buffer( dxva2->cl_mem_yuv, dst, dxva2->width * dxva2->height * 3 / 2 );
-            else
-            {
-                static int init_flag = 0;
-                if(init_flag == 0)
-                {
-                    scale_init( dxva2->width - crop[2] - crop[3], dxva2->height - crop[0] - crop[1], job_w, job_h );
-                    init_flag = 1;
-                }
-
-                hb_init_filter( dxva2->cl_mem_yuv, dxva2->width - crop[2] - crop[3], dxva2->height - crop[0] - crop[1], dst, job_w, job_h, crop ); 
-            }
-        }
-        else
-#endif
-        {
-            hb_copy_from_nv12( dst, plane, pitch, dxva2->width, dxva2->height );
-        }
+
+        hb_copy_from_nv12( dst, plane, pitch, dxva2->width, dxva2->height );
      }
      IDirect3DSurface9_UnlockRect( d3d );
-
      return HB_WORK_OK;
  }
author	sr55 <sr55.hb@outlook.com>
	Fri, 6 Sep 2013 16:34:09 +0000 (16:34 +0000)
committer	sr55 <sr55.hb@outlook.com>
	Fri, 6 Sep 2013 16:34:09 +0000 (16:34 +0000)
libhb/common.h		patch \| blob \| history
libhb/cropscale.c		patch \| blob \| history
libhb/decavcodec.c		patch \| blob \| history
libhb/fifo.c		patch \| blob \| history
libhb/hb.c		patch \| blob \| history
libhb/internal.h		patch \| blob \| history
libhb/oclscale.c		patch \| blob \| history
libhb/openclkernels.h		patch \| blob \| history
libhb/openclwrapper.c		patch \| blob \| history
libhb/openclwrapper.h		patch \| blob \| history
libhb/vadxva2.c		patch \| blob \| history