FFMPEG.CONFIGURE.extra += \
--enable-w32threads \
--enable-memalign-hack \
+ --enable-dxva2 \
+ --enable-hwaccel=h264_dxva2 \
+ --enable-hwaccel=mpeg2_dxva2 \
+ --enable-hwaccel=vc1_dxva2 \
+ --enable-hwaccel=wmv3_dxva2 \
--target-os=mingw32 \
--arch=i386 \
--enable-cross-compile --cross-prefix=$(BUILD.cross.prefix)
job->metadata = hb_metadata_copy( title->metadata );
}
+
static void job_setup( hb_job_t * job, hb_title_t * title )
{
if ( job == NULL || title == NULL )
hb_deep_log( level, " %-50s%20s", line, ascii );
}
}
+
+int hb_gui_use_hwd_flag = 0;
+int hb_use_dxva( hb_title_t * title )
+{
+ return ( (title->video_codec_param == AV_CODEC_ID_MPEG2VIDEO
+ || title->video_codec_param == AV_CODEC_ID_H264
+ || title->video_codec_param == AV_CODEC_ID_VC1
+ || title->video_codec_param == AV_CODEC_ID_WMV3
+ || title->video_codec_param == AV_CODEC_ID_MPEG4 )
+ && title->opaque_priv );
+}
+
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
-
+#ifdef USE_OPENCL
+#if defined(__APPLE__)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+#endif
/*
* It seems WinXP doesn't align the stack of new threads to 16 bytes.
* To prevent crashes in SSE functions, we need to force stack alignement
int hb_subtitle_can_pass( int source, int mux );
hb_attachment_t *hb_attachment_copy(const hb_attachment_t *src);
+
hb_list_t *hb_attachment_list_copy(const hb_list_t *src);
void hb_attachment_close(hb_attachment_t **attachment);
int feature; // Detected DVD feature title
};
+extern int hb_gui_use_hwd_flag;
+
/******************************************************************************
* hb_job_t: settings to be filled by the UI
* Update win/CS/HandBrake.Interop/HandBrakeInterop/HbLib/hb_job_s.cs when changing this struct
int chapter_start;
int chapter_end;
- /* Include chapter marker track in mp4? */
+ /* Include chapter marker track in mp4? */
int chapter_markers;
/* Picture settings:
uint32_t frames_to_skip; // decode but discard this many frames
// initially (for frame accurate positioning
// to non-I frames).
+ int use_opencl;/* 0 is disable use of opencl. 1 is enable use of opencl */
+ int use_hwd;
+ int use_decomb;
+ int use_detelecine;
#ifdef USE_QSV
// QSV-specific settings
struct
uint32_t flags;
// set if video stream doesn't have IDR frames
+ int opencl_support;
+ int hwd_support;
#define HBTF_NO_IDR (1 << 0)
#define HBTF_SCAN_COMPLETE (1 << 0)
};
#define HB_FILTER_DROP 3
#define HB_FILTER_DONE 4
+typedef struct hb_oclscale_s
+{
+#ifdef USE_OPENCL
+ int initialized;
+ // bicubic scale weights
+ cl_mem bicubic_x_weights;
+ cl_mem bicubic_y_weights;
+ cl_float xscale;
+ cl_float yscale;
+ int width;
+ int height;
+ // horizontal scaling and vertical scaling kernel handle
+ cl_kernel m_kernel;
+ int use_ocl_mem; // 0 use host memory. 1 use gpu oclmem
+#endif
+} hb_oclscale_t;
+
+#ifdef USE_OPENCL
+int hb_ocl_scale( hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os );
+#endif
+
+#ifdef USE_OPENCL
+int hb_use_dxva( hb_title_t * title );
+// create opencl buffer
+#define CREATEBUF( out, flags, size )\
+ {\
+ out = clCreateBuffer( kenv->context, (flags), (size), NULL, &status );\
+ if( CL_SUCCESS != status ) return -1;\
+ }
+
+#define OCLCHECK( method, ... )\
+ status = method( __VA_ARGS__ ); if( status != CL_SUCCESS ) {\
+ printf( # method " error '%d'\n", status ); return status; }
+
+#define CL_FREE( buf )\
+{\
+ if( buf )\
+ {\
+ { clReleaseMemObject( buf ); }\
+ buf = NULL;\
+ }\
+}
+
+
+#endif
+
typedef struct hb_filter_init_s
{
hb_job_t * job;
int vrate_base;
int vrate;
int cfr;
+#ifdef USE_OPENCL
+ int use_dxva;
+#endif
+
} hb_filter_init_t;
typedef struct hb_filter_info_s
HB_FILTER_DENOISE,
HB_FILTER_RENDER_SUB,
HB_FILTER_CROP_SCALE,
+
// Finally filters that don't care what order they are in,
// except that they must be after the above filters
HB_FILTER_ROTATE,
#include "hb.h"
#include "hbffmpeg.h"
+#include "common.h"
+
struct hb_filter_private_s
{
+ hb_job_t *job;
int width_in;
int height_in;
int pix_fmt;
int width_out;
int height_out;
int crop[4];
+
+#ifdef USE_OPENCL
+ int use_dxva;
+ int use_decomb;
+ int use_detelecine;
+ hb_oclscale_t *os; //ocl scaler handler
+#endif
struct SwsContext * context;
};
hb_filter_private_t * pv = filter->private_data;
// TODO: add pix format option to settings
+ pv->job = init->job;
pv->pix_fmt_out = init->pix_fmt;
pv->width_in = init->width;
pv->height_in = init->height;
pv->width_out = init->width - (init->crop[2] + init->crop[3]);
pv->height_out = init->height - (init->crop[0] + init->crop[1]);
+#ifdef USE_OPENCL
+ pv->use_dxva = init->use_dxva;
+ pv->use_decomb = init->job->use_decomb;
+ pv->use_detelecine = init->job->use_detelecine;
+
+ if( pv->job->use_opencl )
+ {
+ pv->os = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) );
+ memset( pv->os, 0, sizeof( hb_oclscale_t ) );
+ }
+#endif
memcpy( pv->crop, init->crop, sizeof( int[4] ) );
if( filter->settings )
{
init->width = pv->width_out;
init->height = pv->height_out;
memcpy( init->crop, pv->crop, sizeof( int[4] ) );
+#ifdef USE_OPENCL
+ pv->use_dxva = init->use_dxva;
+#endif
return 0;
}
{
return;
}
+#ifdef USE_OPENCL
- if ( pv->context )
+ if( pv->job->use_opencl && pv->os )
+ {
+ CL_FREE( pv->os->bicubic_x_weights );
+ CL_FREE( pv->os->bicubic_y_weights );
+ free( pv->os );
+ }
+#endif
+ if( pv->context )
{
sws_freeContext( pv->context );
}
filter->private_data = NULL;
}
+#ifdef USE_OPENCL
+static uint8_t *copy_plane( uint8_t *dst, uint8_t* src, int dstride, int sstride, int h )
+{
+ if( dstride == sstride )
+ {
+ memcpy( dst, src, dstride * h );
+ return dst + dstride * h;
+ }
+ int lbytes = dstride <= sstride ? dstride : sstride;
+ while( --h >= 0 )
+ {
+ memcpy( dst, src, lbytes );
+ src += sstride;
+ dst += dstride;
+ }
+ return dst;
+}
+#endif
+
static hb_buffer_t* crop_scale( hb_filter_private_t * pv, hb_buffer_t * in )
{
AVPicture pic_in;
av_picture_crop( &pic_crop, &pic_in, in->f.fmt,
pv->crop[0], pv->crop[2] );
+#ifdef USE_OPENCL
+ // Use bicubic OpenCL scaling when selected and when downsampling < 4:1;
+ if ((pv->job->use_opencl) && (pv->width_out * 4 > pv->width_in) && (in->cl.buffer != NULL) && (out->cl.buffer != NULL))
+ {
+ hb_ocl_scale(in, out, pv->crop, pv->os);
+ }
+ else
+ {
+#endif
if ( !pv->context ||
pv->width_in != in->f.width ||
pv->height_in != in->f.height ||
pic_crop.linesize,
0, in->f.height - (pv->crop[0] + pv->crop[1]),
pic_out.data, pic_out.linesize);
-
+#ifdef USE_OPENCL
+ }
+#endif
out->s = in->s;
hb_buffer_move_subs( out, in );
return out;
pv->width_out = in->f.width - (pv->crop[2] + pv->crop[3]);
pv->height_out = in->f.height - (pv->crop[0] + pv->crop[1]);
}
+#ifdef USE_OPENCL
+ if ( (in->f.fmt == pv->pix_fmt_out &&
+ !pv->crop[0] && !pv->crop[1] && !pv->crop[2] && !pv->crop[3] &&
+ in->f.width == pv->width_out && in->f.height == pv->height_out) &&
+ (pv->use_decomb == 0) && (pv->use_detelecine == 0) ||
+ (pv->use_dxva && in->f.width == pv->width_out && in->f.height == pv->height_out) )
+ {
+ *buf_out = in;
+ *buf_in = NULL;
+ return HB_FILTER_OK;
+ }
+#else
if ( in->f.fmt == pv->pix_fmt_out &&
!pv->crop[0] && !pv->crop[1] && !pv->crop[2] && !pv->crop[3] &&
in->f.width == pv->width_out && in->f.height == pv->height_out )
*buf_in = NULL;
return HB_FILTER_OK;
}
+#endif
+
*buf_out = crop_scale( pv, in );
return HB_FILTER_OK;
}
double frame_dur = (6. * 256. * 90000.) / pv->rate;
- double pts = (ipts != -1) ? (double)ipts : pv->next_expected_pts;
+ double pts;
+ if (hb_gui_use_hwd_flag == 1 && ipts != -1)
+ pts = ((double)ipts >= pv->next_expected_pts) ? (double)ipts : pv->next_expected_pts;
+ else
+ pts = (ipts != -1) ? (double)ipts : pv->next_expected_pts;
/* AC3 passthrough: don't decode the AC3 frame */
if (audio->config.out.codec == HB_ACODEC_AC3_PASS)
#include "hbffmpeg.h"
#include "audio_resample.h"
+#ifdef USE_HWD
+#include "vadxva2.h"
+#endif
+
#ifdef USE_QSV
#include "enc_qsv.h"
#include "qsv_common.h"
int sws_pix_fmt;
int cadence[12];
int wait_for_keyframe;
-
+#ifdef USE_HWD
+ hb_va_dxva2_t *dxva2;
+ uint8_t *dst_frame;
+ hb_oclscale_t *opencl_scale;
+#endif
hb_audio_resample_t *resample;
#ifdef USE_QSV
hb_list_empty( &pv->list );
}
hb_audio_resample_free(pv->resample);
+
+#ifdef USE_HWD
+ if ( pv->opencl_scale )
+ {
+ free( pv->opencl_scale );
+ }
+
+ if ( pv->dxva2 )
+ {
+#ifdef USE_OPENCL
+ CL_FREE( pv->dxva2->cl_mem_nv12 );
+#endif
+ hb_va_close( pv->dxva2 );
+ }
+#endif
+
#ifdef USE_QSV_PTS_WORKAROUND
if (pv->qsv.decode && pv->qsv.pts_list != NULL)
hb_list_close(&pv->qsv.pts_list);
}
#endif
+
free( pv );
}
*ppv = NULL;
static void decavcodecClose( hb_work_object_t * w )
{
hb_work_private_t * pv = w->private_data;
-
+#ifdef USE_HWD
+ if( pv->dst_frame ) free( pv->dst_frame );
+#endif
if ( pv )
{
closePrivData( &pv );
}
else
{
- w = pv->job->title->width;
- h = pv->job->title->height;
+ w = pv->job->title->width;
+ h = pv->job->title->height;
}
- hb_buffer_t *buf = hb_video_buffer_init( w, h );
+#ifdef USE_HWD
+ if (pv->dxva2 && pv->job)
+ {
+ hb_buffer_t *buf;
+ int ww, hh;
+
+ buf = hb_video_buffer_init( w, h );
+ ww = w;
+ hh = h;
+
+ if( !pv->dst_frame )
+ {
+ pv->dst_frame = malloc( ww * hh * 3 / 2 );
+ }
+ if( hb_va_extract( pv->dxva2, pv->dst_frame, frame, pv->job->width, pv->job->height, pv->job->title->crop, pv->opencl_scale, pv->job->use_opencl, pv->job->use_decomb, pv->job->use_detelecine ) == HB_WORK_ERROR )
+ {
+ hb_log( "hb_va_Extract failed!!!!!!" );
+ }
+ w = buf->plane[0].stride;
+ h = buf->plane[0].height;
+ uint8_t *dst = buf->plane[0].data;
+ copy_plane( dst, pv->dst_frame, w, ww, h );
+ w = buf->plane[1].stride;
+ h = buf->plane[1].height;
+ dst = buf->plane[1].data;
+ copy_plane( dst, pv->dst_frame + ww * hh, w, ww >> 1, h );
+ w = buf->plane[2].stride;
+ h = buf->plane[2].height;
+ dst = buf->plane[2].data;
+ copy_plane( dst, pv->dst_frame + ww * hh +( ( ww * hh ) >> 2 ), w, ww >> 1, h );
+ return buf;
+ }
+ else
+#endif
+ {
+ hb_buffer_t *buf = hb_video_buffer_init( w, h );
+
#ifdef USE_QSV
// no need to copy the frame data when decoding with QSV to opaque memory
if (pv->qsv.decode &&
}
#endif
- uint8_t *dst = buf->data;
+ uint8_t *dst = buf->data;
- if (context->pix_fmt != AV_PIX_FMT_YUV420P || w != context->width ||
- h != context->height)
- {
- // have to convert to our internal color space and/or rescale
- AVPicture dstpic;
- hb_avpicture_fill(&dstpic, buf);
-
- if (pv->sws_context == NULL ||
- pv->sws_width != context->width ||
- pv->sws_height != context->height ||
- pv->sws_pix_fmt != context->pix_fmt)
+ if (context->pix_fmt != AV_PIX_FMT_YUV420P || w != context->width ||
+ h != context->height)
+ {
+ // have to convert to our internal color space and/or rescale
+ AVPicture dstpic;
+ hb_avpicture_fill(&dstpic, buf);
+
+ if (pv->sws_context == NULL ||
+ pv->sws_width != context->width ||
+ pv->sws_height != context->height ||
+ pv->sws_pix_fmt != context->pix_fmt)
+ {
+ if (pv->sws_context != NULL)
+ sws_freeContext(pv->sws_context);
+ pv->sws_context = hb_sws_get_context(context->width,
+ context->height,
+ context->pix_fmt,
+ w, h, AV_PIX_FMT_YUV420P,
+ SWS_LANCZOS|SWS_ACCURATE_RND);
+ pv->sws_width = context->width;
+ pv->sws_height = context->height;
+ pv->sws_pix_fmt = context->pix_fmt;
+ }
+ sws_scale(pv->sws_context,
+ (const uint8_t* const *)frame->data, frame->linesize,
+ 0, context->height, dstpic.data, dstpic.linesize);
+ }
+ else
{
- if (pv->sws_context != NULL)
- sws_freeContext(pv->sws_context);
- pv->sws_context = hb_sws_get_context(context->width,
- context->height,
- context->pix_fmt,
- w, h, AV_PIX_FMT_YUV420P,
- SWS_LANCZOS|SWS_ACCURATE_RND);
- pv->sws_width = context->width;
- pv->sws_height = context->height;
- pv->sws_pix_fmt = context->pix_fmt;
+ w = buf->plane[0].stride;
+ h = buf->plane[0].height;
+ dst = buf->plane[0].data;
+ copy_plane( dst, frame->data[0], w, frame->linesize[0], h );
+ w = buf->plane[1].stride;
+ h = buf->plane[1].height;
+ dst = buf->plane[1].data;
+ copy_plane( dst, frame->data[1], w, frame->linesize[1], h );
+ w = buf->plane[2].stride;
+ h = buf->plane[2].height;
+ dst = buf->plane[2].data;
+ copy_plane( dst, frame->data[2], w, frame->linesize[2], h );
}
- sws_scale(pv->sws_context,
- (const uint8_t* const *)frame->data, frame->linesize,
- 0, context->height, dstpic.data, dstpic.linesize);
+ return buf;
+ }
+}
+
+#ifdef USE_HWD
+
+static int get_frame_buf_hwd( AVCodecContext *context, AVFrame *frame )
+{
+
+ hb_work_private_t *pv = (hb_work_private_t*)context->opaque;
+ if ( (pv != NULL) && pv->dxva2 )
+ {
+ int result = HB_WORK_ERROR;
+ hb_work_private_t *pv = (hb_work_private_t*)context->opaque;
+ result = hb_va_get_frame_buf( pv->dxva2, context, frame );
+ if( result == HB_WORK_ERROR )
+ return avcodec_default_get_buffer( context, frame );
+ return 0;
}
else
+ return avcodec_default_get_buffer( context, frame );
+}
+
+static void hb_ffmpeg_release_frame_buf( struct AVCodecContext *p_context, AVFrame *frame )
+{
+ hb_work_private_t *p_dec = (hb_work_private_t*)p_context->opaque;
+ int i;
+ if( p_dec->dxva2 )
{
- w = buf->plane[0].stride;
- h = buf->plane[0].height;
- dst = buf->plane[0].data;
- copy_plane( dst, frame->data[0], w, frame->linesize[0], h );
- w = buf->plane[1].stride;
- h = buf->plane[1].height;
- dst = buf->plane[1].data;
- copy_plane( dst, frame->data[1], w, frame->linesize[1], h );
- w = buf->plane[2].stride;
- h = buf->plane[2].height;
- dst = buf->plane[2].data;
- copy_plane( dst, frame->data[2], w, frame->linesize[2], h );
+ hb_va_release( p_dec->dxva2, frame );
+ }
+ else if( !frame->opaque )
+ {
+ if( frame->type == FF_BUFFER_TYPE_INTERNAL )
+ avcodec_default_release_buffer( p_context, frame );
}
- return buf;
+ for( i = 0; i < 4; i++ )
+ frame->data[i] = NULL;
}
+#endif
static void log_chapter( hb_work_private_t *pv, int chap_num, int64_t pts )
{
if ( !pv->frame_duration_set )
compute_frame_duration( pv );
+ double pts;
double frame_dur = pv->duration;
if ( frame.repeat_pict )
{
frame_dur += frame.repeat_pict * pv->field_duration;
}
-
+#ifdef USE_HWD
+ if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK )
+ {
+ if( avp.pts>0 )
+ {
+ if( pv->dxva2->input_pts[0] != 0 && pv->dxva2->input_pts[1] == 0 )
+ frame.pkt_pts = pv->dxva2->input_pts[0];
+ else
+ frame.pkt_pts = pv->dxva2->input_pts[0]<pv->dxva2->input_pts[1] ? pv->dxva2->input_pts[0] : pv->dxva2->input_pts[1];
+ }
+ }
+#endif
// If there was no pts for this frame, assume constant frame rate
// video & estimate the next frame time from the last & duration.
- double pts;
- if (frame.pkt_pts == AV_NOPTS_VALUE)
+ if (frame.pkt_pts == AV_NOPTS_VALUE || hb_gui_use_hwd_flag == 1)
{
pts = pv->pts_next;
}
{
pts = frame.pkt_pts;
}
+
pv->pts_next = pts + frame_dur;
if ( frame.top_field_first )
pv->context->workaround_bugs = FF_BUG_AUTODETECT;
pv->context->err_recognition = AV_EF_CRCCHECK;
pv->context->error_concealment = FF_EC_GUESS_MVS|FF_EC_DEBLOCK;
+#ifdef USE_HWD
+ if ( pv->job && job->use_hwd && hb_use_dxva( pv->title ) )
+ {
+ pv->dxva2 = hb_va_create_dxva2( pv->dxva2, w->codec_param );
+ if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK )
+ {
+ hb_va_new_dxva2( pv->dxva2, pv->context );
+ pv->context->slice_flags |= SLICE_FLAG_ALLOW_FIELD;
+ pv->context->opaque = pv;
+ pv->context->get_buffer = get_frame_buf_hwd;
+ pv->context->release_buffer = hb_ffmpeg_release_frame_buf;
+ pv->context->get_format = hb_ffmpeg_get_format;
+ pv->opencl_scale = ( hb_oclscale_t * )malloc( sizeof( hb_oclscale_t ) );
+ memset( pv->opencl_scale, 0, sizeof( hb_oclscale_t ) );
+ pv->threads = 1;
+ }
+ }
+#endif
+
#ifdef USE_QSV
if (pv->qsv.decode)
pv->new_chap = in->s.new_chap;
pv->chap_time = pts >= 0? pts : pv->pts_next;
}
+#ifdef USE_HWD
+ if( pv->dxva2 && pv->dxva2->do_job == HB_WORK_OK )
+ {
+ if( pv->dxva2->input_pts[0] <= pv->dxva2->input_pts[1] )
+ pv->dxva2->input_pts[0] = pts;
+ else if( pv->dxva2->input_pts[0] > pv->dxva2->input_pts[1] )
+ pv->dxva2->input_pts[1] = pts;
+ pv->dxva2->input_dts = dts;
+ }
+#endif
decodeVideo( w, in->data, in->size, in->sequence, pts, dts, in->s.frametype );
hb_buffer_close( &in );
*buf_out = link_buf_list( pv );
.info = decavcodecvInfo,
.bsinfo = decavcodecvBSInfo
};
-
static void decodeAudio(hb_audio_t *audio, hb_work_private_t *pv, uint8_t *data,
int size, int64_t pts)
{
int i;
struct pullup_field * f0 = f;
const char aff_l[] = "+..", aff_r[] = "..+";
- printf( "\naffinity: " );
+ hb_log( "affinity: " );
for( i = 0; i < 4; i++ )
{
printf( "%c%d%c",
--- /dev/null
+/* dxva2api.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
+
+ */
+#ifdef USE_HWD
+#include "dxva2api.h"
+
+__inline float hb_dx_fixedtofloat( const DXVA2_Fixed32 _fixed_ )
+{
+ return (FLOAT)_fixed_.Value + (FLOAT)_fixed_.Fraction / 0x10000;
+}
+
+__inline const DXVA2_Fixed32 hb_dx_fixed32_opaque_alpha()
+{
+ DXVA2_Fixed32 _fixed_;
+ _fixed_.Fraction = 0;
+ _fixed_.Value = 0;
+ _fixed_.ll = 1;
+ return _fixed_;
+}
+
+
+__inline DXVA2_Fixed32 hb_dx_floattofixed( const float _float_ )
+{
+ DXVA2_Fixed32 _fixed_;
+ _fixed_.Fraction = LOWORD( _float_ * 0x10000 );
+ _fixed_.Value = HIWORD( _float_ * 0x10000 );
+ return _fixed_;
+}
+#endif
--- /dev/null
+/* dxva2api.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
+
+ */
+
+
+#ifndef _DXVA2API_H
+#define _DXVA2API_H
+#ifdef USE_HWD
+#define MINGW_DXVA2API_H_VERSION (2)
+
+#if __GNUC__ >= 3
+#pragma GCC system_header
+#endif
+
+#include <objbase.h>
+#include <d3d9.h>
+
+/* Define it to allow using nameless struct/union (non C99 compliant) to match
+ * the official documentation. */
+//#define DXVA2API_USE_BITFIELDS
+
+/****************STRUCTURES******************/
+#pragma pack(push, 1)
+
+#define DXVA2API_USE_BITFIELDS
+
+typedef struct _DXVA2_ExtendedFormat {
+#ifdef DXVA2API_USE_BITFIELDS
+ union {
+ struct {
+ UINT SampleFormat : 8;
+ UINT VideoChromaSubsampling : 4;
+ UINT NominalRange : 3;
+ UINT VideoTransferMatrix : 3;
+ UINT VideoLighting : 4;
+ UINT VideoPrimaries : 5;
+ UINT VideoTransferFunction : 5;
+ };
+ UINT value;
+ };
+#else
+ UINT value;
+#endif
+} DXVA2_ExtendedFormat;
+
+typedef struct _DXVA2_Frequency {
+ UINT Numerator;
+ UINT Denominator;
+} DXVA2_Frequency;
+
+typedef struct _DXVA2_VideoDesc {
+ UINT SampleWidth;
+ UINT SampleHeight;
+ DXVA2_ExtendedFormat SampleFormat;
+ D3DFORMAT Format;
+ DXVA2_Frequency InputSampleFreq;
+ DXVA2_Frequency OutputFrameFreq;
+ UINT UABProtectionLevel;
+ UINT Reserved;
+} DXVA2_VideoDesc;
+
+typedef struct _DXVA2_ConfigPictureDecode {
+ GUID guidConfigBitstreamEncryption;
+ GUID guidConfigMBcontrolEncryption;
+ GUID guidConfigResidDiffEncryption;
+ UINT ConfigBitstreamRaw;
+ UINT ConfigMBcontrolRasterOrder;
+ UINT ConfigResidDiffHost;
+ UINT ConfigSpatialResid8;
+ UINT ConfigResid8Subtraction;
+ UINT ConfigSpatialHost8or9Clipping;
+ UINT ConfigSpatialResidInterleaved;
+ UINT ConfigIntraResidUnsigned;
+ UINT ConfigResidDiffAccelerator;
+ UINT ConfigHostInverseScan;
+ UINT ConfigSpecificIDCT;
+ UINT Config4GroupedCoefs;
+ USHORT ConfigMinRenderTargetBuffCount;
+ USHORT ConfigDecoderSpecific;
+} DXVA2_ConfigPictureDecode;
+
+typedef struct _DXVA2_DecodeBufferDesc {
+ DWORD CompressedBufferType;
+ UINT BufferIndex;
+ UINT DataOffset;
+ UINT DataSize;
+ UINT FirstMBaddress;
+ UINT NumMBsInBuffer;
+ UINT Width;
+ UINT Height;
+ UINT Stride;
+ UINT ReservedBits;
+ PVOID pvPVPState;
+} DXVA2_DecodeBufferDesc;
+
+typedef struct _DXVA2_DecodeExtensionData {
+ UINT Function;
+ PVOID pPrivateInputData;
+ UINT PrivateInputDataSize;
+ PVOID pPrivateOutputData;
+ UINT PrivateOutputDataSize;
+} DXVA2_DecodeExtensionData;
+
+typedef struct _DXVA2_DecodeExecuteParams {
+ UINT NumCompBuffers;
+ DXVA2_DecodeBufferDesc *pCompressedBuffers;
+ DXVA2_DecodeExtensionData *pExtensionData;
+} DXVA2_DecodeExecuteParams;
+
+enum {
+ DXVA2_VideoDecoderRenderTarget = 0,
+ DXVA2_VideoProcessorRenderTarget= 1,
+ DXVA2_VideoSoftwareRenderTarget = 2
+};
+
+enum {
+ DXVA2_PictureParametersBufferType = 0,
+ DXVA2_MacroBlockControlBufferType = 1,
+ DXVA2_ResidualDifferenceBufferType = 2,
+ DXVA2_DeblockingControlBufferType = 3,
+ DXVA2_InverseQuantizationMatrixBufferType = 4,
+ DXVA2_SliceControlBufferType = 5,
+ DXVA2_BitStreamDateBufferType = 6,
+ DXVA2_MotionVectorBuffer = 7,
+ DXVA2_FilmGrainBuffer = 8
+};
+
+/* DXVA MPEG-I/II and VC-1 */
+typedef struct _DXVA_PictureParameters {
+ USHORT wDecodedPictureIndex;
+ USHORT wDeblockedPictureIndex;
+ USHORT wForwardRefPictureIndex;
+ USHORT wBackwardRefPictureIndex;
+ USHORT wPicWidthInMBminus1;
+ USHORT wPicHeightInMBminus1;
+ UCHAR bMacroblockWidthMinus1;
+ UCHAR bMacroblockHeightMinus1;
+ UCHAR bBlockWidthMinus1;
+ UCHAR bBlockHeightMinus1;
+ UCHAR bBPPminus1;
+ UCHAR bPicStructure;
+ UCHAR bSecondField;
+ UCHAR bPicIntra;
+ UCHAR bPicBackwardPrediction;
+ UCHAR bBidirectionalAveragingMode;
+ UCHAR bMVprecisionAndChromaRelation;
+ UCHAR bChromaFormat;
+ UCHAR bPicScanFixed;
+ UCHAR bPicScanMethod;
+ UCHAR bPicReadbackRequests;
+ UCHAR bRcontrol;
+ UCHAR bPicSpatialResid8;
+ UCHAR bPicOverflowBlocks;
+ UCHAR bPicExtrapolation;
+ UCHAR bPicDeblocked;
+ UCHAR bPicDeblockConfined;
+ UCHAR bPic4MVallowed;
+ UCHAR bPicOBMC;
+ UCHAR bPicBinPB;
+ UCHAR bMV_RPS;
+ UCHAR bReservedBits;
+ USHORT wBitstreamFcodes;
+ USHORT wBitstreamPCEelements;
+ UCHAR bBitstreamConcealmentNeed;
+ UCHAR bBitstreamConcealmentMethod;
+} DXVA_PictureParameters, *LPDXVA_PictureParameters;
+
+typedef struct _DXVA_QmatrixData {
+ BYTE bNewQmatrix[4];
+ WORD Qmatrix[4][8 * 8];
+} DXVA_QmatrixData, *LPDXVA_QmatrixData;
+
+typedef struct _DXVA_SliceInfo {
+ USHORT wHorizontalPosition;
+ USHORT wVerticalPosition;
+ UINT dwSliceBitsInBuffer;
+ UINT dwSliceDataLocation;
+ UCHAR bStartCodeBitOffset;
+ UCHAR bReservedBits;
+ USHORT wMBbitOffset;
+ USHORT wNumberMBsInSlice;
+ USHORT wQuantizerScaleCode;
+ USHORT wBadSliceChopping;
+} DXVA_SliceInfo, *LPDXVA_SliceInfo;
+
+/* DXVA H264 */
+typedef struct {
+#ifdef DXVA2API_USE_BITFIELDS
+ union {
+ struct {
+ UCHAR Index7Bits : 7;
+ UCHAR AssociatedFlag : 1;
+ };
+ UCHAR bPicEntry;
+ };
+#else
+ UCHAR bPicEntry;
+#endif
+} DXVA_PicEntry_H264;
+
+
+typedef struct {
+ USHORT wFrameWidthInMbsMinus1;
+ USHORT wFrameHeightInMbsMinus1;
+ DXVA_PicEntry_H264 CurrPic;
+ UCHAR num_ref_frames;
+#ifdef DXVA2API_USE_BITFIELDS
+ union {
+ struct {
+ USHORT field_pic_flag : 1;
+ USHORT MbaffFrameFlag : 1;
+ USHORT residual_colour_transform_flag : 1;
+ USHORT sp_for_switch_flag : 1;
+ USHORT chroma_format_idc : 2;
+ USHORT RefPicFlag : 1;
+ USHORT constrained_intra_pred_flag : 1;
+ USHORT weighted_pred_flag : 1;
+ USHORT weighted_bipred_idc : 2;
+ USHORT MbsConsecutiveFlag : 1;
+ USHORT frame_mbs_only_flag : 1;
+ USHORT transform_8x8_mode_flag : 1;
+ USHORT MinLumaBipredSize8x8Flag : 1;
+ USHORT IntraPicFlag : 1;
+ };
+ USHORT wBitFields;
+ };
+#else
+ USHORT wBitFields;
+#endif
+ UCHAR bit_depth_luma_minus8;
+ UCHAR bit_depth_chroma_minus8;
+ USHORT Reserved16Bits;
+ UINT StatusReportFeedbackNumber;
+ DXVA_PicEntry_H264 RefFrameList[16];
+ INT CurrFieldOrderCnt[2];
+ INT FieldOrderCntList[16][2];
+ CHAR pic_init_qs_minus26;
+ CHAR chroma_qp_index_offset;
+ CHAR second_chroma_qp_index_offset;
+ UCHAR ContinuationFlag;
+ CHAR pic_init_qp_minus26;
+ UCHAR num_ref_idx_l0_active_minus1;
+ UCHAR num_ref_idx_l1_active_minus1;
+ UCHAR Reserved8BitsA;
+ USHORT FrameNumList[16];
+
+ UINT UsedForReferenceFlags;
+ USHORT NonExistingFrameFlags;
+ USHORT frame_num;
+ UCHAR log2_max_frame_num_minus4;
+ UCHAR pic_order_cnt_type;
+ UCHAR log2_max_pic_order_cnt_lsb_minus4;
+ UCHAR delta_pic_order_always_zero_flag;
+ UCHAR direct_8x8_inference_flag;
+ UCHAR entropy_coding_mode_flag;
+ UCHAR pic_order_present_flag;
+ UCHAR num_slice_groups_minus1;
+ UCHAR slice_group_map_type;
+ UCHAR deblocking_filter_control_present_flag;
+ UCHAR redundant_pic_cnt_present_flag;
+ UCHAR Reserved8BitsB;
+ USHORT slice_group_change_rate_minus1;
+ UCHAR SliceGroupMap[810];
+} DXVA_PicParams_H264;
+
+typedef struct {
+ UCHAR bScalingLists4x4[6][16];
+ UCHAR bScalingLists8x8[2][64];
+} DXVA_Qmatrix_H264;
+
+
+typedef struct {
+ UINT BSNALunitDataLocation;
+ UINT SliceBytesInBuffer;
+ USHORT wBadSliceChopping;
+ USHORT first_mb_in_slice;
+ USHORT NumMbsForSlice;
+ USHORT BitOffsetToSliceData;
+ UCHAR slice_type;
+ UCHAR luma_log2_weight_denom;
+ UCHAR chroma_log2_weight_denom;
+
+ UCHAR num_ref_idx_l0_active_minus1;
+ UCHAR num_ref_idx_l1_active_minus1;
+ CHAR slice_alpha_c0_offset_div2;
+ CHAR slice_beta_offset_div2;
+ UCHAR Reserved8Bits;
+ DXVA_PicEntry_H264 RefPicList[2][32];
+ SHORT Weights[2][32][3][2];
+ CHAR slice_qs_delta;
+ CHAR slice_qp_delta;
+ UCHAR redundant_pic_cnt;
+ UCHAR direct_spatial_mv_pred_flag;
+ UCHAR cabac_init_idc;
+ UCHAR disable_deblocking_filter_idc;
+ USHORT slice_id;
+} DXVA_Slice_H264_Long;
+
+typedef struct {
+ UINT BSNALunitDataLocation;
+ UINT SliceBytesInBuffer;
+ USHORT wBadSliceChopping;
+} DXVA_Slice_H264_Short;
+
+typedef struct {
+ USHORT wFrameWidthInMbsMinus1;
+ USHORT wFrameHeightInMbsMinus1;
+ DXVA_PicEntry_H264 InPic;
+ DXVA_PicEntry_H264 OutPic;
+ USHORT PicOrderCnt_offset;
+ INT CurrPicOrderCnt;
+ UINT StatusReportFeedbackNumber;
+ UCHAR model_id;
+ UCHAR separate_colour_description_present_flag;
+ UCHAR film_grain_bit_depth_luma_minus8;
+ UCHAR film_grain_bit_depth_chroma_minus8;
+ UCHAR film_grain_full_range_flag;
+ UCHAR film_grain_colour_primaries;
+ UCHAR film_grain_transfer_characteristics;
+ UCHAR film_grain_matrix_coefficients;
+ UCHAR blending_mode_id;
+ UCHAR log2_scale_factor;
+ UCHAR comp_model_present_flag[4];
+ UCHAR num_intensity_intervals_minus1[4];
+ UCHAR num_model_values_minus1[4];
+ UCHAR intensity_interval_lower_bound[3][16];
+ UCHAR intensity_interval_upper_bound[3][16];
+ SHORT comp_model_value[3][16][8];
+} DXVA_FilmGrainChar_H264;
+
+typedef struct {
+ union {
+ struct {
+ USHORT Fraction;
+ SHORT Value;
+ };
+ LONG ll;
+ };
+}DXVA2_Fixed32;
+
+typedef struct {
+ UCHAR Cr;
+ UCHAR Cb;
+ UCHAR Y;
+ UCHAR Alpha;
+}DXVA2_AYUVSample8;
+
+typedef struct {
+ USHORT Cr;
+ USHORT Cb;
+ USHORT Y;
+ USHORT Alpha;
+}DXVA2_AYUVSample16;
+
+typedef struct {
+ DXVA2_Fixed32 MinValue;
+ DXVA2_Fixed32 MaxValue;
+ DXVA2_Fixed32 DefaultValue;
+ DXVA2_Fixed32 StepSize;
+}DXVA2_ValueRange;
+
+typedef struct {
+ DXVA2_Fixed32 Brightness;
+ DXVA2_Fixed32 Contrast;
+ DXVA2_Fixed32 Hue;
+ DXVA2_Fixed32 Saturation;
+}DXVA2_ProcAmpValues;
+
+typedef struct {
+ DXVA2_Fixed32 Level;
+ DXVA2_Fixed32 Threshold;
+ DXVA2_Fixed32 Radius;
+}DXVA2_FilterValues;
+
+typedef struct {
+ UINT DeviceCaps;
+ D3DPOOL InputPool;
+ UINT NumForwardRefSamples;
+ UINT NumBackwardRefSamples;
+ UINT Reserved;
+ UINT DeinterlaceTechnology;
+ UINT ProcAmpControlCaps;
+ UINT VideoProcessorOperations;
+ UINT NoiseFilterTechnology;
+ UINT DetailFilterTechnology;
+}DXVA2_VideoProcessorCaps;
+
+#ifndef _REFERENCE_TIME_
+#define _REFERENCE_TIME_
+typedef long long int64_t;
+typedef int64_t REFERENCE_TIME;
+#endif
+
+typedef struct {
+ REFERENCE_TIME Start;
+ REFERENCE_TIME End;
+ DXVA2_ExtendedFormat SampleFormat;
+ IDirect3DSurface9 *SrcSurface;
+ RECT SrcRect;
+ RECT DstRect;
+ DXVA2_AYUVSample8 Pal[16];
+ DXVA2_Fixed32 PlanarAlpha;
+ DWORD SampleData;
+}DXVA2_VideoSample;
+
+
+typedef struct {
+ REFERENCE_TIME TargetFrame;
+ RECT TargetRect;
+ SIZE ConstrictionSize;
+ UINT StreamingFlags;
+ DXVA2_AYUVSample16 BackgroundColor;
+ DXVA2_ExtendedFormat DestFormat;
+ DXVA2_ProcAmpValues ProcAmpValues;
+ DXVA2_Fixed32 Alpha;
+ DXVA2_FilterValues NoiseFilterLuma;
+ DXVA2_FilterValues NoiseFilterChroma;
+ DXVA2_FilterValues DetailFilterLuma;
+ DXVA2_FilterValues DetailFilterChroma;
+ DWORD DestData;
+} DXVA2_VideoProcessBltParams;
+
+#pragma pack(pop)
+
+/*************INTERFACES************/
+#ifdef __cplusplus
+extern "C" {
+#endif
+#define _COM_interface struct
+typedef _COM_interface IDirectXVideoDecoderService IDirectXVideoDecoderService;
+typedef _COM_interface IDirectXVideoDecoder IDirectXVideoDecoder;
+
+#undef INTERFACE
+#define INTERFACE IDirectXVideoDecoder
+DECLARE_INTERFACE_( IDirectXVideoDecoder, IUnknown )
+{
+ STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE;
+ STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE;
+ STDMETHOD_( ULONG, Release ) ( THIS ) PURE;
+ STDMETHOD( GetVideoDecoderService ) ( THIS_ IDirectXVideoDecoderService** ) PURE;
+ STDMETHOD( GetCreationParameters ) ( THIS_ GUID*, DXVA2_VideoDesc*, DXVA2_ConfigPictureDecode*, IDirect3DSurface9***, UINT* ) PURE;
+ STDMETHOD( GetBuffer ) ( THIS_ UINT, void**, UINT* ) PURE;
+ STDMETHOD( ReleaseBuffer ) ( THIS_ UINT ) PURE;
+ STDMETHOD( BeginFrame ) ( THIS_ IDirect3DSurface9 *, void* ) PURE;
+ STDMETHOD( EndFrame ) ( THIS_ HANDLE * ) PURE;
+ STDMETHOD( Execute ) ( THIS_ const DXVA2_DecodeExecuteParams* ) PURE;
+
+
+};
+
+#if !defined(__cplusplus) || defined(CINTERFACE)
+#define IDirectXVideoDecoder_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b )
+#define IDirectXVideoDecoder_AddRef( p ) (p)->lpVtbl->AddRef( p )
+#define IDirectXVideoDecoder_Release( p ) (p)->lpVtbl->Release( p )
+#define IDirectXVideoDecoder_BeginFrame( p, a, b ) (p)->lpVtbl->BeginFrame( p, a, b )
+#define IDirectXVideoDecoder_EndFrame( p, a ) (p)->lpVtbl->EndFrame( p, a )
+#define IDirectXVideoDecoder_Execute( p, a ) (p)->lpVtbl->Execute( p, a )
+#define IDirectXVideoDecoder_GetBuffer( p, a, b, c ) (p)->lpVtbl->GetBuffer( p, a, b, c )
+#define IDirectXVideoDecoder_GetCreationParameters( p, a, b, c, d, e ) (p)->lpVtbl->GetCreationParameters( p, a, b, c, d, e )
+#define IDirectXVideoDecoder_GetVideoDecoderService( p, a ) (p)->lpVtbl->GetVideoDecoderService( p, a )
+#define IDirectXVideoDecoder_ReleaseBuffer( p, a ) (p)->lpVtbl->ReleaseBuffer( p, a )
+#else
+#define IDirectXVideoDecoder_QueryInterface( p, a, b ) (p)->QueryInterface( a, b )
+#define IDirectXVideoDecoder_AddRef( p ) (p)->AddRef()
+#define IDirectXVideoDecoder_Release( p ) (p)->Release()
+#define IDirectXVideoDecoder_BeginFrame( p, a, b ) (p)->BeginFrame( a, b )
+#define IDirectXVideoDecoder_EndFrame( p, a ) (p)->EndFrame( a )
+#define IDirectXVideoDecoder_Execute( p, a ) (p)->Execute( a )
+#define IDirectXVideoDecoder_GetBuffer( p, a, b, c ) (p)->GetBuffer( a, b, c )
+#define IDirectXVideoDecoder_GetCreationParameters( p, a, b, c, d, e ) (p)->GetCreationParameters( a, b, c, d, e )
+#define IDirectXVideoDecoder_GetVideoDecoderService( p, a ) (p)->GetVideoDecoderService( a )
+#define IDirectXVideoDecoder_ReleaseBuffer( p, a ) (p)->ReleaseBuffer( a )
+#endif
+
+#undef INTERFACE
+#define INTERFACE IDirectXVideoAccelerationService
+DECLARE_INTERFACE_( IDirectXVideoAccelerationService, IUnknown )
+{
+ STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE;
+ STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE;
+ STDMETHOD_( ULONG, Release ) ( THIS ) PURE;
+ STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE;
+
+};
+
+#if !defined(__cplusplus) || defined(CINTERFACE)
+#define IDirectXVideoAccelerationService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b )
+#define IDirectXVideoAccelerationService_AddRef( p ) (p)->lpVtbl->AddRef( p )
+#define IDirectXVideoAccelerationService_Release( p ) (p)->lpVtbl->Release( p )
+#define IDirectXVideoAccelerationService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i )
+#else
+#define IDirectXVideoAccelerationService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b )
+#define IDirectXVideoAccelerationService_AddRef( p ) (p)->AddRef()
+#define IDirectXVideoAccelerationService_Release( p ) (p)->Release()
+#define IDirectXVideoAccelerationService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i )
+#endif
+
+#undef INTERFACE
+#define INTERFACE IDirectXVideoDecoderService
+DECLARE_INTERFACE_( IDirectXVideoDecoderService, IDirectXVideoAccelerationService )
+{
+ STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE;
+ STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE;
+ STDMETHOD_( ULONG, Release ) ( THIS ) PURE;
+ STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE;
+ STDMETHOD( GetDecoderDeviceGuids ) ( THIS_ UINT*, GUID ** ) PURE;
+ STDMETHOD( GetDecoderRenderTargets ) ( THIS_ REFGUID, UINT*, D3DFORMAT** ) PURE;
+ STDMETHOD( GetDecoderConfigurations ) ( THIS_ REFGUID, const DXVA2_VideoDesc*, IUnknown*, UINT*, DXVA2_ConfigPictureDecode** ) PURE;
+ STDMETHOD( CreateVideoDecoder ) ( THIS_ REFGUID, const DXVA2_VideoDesc*, DXVA2_ConfigPictureDecode*, IDirect3DSurface9**, UINT, IDirectXVideoDecoder** ) PURE;
+};
+
+#if !defined(__cplusplus) || defined(CINTERFACE)
+#define IDirectXVideoDecoderService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b )
+#define IDirectXVideoDecoderService_AddRef( p ) (p)->lpVtbl->AddRef( p )
+#define IDirectXVideoDecoderService_Release( p ) (p)->lpVtbl->Release( p )
+#define IDirectXVideoDecoderService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i )
+#define IDirectXVideoDecoderService_CreateVideoDecoder( p, a, b, c, d, e, f ) (p)->lpVtbl->CreateVideoDecoder( p, a, b, c, d, e, f )
+#define IDirectXVideoDecoderService_GetDecoderConfigurations( p, a, b, c, d, e ) (p)->lpVtbl->GetDecoderConfigurations( p, a, b, c, d, e )
+#define IDirectXVideoDecoderService_GetDecoderDeviceGuids( p, a, b ) (p)->lpVtbl->GetDecoderDeviceGuids( p, a, b )
+#define IDirectXVideoDecoderService_GetDecoderRenderTargets( p, a, b, c ) (p)->lpVtbl->GetDecoderRenderTargets( p, a, b, c )
+#else
+#define IDirectXVideoDecoderService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b )
+#define IDirectXVideoDecoderService_AddRef( p ) (p)->AddRef()
+#define IDirectXVideoDecoderService_Release( p ) (p)->Release()
+#define IDirectXVideoDecoderService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i )
+#define IDirectXVideoDecoderService_CreateVideoDecoder( p, a, b, c, d, e, f ) (p)->CreateVideoDecoder( a, b, c, d, e, f )
+#define IDirectXVideoDecoderService_GetDecoderConfigurations( p, a, b, c, d, e ) (p)->GetDecoderConfigurations( a, b, c, d, e )
+#define IDirectXVideoDecoderService_GetDecoderDeviceGuids( p, a, b ) (p)->GetDecoderDeviceGuids( a, b )
+#define IDirectXVideoDecoderService_GetDecoderRenderTargets( p, a, b, c ) (p)->GetDecoderRenderTargets( a, b, c )
+#endif
+
+#undef INTERFACE
+#define INTERFACE IDirect3DDeviceManager9
+DECLARE_INTERFACE_( IDirect3DDeviceManager9, IUnknown )
+{
+ STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE;
+ STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE;
+ STDMETHOD_( ULONG, Release ) ( THIS ) PURE;
+ STDMETHOD( ResetDevice ) ( THIS_ IDirect3DDevice9*, UINT ) PURE;
+ STDMETHOD( OpenDeviceHandle ) ( THIS_ HANDLE* ) PURE;
+ STDMETHOD( CloseDeviceHandle ) ( THIS_ HANDLE ) PURE;
+ STDMETHOD( TestDevice ) ( THIS_ HANDLE ) PURE;
+ STDMETHOD( LockDevice ) ( THIS_ HANDLE, IDirect3DDevice9**, BOOL ) PURE;
+ STDMETHOD( UnlockDevice ) ( THIS_ HANDLE, BOOL ) PURE;
+ STDMETHOD( GetVideoService ) ( THIS_ HANDLE, REFIID, void** ) PURE;
+};
+
+#if !defined(__cplusplus) || defined(CINTERFACE)
+#define IDirect3DDeviceManager9_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b )
+#define IDirect3DDeviceManager9_AddRef( p ) (p)->lpVtbl->AddRef( p )
+#define IDirect3DDeviceManager9_Release( p ) (p)->lpVtbl->Release( p )
+#define IDirect3DDeviceManager9_ResetDevice( p, a, b ) (p)->lpVtbl->ResetDevice( p, a, b )
+#define IDirect3DDeviceManager9_OpenDeviceHandle( p, a ) (p)->lpVtbl->OpenDeviceHandle( p, a )
+#define IDirect3DDeviceManager9_CloseDeviceHandle( p, a ) (p)->lpVtbl->CloseDeviceHandle( p, a )
+#define IDirect3DDeviceManager9_TestDevice( p, a ) (p)->lpVtbl->TestDevice( p, a )
+#define IDirect3DDeviceManager9_LockDevice( p, a, b, c ) (p)->lpVtbl->LockDevice( p, a, b, c )
+#define IDirect3DDeviceManager9_UnlockDevice( p, a, b ) (p)->lpVtbl->UnlockDevice( p, a, b )
+#define IDirect3DDeviceManager9_GetVideoService( p, a, b, c ) (p)->lpVtbl->GetVideoService( p, a, b, c )
+#else
+#define IDirect3DDeviceManager9_QueryInterface( p, a, b ) (p)->QueryInterface( a, b )
+#define IDirect3DDeviceManager9_AddRef( p ) (p)->AddRef()
+#define IDirect3DDeviceManager9_Release( p ) (p)->Release()
+#define IDirect3DDeviceManager9_ResetDevice( p, a, b ) (p)->ResetDevice( a, b )
+#define IDirect3DDeviceManager9_OpenDeviceHandle( p, a ) (p)->OpenDeviceHandle( a )
+#define IDirect3DDeviceManager9_CloseDeviceHandle( p, a ) (p)->CloseDeviceHandle( a )
+#define IDirect3DDeviceManager9_TestDevice( p, a ) (p)->TestDevice( a )
+#define IDirect3DDeviceManager9_LockDevice( p, a, b, c ) (p)->LockDevice( a, b, c )
+#define IDirect3DDeviceManager9_UnlockDevice( p, a, b ) (p)->UnlockDevice( a, b )
+#define IDirect3DDeviceManager9_GetVideoService( p, a, b, c ) (p)->GetVideoService( a, b, c )
+#endif
+
+typedef _COM_interface IDirectXVideoProcessorService IDirectXVideoProcessorService;
+typedef _COM_interface IDirectXVideoProcessor IDirectXVideoProcessor;
+
+#undef INTERFACE
+#define INTERFACE IDirectXVideoProcessor
+DECLARE_INTERFACE_( IDirectXVideoProcessor, IUnknown )
+{
+ STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE;
+ STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE;
+ STDMETHOD_( ULONG, Release ) ( THIS ) PURE;
+ STDMETHOD( GetVideoProcessorService ) ( THIS_ IDirectXVideoProcessorService** ) PURE;
+ STDMETHOD( GetCreationParameters ) ( THIS_ GUID*, DXVA2_VideoDesc*, D3DFORMAT*, UINT* ) PURE;
+ STDMETHOD( GetVideoProcessorCaps ) ( THIS_ DXVA2_VideoProcessorCaps* ) PURE;
+ STDMETHOD( GetProcAmpRange ) ( THIS_ UINT, DXVA2_ValueRange* ) PURE;
+ STDMETHOD( GetFilterPropertyRange ) ( THIS_ UINT, DXVA2_ValueRange* ) PURE;
+ STDMETHOD( VideoProcessBlt ) ( THIS_ IDirect3DSurface9*, DXVA2_VideoProcessBltParams*, DXVA2_VideoSample*, UINT, HANDLE* ) PURE;
+};
+
+#if !defined(__cplusplus) || defined(CINTERFACE)
+#define IDirectXVideoProcessor_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b )
+#define IDirectXVideoProcessor_AddRef( p ) (p)->lpVtbl->AddRef( p )
+#define IDirectXVideoProcessor_Release( p ) (p)->lpVtbl->Release( p )
+#define IDirectXVideoProcessor_GetVideoProcessorService( p, a ) (p)->lpVtbl->GetVideoProcessorService( p, a )
+#define IDirectXVideoProcessor_GetCreationParameters( p, a, b, c, d ) (p)->lpVtbl->GetCreationParameters( p, a, b, c, d )
+#define IDirectXVideoProcessor_GetVideoProcessorCaps( p, a ) (p)->lpVtbl->GetVideoProcessorCaps( p, a )
+#define IDirectXVideoProcessor_GetProcAmpRange( p, a, b ) (p)->lpVtbl->GetProcAmpRange( p, a, b )
+#define IDirectXVideoProcessor_GetFilterPropertyRange( p, a, b ) (p)->lpVtbl->GetFilterPropertyRange( p, a, b )
+#define IDirectXVideoProcessor_VideoProcessBlt( p, a, b, c, d, e ) (p)->lpVtbl->VideoProcessBlt( p, a, b, c, d, e )
+#else
+#define IDirectXVideoProcessor_QueryInterface( p, a, b ) (p)->QueryInterface( a, b )
+#define IDirectXVideoProcessor_AddRef( p ) (p)->AddRef()
+#define IDirectXVideoProcessor_Release( p ) (p)->Release()
+#define IDirectXVideoProcessor_GetVideoProcessorService( p, a ) (p)->GetVideoProcessorService( a )
+#define IDirectXVideoProcessor_GetCreationParameters( p, a, b, c, d ) (p)->GetCreationParameters( a, b, c, d )
+#define IDirectXVideoProcessor_GetVideoProcessorCaps( p, a ) (p)->GetVideoProcessorCaps( a )
+#define IDirectXVideoProcessor_GetProcAmpRange( p, a, b ) (p)->GetProcAmpRange( a, b )
+#define IDirectXVideoProcessor_GetFilterPropertyRange( p, a, b ) (p)->GetFilterPropertyRange( a, b )
+#define IDirectXVideoProcessor_VideoProcessBlt( p, a, b, c, d, e ) (p)->VideoProcessBlt( a, b, c, d, e )
+#endif
+
+
+#undef INTERFACE
+#define INTERFACE IDirectXVideoProcessorService
+DECLARE_INTERFACE_( IDirectXVideoProcessorService, IDirectXVideoAccelerationService )
+{
+ STDMETHOD( QueryInterface ) ( THIS_ REFIID, PVOID* ) PURE;
+ STDMETHOD_( ULONG, AddRef ) ( THIS ) PURE;
+ STDMETHOD_( ULONG, Release ) ( THIS ) PURE;
+ STDMETHOD( CreateSurface ) ( THIS_ UINT, UINT, UINT, D3DFORMAT, D3DPOOL, DWORD, DWORD, IDirect3DSurface9**, HANDLE* ) PURE;
+ STDMETHOD( RegisterVideoProcessorSoftwareDevice ) ( THIS_ void* ) PURE;
+ STDMETHOD( GetVideoProcessorDeviceGuids ) ( THIS_ DXVA2_VideoDesc*, UINT, GUID** ) PURE;
+ STDMETHOD( GetVideoProcessorRenderTargets ) ( THIS_ REFGUID, DXVA2_VideoDesc*, UINT*, D3DFORMAT** ) PURE;
+ STDMETHOD( GetVideoProcessorSubStreamFormats ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT*, D3DFORMAT** ) PURE;
+ STDMETHOD( GetVideoProcessorCaps ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, DXVA2_VideoProcessorCaps* ) PURE;
+ STDMETHOD( GetProcAmpRange ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, DXVA2_ValueRange* ) PURE;
+ STDMETHOD( GetFilterPropertyRange ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, DXVA2_ValueRange* ) PURE;
+ STDMETHOD( CreateVideoProcessor ) ( THIS_ REFGUID, DXVA2_VideoDesc*, D3DFORMAT, UINT, IDirectXVideoProcessor** ) PURE;
+};
+
+#if !defined(__cplusplus) || defined(CINTERFACE)
+#define IDirectXVideoProcessorService_QueryInterface( p, a, b ) (p)->lpVtbl->QueryInterface( p, a, b )
+#define IDirectXVideoProcessorService_AddRef( p ) (p)->lpVtbl->AddRef( p )
+#define IDirectXVideoProcessorService_Release( p ) (p)->lpVtbl->Release( p )
+#define IDirectXVideoProcessorService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->lpVtbl->CreateSurface( p, a, b, c, d, e, f, g, h, i )
+#define IDirectXVideoProcessorService_RegisterVideoProcessorSoftwareDevice( p, a ) (p)->lpVtbl->RegisterVideoProcessorSoftwareDevice( p, a )
+#define IDirectXVideoProcessorService_GetVideoProcessorDeviceGuids( p, a, b, c ) (p)->lpVtbl->GetVideoProcessorDeviceGuids( p, a, b, c )
+#define IDirectXVideoProcessorService_GetVideoProcessorRenderTargets( p, a, b, c, d ) (p)->lpVtbl->GetVideoProcessorRenderTargets( p, a, b, c, d )
+#define IDirectXVideoProcessorService_GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) (p)->lpVtbl->GetVideoProcessorSubStreamFormats( p, a, b, c, d, e )
+#define IDirectXVideoProcessorService_GetVideoProcessorCaps( p, a, b, c, d ) (p)->lpVtbl->GetVideoProcessorCaps( p, a, b, c, d )
+#define IDirectXVideoProcessorService_GetProcAmpRange( p, a, b, c, d, e ) (p)->lpVtbl->GetProcAmpRange( p, a, b, c, d, e )
+#define IDirectXVideoProcessorService_GetFilterPropertyRange( p, a, b, c, d, e ) (p)->lpVtbl->GetFilterPropertyRange( p, a, b, c, d, e )
+#define IDirectXVideoProcessorService_CreateVideoProcessor( p, a, b, c, d, e ) (p)->lpVtbl->CreateVideoProcessor( p, a, b, c, d, e )
+#else
+#define IDirectXVideoProcessorService_QueryInterface( p, a, b ) (p)->QueryInterface( a, b )
+#define IDirectXVideoProcessorService_AddRef( p ) (p)->AddRef()
+#define IDirectXVideoProcessorService_Release( p ) (p)->Release()
+#define IDirectXVideoProcessorService_CreateSurface( p, a, b, c, d, e, f, g, h, i ) (p)->CreateSurface( a, b, c, d, e, f, g, h, i )
+#define IDirectXVideoProcessorService_RegisterVideoProcessorSoftwareDevice( p, a ) (p)->RegisterVideoProcessorSoftwareDevice( a )
+#define IDirectXVideoProcessorService_GetVideoProcessorDeviceGuids( p, a, b, c ) (p)->GetVideoProcessorDeviceGuids( a, b, c )
+#define IDirectXVideoProcessorService_GetVideoProcessorRenderTargets( p, a, b, c, d ) (p)->GetVideoProcessorRenderTargets( a, b, c, d )
+#define IDirectXVideoProcessorService_GetVideoProcessorSubStreamFormats( p, a, b, c, d, e ) (p)->GetVideoProcessorSubStreamFormats( a, b, c, d, e )
+#define IDirectXVideoProcessorService_GetVideoProcessorCaps( p, a, b, c, d ) (p)->GetVideoProcessorCaps( a, b, c, d )
+#define IDirectXVideoProcessorService_GetProcAmpRange( p, a, b, c, d, e ) (p)->GetProcAmpRange( a, b, c, d, e )
+#define IDirectXVideoProcessorService_GetFilterPropertyRange( p, a, b, c, d, e ) (p)->GetFilterPropertyRange( a, b, c, d, e )
+#define IDirectXVideoProcessorService_CreateVideoProcessor( p, a, b, c, d, e ) (p)->CreateVideoProcessor( a, b, c, d, e )
+#endif
+
+
+/*****************************************************************************************************
+************************DXVA Video Processor********************************************************
+*******************************************************************************************************/
+
+
+
+/*#undef INTERFACE
+#define INTERFACE IDirectXVideoService
+DECLARE_INTERFACE_(IDirectXVideoService,IUnknown)
+{
+ STDMETHOD(DXVA2CreateVideoService)(IDirect3DDevice9*, REFIID, void**) PURE;
+};
+
+#if !defined(__cplusplus) || defined(CINTERFACE)
+#define IDirectXVideoService_DXVA2CreateVideoService(a,b,c) DXVA2CreateVideoService(a,b,c)
+#else
+#define IDirectXVideoService_DXVA2CreateVideoService(a,b,c) DXVA2CreateVideoService(a,b,c)
+#endif*/
+
+
+#ifdef __cplusplus
+};
+#endif
+
+#ifdef __cplusplus
+extern "C" HRESULT WINAPI DXVA2CreateVideoService( IDirect3DDevice9 *,
+ REFIID riid,
+ void **ppService );
+#else
+extern HRESULT WINAPI DXVA2CreateVideoService( IDirect3DDevice9 *,
+ REFIID riid,
+ void **ppService );
+#endif
+
+typedef
+enum _DXVA2_VideoChromaSubSampling
+{ DXVA2_VideoChromaSubsamplingMask = 0xf,
+ DXVA2_VideoChromaSubsampling_Unknown = 0,
+ DXVA2_VideoChromaSubsampling_ProgressiveChroma = 0x8,
+ DXVA2_VideoChromaSubsampling_Horizontally_Cosited = 0x4,
+ DXVA2_VideoChromaSubsampling_Vertically_Cosited = 0x2,
+ DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes = 0x1,
+ DXVA2_VideoChromaSubsampling_MPEG2 = ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes ),
+ DXVA2_VideoChromaSubsampling_MPEG1 = DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes,
+ DXVA2_VideoChromaSubsampling_DV_PAL = ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_Cosited ),
+ DXVA2_VideoChromaSubsampling_Cosited = ( ( DXVA2_VideoChromaSubsampling_Horizontally_Cosited | DXVA2_VideoChromaSubsampling_Vertically_Cosited ) | DXVA2_VideoChromaSubsampling_Vertically_AlignedChromaPlanes )} DXVA2_VideoChromaSubSampling;
+
+typedef
+enum _DXVA2_NominalRange
+{ DXVA2_NominalRangeMask = 0x7,
+ DXVA2_NominalRange_Unknown = 0,
+ DXVA2_NominalRange_Normal = 1,
+ DXVA2_NominalRange_Wide = 2,
+ DXVA2_NominalRange_0_255 = 1,
+ DXVA2_NominalRange_16_235 = 2,
+ DXVA2_NominalRange_48_208 = 3} DXVA2_NominalRange;
+
+typedef
+enum _DXVA2_VideoLighting
+{ DXVA2_VideoLightingMask = 0xf,
+ DXVA2_VideoLighting_Unknown = 0,
+ DXVA2_VideoLighting_bright = 1,
+ DXVA2_VideoLighting_office = 2,
+ DXVA2_VideoLighting_dim = 3,
+ DXVA2_VideoLighting_dark = 4} DXVA2_VideoLighting;
+
+typedef
+enum _DXVA2_VideoPrimaries
+{ DXVA2_VideoPrimariesMask = 0x1f,
+ DXVA2_VideoPrimaries_Unknown = 0,
+ DXVA2_VideoPrimaries_reserved = 1,
+ DXVA2_VideoPrimaries_BT709 = 2,
+ DXVA2_VideoPrimaries_BT470_2_SysM = 3,
+ DXVA2_VideoPrimaries_BT470_2_SysBG = 4,
+ DXVA2_VideoPrimaries_SMPTE170M = 5,
+ DXVA2_VideoPrimaries_SMPTE240M = 6,
+ DXVA2_VideoPrimaries_EBU3213 = 7,
+ DXVA2_VideoPrimaries_SMPTE_C = 8} DXVA2_VideoPrimaries;
+
+typedef
+enum _DXVA2_VideoTransferFunction
+{ DXVA2_VideoTransFuncMask = 0x1f,
+ DXVA2_VideoTransFunc_Unknown = 0,
+ DXVA2_VideoTransFunc_10 = 1,
+ DXVA2_VideoTransFunc_18 = 2,
+ DXVA2_VideoTransFunc_20 = 3,
+ DXVA2_VideoTransFunc_22 = 4,
+ DXVA2_VideoTransFunc_709 = 5,
+ DXVA2_VideoTransFunc_240M = 6,
+ DXVA2_VideoTransFunc_sRGB = 7,
+ DXVA2_VideoTransFunc_28 = 8} DXVA2_VideoTransferFunction;
+
+typedef
+enum _DXVA2_SampleFormat
+{ DXVA2_SampleFormatMask = 0xff,
+ DXVA2_SampleUnknown = 0,
+ DXVA2_SampleProgressiveFrame = 2,
+ DXVA2_SampleFieldInterleavedEvenFirst = 3,
+ DXVA2_SampleFieldInterleavedOddFirst = 4,
+ DXVA2_SampleFieldSingleEven = 5,
+ DXVA2_SampleFieldSingleOdd = 6,
+ DXVA2_SampleSubStream = 7} DXVA2_SampleFormat;
+
+typedef
+enum _DXVA2_VideoTransferMatrix
+{ DXVA2_VideoTransferMatrixMask = 0x7,
+ DXVA2_VideoTransferMatrix_Unknown = 0,
+ DXVA2_VideoTransferMatrix_BT709 = 1,
+ DXVA2_VideoTransferMatrix_BT601 = 2,
+ DXVA2_VideoTransferMatrix_SMPTE240M = 3} DXVA2_VideoTransferMatrix;
+
+enum __MIDL___MIDL_itf_dxva2api_0000_0000_0004
+{ DXVA2_NoiseFilterLumaLevel = 1,
+ DXVA2_NoiseFilterLumaThreshold = 2,
+ DXVA2_NoiseFilterLumaRadius = 3,
+ DXVA2_NoiseFilterChromaLevel = 4,
+ DXVA2_NoiseFilterChromaThreshold = 5,
+ DXVA2_NoiseFilterChromaRadius = 6,
+ DXVA2_DetailFilterLumaLevel = 7,
+ DXVA2_DetailFilterLumaThreshold = 8,
+ DXVA2_DetailFilterLumaRadius = 9,
+ DXVA2_DetailFilterChromaLevel = 10,
+ DXVA2_DetailFilterChromaThreshold = 11,
+ DXVA2_DetailFilterChromaRadius = 12};
+
+enum __MIDL___MIDL_itf_dxva2api_0000_0000_0008
+{ DXVA2_VideoProcess_None = 0,
+ DXVA2_VideoProcess_YUV2RGB = 0x1,
+ DXVA2_VideoProcess_StretchX = 0x2,
+ DXVA2_VideoProcess_StretchY = 0x4,
+ DXVA2_VideoProcess_AlphaBlend = 0x8,
+ DXVA2_VideoProcess_SubRects = 0x10,
+ DXVA2_VideoProcess_SubStreams = 0x20,
+ DXVA2_VideoProcess_SubStreamsExtended = 0x40,
+ DXVA2_VideoProcess_YUV2RGBExtended = 0x80,
+ DXVA2_VideoProcess_AlphaBlendExtended = 0x100,
+ DXVA2_VideoProcess_Constriction = 0x200,
+ DXVA2_VideoProcess_NoiseFilter = 0x400,
+ DXVA2_VideoProcess_DetailFilter = 0x800,
+ DXVA2_VideoProcess_PlanarAlpha = 0x1000,
+ DXVA2_VideoProcess_LinearScaling = 0x2000,
+ DXVA2_VideoProcess_GammaCompensated = 0x4000,
+ DXVA2_VideoProcess_MaintainsOriginalFieldData = 0x8000,
+ DXVA2_VideoProcess_Mask = 0xffff};
+
+
+
+__inline float hb_dx_fixedtofloat( const DXVA2_Fixed32 _fixed_ );
+
+__inline const DXVA2_Fixed32 hb_dx_fixed32_opaque_alpha();
+
+__inline DXVA2_Fixed32 hb_dx_floattofixed( const float _float_ );
+#endif
+#endif //_DXVA2API_H
if( b->data )
{
freed += b->alloc;
- free( b->data );
+#ifdef USE_OPENCL
+ if (b->cl.buffer != NULL) {
+ if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0)
+ hb_log("bad free: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data);
+ }
+ else
+#endif
+ free( b->data );
}
free( b );
count++;
return NULL;
}
-hb_buffer_t * hb_buffer_init( int size )
+hb_buffer_t * hb_buffer_init_internal( int size , int needsMapped )
{
hb_buffer_t * b;
// Certain libraries (hrm ffmpeg) expect buffers passed to them to
{
b = hb_fifo_get( buffer_pool );
+#ifdef USE_OPENCL
+ if (b && (needsMapped != 0) && (b->cl.buffer == NULL))
+ {
+ // We need a mapped OpenCL buffer and that is not what we got out of the pool.
+ // Ditch it. It will get replaced with what we need.
+ if (b->data)
+ free(b->data);
+ free(b);
+ b = NULL;
+ }
+#endif
+
if( b )
{
/*
* didn't have to do this.
*/
uint8_t *data = b->data;
+#ifdef USE_OPENCL
+ cl_mem buffer = b->cl.buffer;
+ cl_event last_event = b->cl.last_event;
+ int loc = b->cl.buffer_location;
+#endif
+
memset( b, 0, sizeof(hb_buffer_t) );
b->alloc = buffer_pool->buffer_size;
b->size = size;
b->s.start = -1;
b->s.stop = -1;
b->s.renderOffset = -1;
+#ifdef USE_OPENCL
+ b->cl.buffer = buffer;
+ b->cl.last_event = last_event;
+ b->cl.buffer_location = loc;
+#endif
return( b );
}
}
if (size)
{
+#ifdef USE_OPENCL
+ b->cl.last_event = NULL;
+ b->cl.buffer_location = HOST;
+
+ if (needsMapped != 0)
+ {
+ int status;
+ status = hb_cl_create_mapped_buffer(&b->cl.buffer, &b->data, b->alloc);
+ //hb_log("buf: %.16x -> buffer %.16x map %.16x size %d", b, b->cl.buffer, b->data, size);
+ }
+ else {
+ b->cl.buffer = NULL;
+#endif
+
#if defined( SYS_DARWIN ) || defined( SYS_FREEBSD ) || defined( SYS_MINGW )
b->data = malloc( b->alloc );
#elif defined( SYS_CYGWIN )
#else
b->data = memalign( 16, b->alloc );
#endif
+#ifdef USE_OPENCL
+ }
+#endif
+
if( !b->data )
{
hb_log( "out of memory" );
return b;
}
+hb_buffer_t * hb_buffer_init( int size )
+{
+ return hb_buffer_init_internal(size, 0);
+}
+
void hb_buffer_realloc( hb_buffer_t * b, int size )
{
if ( size > b->alloc || b->data == NULL )
void hb_buffer_reduce( hb_buffer_t * b, int size )
{
+
if ( size < b->alloc / 8 || b->data == NULL )
{
hb_buffer_t * tmp = hb_buffer_init( size );
hb_buffer_t * hb_buffer_dup( const hb_buffer_t * src )
{
+
hb_buffer_t * buf;
if ( src == NULL )
hb_image_height_stride( pix_fmt, height, p );
}
}
-
+#ifdef USE_OPENCL
+ buf = hb_buffer_init_internal( size , hb_use_buffers() );
+#else
buf = hb_buffer_init( size );
+#endif
if( buf == NULL )
return NULL;
uint8_t *data = dst->data;
int size = dst->size;
int alloc = dst->alloc;
+#ifdef USE_OPENCL
+ cl_mem buffer = dst->cl.buffer;
+ cl_event last_event = dst->cl.last_event;
+ int loc = dst->cl.buffer_location;
+#endif
*dst = *src;
src->data = data;
src->size = size;
src->alloc = alloc;
+#ifdef USE_OPENCL
+ src->cl.buffer = buffer;
+ src->cl.last_event = last_event;
+ src->cl.buffer_location = loc;
+#endif
}
// Frees the specified buffer list.
// free the buf
if( b->data )
{
- free( b->data );
+#ifdef USE_OPENCL
+ if (b->cl.buffer != NULL) {
+ if (hb_cl_free_mapped_buffer(b->cl.buffer, b->data) == 0)
+ hb_log("bad free2: %.16x -> buffer %.16x map %.16x", b, b->cl.buffer, b->data);
+ }
+ else
+#endif
+ free( b->data );
hb_lock(buffers.lock);
buffers.allocated -= b->alloc;
hb_unlock(buffers.lock);
h->interjob = calloc( sizeof( hb_interjob_t ), 1 );
+ /* opencl */
+#ifdef USE_OPENCL
+ //hb_opencl_init(); // FIXME: Ensure gui instances call this or hb_get_opencl_env() during startup if needed.
+#endif
+
/* Start library thread */
hb_log( "hb_init: starting libhb thread" );
h->die = 0;
#include "libavutil/opt.h"
#include "libswscale/swscale.h"
#include "libavresample/avresample.h"
+#include "common.h"
#define HB_FFMPEG_THREADS_AUTO (-1) // let hb_avcodec_open() decide thread_count
void *filter_details;
} qsv_details;
+#ifdef USE_OPENCL
+ struct cl_data
+ {
+ cl_mem buffer;
+ cl_event last_event;
+ enum { HOST, DEVICE } buffer_location;
+ } cl;
+#endif
+
// PICTURESUB subtitle packets:
// Video packets (after processing by the hb_sync_video work-object):
ifeq (1,$(FEATURE.fdk_aac))
LIBHB.GCC.D += USE_FDK_AAC
endif
+ifeq (1,$(FEATURE.opencl))
+LIBHB.GCC.D += USE_OPENCL
+endif
+
+ifeq (1,$(FEATURE.hwd))
+LIBHB.GCC.D += USE_HWD
+endif
+
ifeq (1,$(FEATURE.libav_aac))
LIBHB.GCC.D += USE_LIBAV_AAC
endif
LIBHB.GCC.D += __LIBHB__ USE_PTHREAD
LIBHB.GCC.I += $(LIBHB.build/) $(CONTRIB.build/)include
+ifeq (1,$(FEATURE.opencl))
+LIBHB.GCC.I += $(AMDAPPSDKROOT)/include
+LIBHB.GCC.l += OpenCL
+endif
+
ifeq ($(BUILD.system),cygwin)
LIBHB.GCC.D += SYS_CYGWIN
else ifeq ($(BUILD.system),darwin)
--- /dev/null
+/* oclnv12toyuv.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ */
+
+#ifdef USE_OPENCL
+#ifdef USE_HWD
+#include "vadxva2.h"
+#include "oclnv12toyuv.h"
+
+/**
+ * It creates are opencl bufs w is input frame width, h is input frame height
+*/
+static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 );
+
+/**
+ * It creates are opencl kernel. kernel name is nv12toyuv
+*/
+static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 );
+
+/**
+ * It set opencl arg, input data,output data, input width, output height
+*/
+static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 );
+
+/**
+ * It initialize nv12 to yuv kernel.
+*/
+static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 );
+
+/**
+ * Run nv12 to yuv kernel.
+ */
+static int hb_nv12toyuv( void **userdata, KernelEnv *kenv );
+
+/**
+ * register nv12 to yuv kernel.
+ */
+static int hb_nv12toyuv_reg_kernel( void );
+
+/**
+ * It creates are opencl bufs w is input frame width, h is input frame height
+ */
+static int hb_nv12toyuv_create_cl_buf( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 )
+{
+ cl_int status = CL_SUCCESS;
+ int in_bytes = w*h*3/2;
+ CREATEBUF( dxva2->cl_mem_nv12, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, in_bytes );
+ CREATEBUF( dxva2->cl_mem_yuv, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, in_bytes );
+ return 0;
+}
+
+/**
+ * It creates are opencl kernel. kernel name is nv12toyuv
+ */
+static int hb_nv12toyuv_create_cl_kernel( KernelEnv *kenv, hb_va_dxva2_t *dxva2 )
+{
+ int ret;
+ dxva2->nv12toyuv = clCreateKernel( kenv->program, "nv12toyuv", &ret );
+ return ret;
+}
+
+/**
+ * It set opencl arg, input data,output data, input width, output height
+ */
+static int hb_nv12toyuv_setkernelarg( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 )
+{
+ int arg = 0, status;
+ kenv->kernel = dxva2->nv12toyuv;
+ OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_nv12 );
+ OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(cl_mem), &dxva2->cl_mem_yuv );
+ OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &w );
+ OCLCHECK( clSetKernelArg, kenv->kernel, arg++, sizeof(int), &h );
+ return 0;
+}
+
+/**
+ * It initialize nv12 to yuv kernel.
+ */
+static int hb_init_nv12toyuv_ocl( KernelEnv *kenv, int w, int h, hb_va_dxva2_t *dxva2 )
+{
+ if( !dxva2->nv12toyuv )
+ {
+ if( hb_nv12toyuv_create_cl_buf( kenv, w, h, dxva2 ) )
+ {
+ hb_log( "OpenCL: nv12toyuv_create_cl_buf fail" );
+ return -1;
+ }
+ if (!dxva2->nv12toyuv_tmp_in)
+ {
+ dxva2->nv12toyuv_tmp_in = malloc (w*h*3/2);
+ }
+
+ if (!dxva2->nv12toyuv_tmp_out)
+ {
+ dxva2->nv12toyuv_tmp_out = malloc (w*h*3/2);
+ }
+
+ hb_nv12toyuv_create_cl_kernel( kenv, dxva2 );
+ }
+ return 0;
+}
+
+/**
+ * copy_plane
+ * @param dst -
+ * @param src -
+ * @param dstride -
+ * @param sstride -
+ * @param h -
+ */
+static uint8_t *copy_plane( uint8_t *dst, uint8_t* src, int dstride, int sstride,
+ int h )
+{
+ if ( dstride == sstride )
+ {
+ memcpy( dst, src, dstride * h );
+ return dst + dstride * h;
+ }
+
+ int lbytes = dstride <= sstride? dstride : sstride;
+ while ( --h >= 0 )
+ {
+ memcpy( dst, src, lbytes );
+ src += sstride;
+ dst += dstride;
+ }
+
+ return dst;
+}
+
+/**
+ * Run nv12 to yuv kernel.
+ */
+static int hb_nv12toyuv( void **userdata, KernelEnv *kenv )
+{
+ int status;
+ int w = (int)userdata[0];
+ int h = (int)userdata[1];
+ uint8_t *bufi1 = userdata[2];
+ int *crop = userdata[3];
+ hb_va_dxva2_t *dxva2 = userdata[4];
+
+ uint8_t *bufi2 = userdata[5];
+ int p = (int)userdata[6];
+ int decomb = (int)userdata[7];
+ int detelecine = (int)userdata[8];
+ int i;
+ if( hb_init_nv12toyuv_ocl( kenv, w, h, dxva2 ) )
+ {
+ return -1;
+ }
+
+ if( hb_nv12toyuv_setkernelarg( kenv, w, h, dxva2 ) )
+ {
+ return -1;
+ }
+
+ int in_bytes = w*h*3/2;
+ if( kenv->isAMD )
+ {
+ void *data = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_nv12, CL_MAP_WRITE_INVALIDATE_REGION, CL_TRUE, 0, in_bytes, 0, NULL, NULL, NULL );
+
+ for ( i = 0; i < dxva2->height; i++ )
+ {
+ memcpy( data + i * dxva2->width, bufi1 + i * p, dxva2->width );
+ if ( i < dxva2->height >> 1 )
+ {
+ memcpy( data + ( dxva2->width * dxva2->height ) + i * dxva2->width, bufi2 + i * p, dxva2->width );
+ }
+ }
+ clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_nv12, data, 0, NULL, NULL );
+ }
+ else
+ {
+ uint8_t *tmp = (uint8_t*)malloc( dxva2->width * dxva2->height * 3 / 2 );
+ for( i = 0; i < dxva2->height; i++ )
+ {
+ memcpy( tmp + i * dxva2->width, bufi1 + i * p, dxva2->width );
+ if( i < dxva2->height >> 1 )
+ {
+ memcpy( tmp + (dxva2->width * dxva2->height) + i * dxva2->width, bufi2 + i * p, dxva2->width );
+ }
+ }
+ OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, dxva2->cl_mem_nv12, CL_TRUE, 0, in_bytes, tmp, 0, NULL, NULL );
+ free( tmp );
+ }
+
+ size_t gdim[2] = {w>>1, h>>1};
+ OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
+
+ if( (crop[0] || crop[1] || crop[2] || crop[3]) && (decomb == 0) && (detelecine == 0) )
+ {
+ AVPicture pic_in;
+ AVPicture pic_crop;
+ clEnqueueReadBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_out, 0, NULL, NULL );
+ hb_buffer_t *in = hb_video_buffer_init( w, h );
+
+ int wmp = in->plane[0].stride;
+ int hmp = in->plane[0].height;
+ copy_plane( in->plane[0].data, dxva2->nv12toyuv_tmp_out, wmp, w, hmp );
+ wmp = in->plane[1].stride;
+ hmp = in->plane[1].height;
+ copy_plane( in->plane[1].data, dxva2->nv12toyuv_tmp_out + w * h, wmp, w>>1, hmp );
+ wmp = in->plane[2].stride;
+ hmp = in->plane[2].height;
+ copy_plane( in->plane[2].data, dxva2->nv12toyuv_tmp_out + w * h +( ( w * h )>>2 ), wmp, w>>1, hmp );
+
+ hb_avpicture_fill( &pic_in, in );
+ av_picture_crop( &pic_crop, &pic_in, in->f.fmt, crop[0], crop[2] );
+ int i, ww = w - ( crop[2] + crop[3] ), hh = h - ( crop[0] + crop[1] );
+ for( i = 0; i< hh >> 1; i++ )
+ {
+ memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 0 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 0 ) * pic_crop.linesize[0], ww );
+ memcpy( dxva2->nv12toyuv_tmp_in + ( ( i << 1 ) + 1 ) * ww, pic_crop.data[0]+ ( ( i << 1 ) + 1 ) * pic_crop.linesize[0], ww );
+ memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + i * ( ww >> 1 ), pic_crop.data[1] + i * pic_crop.linesize[1], ww >> 1 );
+ memcpy( dxva2->nv12toyuv_tmp_in + ( ww * hh ) + ( ( ww * hh )>>2 ) + i * ( ww >> 1 ), pic_crop.data[2] + i * pic_crop.linesize[2], ww >> 1 );
+ }
+
+ if( kenv->isAMD )
+ {
+ void *data = clEnqueueMapBuffer( kenv->command_queue, dxva2->cl_mem_yuv, CL_MAP_WRITE_INVALIDATE_REGION, CL_TRUE, 0, ww * hh * 3 / 2, 0, NULL, NULL, NULL );
+ memcpy( data, dxva2->nv12toyuv_tmp_in, ww * hh * 3 / 2 );
+ clEnqueueUnmapMemObject( kenv->command_queue, dxva2->cl_mem_yuv, data, 0, NULL, NULL );
+ }
+ else
+ {
+ OCLCHECK( clEnqueueWriteBuffer, kenv->command_queue, dxva2->cl_mem_yuv, CL_TRUE, 0, in_bytes, dxva2->nv12toyuv_tmp_in, 0, NULL, NULL );
+ }
+
+ hb_buffer_close( &in );
+ }
+ return 0;
+}
+/**
+ * register nv12 to yuv kernel.
+ */
+static int hb_nv12toyuv_reg_kernel( void )
+{
+ int st = hb_register_kernel_wrapper( "nv12toyuv", hb_nv12toyuv );
+ if( !st )
+ {
+ hb_log( "OpenCL: register kernel[%s] failed", "nv12toyuv" );
+ return -1;
+ }
+ return 0;
+}
+/**
+ * nv12 to yuv interface
+ * bufi is input frame of nv12, w is input frame width, h is input frame height
+ */
+int hb_ocl_nv12toyuv( uint8_t *bufi[], int p, int w, int h, int *crop, hb_va_dxva2_t *dxva2, int decomb, int detelecine )
+{
+ void *userdata[9];
+ userdata[0] = (void*)w;
+ userdata[1] = (void*)h;
+ userdata[2] = bufi[0];
+ userdata[3] = crop;
+ userdata[4] = dxva2;
+ userdata[5] = bufi[1];
+ userdata[6] = (void*)p;
+ userdata[7] = decomb;
+ userdata[8] = detelecine;
+
+ if( hb_nv12toyuv_reg_kernel() )
+ {
+ return -1;
+ }
+
+ if( hb_run_kernel( "nv12toyuv", userdata ) )
+ {
+ hb_log( "OpenCL: run kernel[nv12toyuv] failed" );
+ return -1;
+ }
+ return 0;
+}
+#endif
+#endif
--- /dev/null
+/* oclnv12toyuv.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifdef USE_OPENCL
+#ifndef RENDER_CL_H
+#define RENDER_CL_H
+
+#if defined(__APPLE__)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "common.h"
+#include "openclwrapper.h"
+
+/**
+ * nv12 to yuv interface
+ * bufi is input frame of nv12, w is input frame width, h is input frame height
+ */
+#ifdef USE_HWD
+int hb_ocl_nv12toyuv( uint8_t *bufi[], int p, int w, int h, int *crop, hb_va_dxva2_t *dxva2, int decomb, int detelecine );
+#endif
+#endif
+#endif
--- /dev/null
+/* oclscale.c\r
+\r
+ Copyright (c) 2003-2012 HandBrake Team\r
+ This file is part of the HandBrake source code\r
+ Homepage: <http://handbrake.fr/>.\r
+ It may be used under the terms of the GNU General Public License v2.\r
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html\r
+\r
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+\r
+ */\r
+\r
+#ifdef USE_OPENCL\r
+\r
+#include <math.h>
+#include "common.h"
+#include "openclwrapper.h"
+#define FILTER_LEN 4
+
+#define _A -0.5f
+
+cl_float cubic(cl_float x)
+{
+ if (x < 0)
+ x = -x;
+
+ if (x < 1)
+ return (_A + 2.0f) * (x * x * x) - (_A + 3.0f) * (x * x) + 0 + 1;
+ else if (x < 2)
+ return (_A) * (x * x * x) - (5.0f * _A) * (x * x) + (8.0f * _A) * x - (4.0f * _A);
+ else
+ return 0;
+}
+
+
+cl_float *hb_bicubic_weights(cl_float scale, int length)
+{
+ cl_float *weights = (cl_float*) malloc(length * sizeof(cl_float) * 4);
+
+ int i; // C rocks
+ cl_float *out = weights;
+ for (i = 0; i < length; ++i)
+ {
+ cl_float x = i / scale;
+ cl_float dx = x - (int)x;
+ *out++ = cubic(-dx - 1.0f);
+ *out++ = cubic(-dx);
+ *out++ = cubic(-dx + 1.0f);
+ *out++ = cubic(-dx + 2.0f);
+ }
+ return weights;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv);
+
+/**
+* executive scale using opencl
+* get filter args
+* create output buffer
+* create horizontal filter buffer
+* create vertical filter buffer
+* create kernels
+*/
+int hb_ocl_scale_func( void **data, KernelEnv *kenv )
+{
+ cl_int status;
+
+ cl_mem in_buf = data[0];
+ cl_mem out_buf = data[1];
+ int crop_top = data[2];
+ int crop_bottom = data[3];
+ int crop_left = data[4];
+ int crop_right = data[5];
+ int in_frame_w = (int)data[6];
+ int in_frame_h = (int)data[7];
+ int out_frame_w = (int)data[8];
+ int out_frame_h = (int)data[9];
+ hb_oclscale_t *os = data[10];
+ hb_buffer_t *in = data[11];
+ hb_buffer_t *out = data[12];
+
+ if (os->initialized == 0)
+ {
+ hb_log( "Scaling With OpenCL" );
+ if (kenv->isAMD != 0)
+ hb_log( "Using Zero Copy");
+ // create the block kernel
+ cl_int status;
+ os->m_kernel = clCreateKernel( kenv->program, "frame_scale", &status );
+
+ os->initialized = 1;
+ }
+
+ {
+ // Use the new kernel
+ cl_event events[5];
+ int eventCount = 0;
+
+ if (kenv->isAMD == 0) {
+ status = clEnqueueUnmapMemObject(kenv->command_queue, in->cl.buffer, in->data, 0, NULL, &events[eventCount++]);
+ status = clEnqueueUnmapMemObject(kenv->command_queue, out->cl.buffer, out->data, 0, NULL, &events[eventCount++]);
+ }
+
+ cl_int srcPlaneOffset0 = in->plane[0].data - in->data;
+ cl_int srcPlaneOffset1 = in->plane[1].data - in->data;
+ cl_int srcPlaneOffset2 = in->plane[2].data - in->data;
+ cl_int srcRowWords0 = in->plane[0].stride;
+ cl_int srcRowWords1 = in->plane[1].stride;
+ cl_int srcRowWords2 = in->plane[2].stride;
+ cl_int dstPlaneOffset0 = out->plane[0].data - out->data;
+ cl_int dstPlaneOffset1 = out->plane[1].data - out->data;
+ cl_int dstPlaneOffset2 = out->plane[2].data - out->data;
+ cl_int dstRowWords0 = out->plane[0].stride;
+ cl_int dstRowWords1 = out->plane[1].stride;
+ cl_int dstRowWords2 = out->plane[2].stride;
+
+ if (crop_top != 0 || crop_bottom != 0 || crop_left != 0 || crop_right != 0) {
+ srcPlaneOffset0 += crop_left + crop_top * srcRowWords0;
+ srcPlaneOffset1 += crop_left / 2 + (crop_top / 2) * srcRowWords1;
+ srcPlaneOffset2 += crop_left / 2 + (crop_top / 2) * srcRowWords2;
+ in_frame_w = in_frame_w - crop_right - crop_left;
+ in_frame_h = in_frame_h - crop_bottom - crop_top;
+ }
+
+ cl_float xscale = (out_frame_w * 1.0f) / in_frame_w;
+ cl_float yscale = (out_frame_h * 1.0f) / in_frame_h;
+ setupScaleWeights(xscale, yscale, out_frame_w, out_frame_h, os, kenv);
+
+ OCLCHECK( clSetKernelArg, os->m_kernel, 0, sizeof(cl_mem), &out_buf );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 1, sizeof(cl_mem), &in_buf );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 2, sizeof(cl_float), &xscale );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 3, sizeof(cl_float), &yscale );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 4, sizeof(cl_int), &srcPlaneOffset0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 5, sizeof(cl_int), &srcPlaneOffset1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 6, sizeof(cl_int), &srcPlaneOffset2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 7, sizeof(cl_int), &dstPlaneOffset0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 8, sizeof(cl_int), &dstPlaneOffset1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 9, sizeof(cl_int), &dstPlaneOffset2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 10, sizeof(cl_int), &srcRowWords0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 11, sizeof(cl_int), &srcRowWords1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 12, sizeof(cl_int), &srcRowWords2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 13, sizeof(cl_int), &dstRowWords0 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 14, sizeof(cl_int), &dstRowWords1 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 15, sizeof(cl_int), &dstRowWords2 );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 16, sizeof(int), &in_frame_w ); // FIXME: type mismatch
+ OCLCHECK( clSetKernelArg, os->m_kernel, 17, sizeof(int), &in_frame_h ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 18, sizeof(int), &out_frame_w ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 19, sizeof(int), &out_frame_h ); //
+ OCLCHECK( clSetKernelArg, os->m_kernel, 20, sizeof(cl_mem), &os->bicubic_x_weights );
+ OCLCHECK( clSetKernelArg, os->m_kernel, 21, sizeof(cl_mem), &os->bicubic_y_weights );
+
+ size_t workOffset[] = { 0, 0, 0 };
+ size_t globalWorkSize[] = { 1, 1, 1 };
+ size_t localWorkSize[] = { 1, 1, 1 };
+
+ int xgroups = (out_frame_w + 63) / 64;
+ int ygroups = (out_frame_h + 15) / 16;
+
+ localWorkSize[0] = 64;
+ localWorkSize[1] = 1;
+ localWorkSize[2] = 1;
+ globalWorkSize[0] = xgroups * 64;
+ globalWorkSize[1] = ygroups;
+ globalWorkSize[2] = 3;
+
+ OCLCHECK( clEnqueueNDRangeKernel, kenv->command_queue, os->m_kernel, 3, workOffset, globalWorkSize, localWorkSize, eventCount, (eventCount == 0) ? NULL : &events[0], &events[eventCount] );
+ ++eventCount;
+
+ if (kenv->isAMD == 0) {
+ in->data = clEnqueueMapBuffer(kenv->command_queue, in->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, in->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount], &status);
+ out->data = clEnqueueMapBuffer(kenv->command_queue, out->cl.buffer, CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, out->alloc, (eventCount == 0) ? 0 : 1, (eventCount == 0) ? NULL : &events[eventCount - 1], &events[eventCount + 1], &status);
+ eventCount += 2;
+ }
+
+ clFlush(kenv->command_queue);
+ clWaitForEvents(eventCount, &events[0]);
+ int i;
+ for (i = 0; i < eventCount; ++i)
+ clReleaseEvent(events[i]);
+ }
+
+ return 1;
+}
+
+int setupScaleWeights(cl_float xscale, cl_float yscale, int width, int height, hb_oclscale_t *os, KernelEnv *kenv) {
+ cl_int status;
+ if (os->xscale != xscale || os->width < width) {
+ cl_float *xweights = hb_bicubic_weights(xscale, width);
+ CL_FREE(os->bicubic_x_weights);
+ CREATEBUF(os->bicubic_x_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * width * 4);
+ OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_x_weights, CL_TRUE, 0, sizeof(cl_float) * width * 4, xweights, 0, NULL, NULL );
+ os->width = width;
+ os->xscale = xscale;
+ free(xweights);
+ }
+
+ if ((os->yscale != yscale) || (os->height < height)) {
+ cl_float *yweights = hb_bicubic_weights(yscale, height);
+ CL_FREE(os->bicubic_y_weights);
+ CREATEBUF(os->bicubic_y_weights, CL_MEM_READ_ONLY, sizeof(cl_float) * height * 4);
+ OCLCHECK(clEnqueueWriteBuffer, kenv->command_queue, os->bicubic_y_weights, CL_TRUE, 0, sizeof(cl_float) * height * 4, yweights, 0, NULL, NULL );
+ os->height = height;
+ os->yscale = yscale;
+ free(yweights);
+ }
+ return 0;
+}
+
+
+/**
+* function describe: this function is used to scaling video frame. it uses the gausi scaling algorithm
+* parameter:
+* inputFrameBuffer: the source video frame opencl buffer\r
+* outputdata: the destination video frame buffer\r
+* inputWidth: the width of the source video frame\r
+* inputHeight: the height of the source video frame\r
+* outputWidth: the width of destination video frame
+* outputHeight: the height of destination video frame
+*/
+
+
+static int s_scale_init_flag = 0;
+
+int do_scale_init()
+{
+ if ( s_scale_init_flag==0 )
+ {
+ int st = hb_register_kernel_wrapper( "frame_scale", hb_ocl_scale_func );
+ if( !st )
+ {
+ hb_log( "register kernel[%s] failed", "frame_scale" );
+ return 0;
+ }
+ s_scale_init_flag++;
+ }
+ return 1;
+}
+
+
+int hb_ocl_scale(hb_buffer_t *in, hb_buffer_t *out, int *crop, hb_oclscale_t *os)
+{
+ void *data[13];
+
+ if (do_scale_init() == 0)
+ return 0;
+
+ data[0] = in->cl.buffer;
+ data[1] = out->cl.buffer;
+ data[2] = (void*)(crop[0]);
+ data[3] = (void*)(crop[1]);
+ data[4] = (void*)(crop[2]);
+ data[5] = (void*)(crop[3]);
+ data[6] = (void*)(in->f.width);
+ data[7] = (void*)(in->f.height);
+ data[8] = (void*)(out->f.width);
+ data[9] = (void*)(out->f.height);
+ data[10] = os;
+ data[11] = in;
+ data[12] = out;
+
+ if( !hb_run_kernel( "frame_scale", data ) )
+ hb_log( "run kernel[%s] failed", "frame_scale" );
+ return 0;
+}
+
+
+
+
+
+#endif
--- /dev/null
+/* openclkernels.h\r
+\r
+ Copyright (c) 2003-2012 HandBrake Team\r
+ This file is part of the HandBrake source code\r
+ Homepage: <http://handbrake.fr/>.\r
+ It may be used under the terms of the GNU General Public License v2.\r
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html\r
+ \r
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+\r
+ */\r
+ \r
+#ifndef USE_EXTERNAL_KERNEL\r
+\r
+#define KERNEL( ... )# __VA_ARGS__\r
+\r
+\r
+char *kernel_src_hscale = KERNEL (\r
+\r
+ typedef unsigned char fixed8;\r
+\r
+/*******************************************************************************************************\r
+dst: Horizontal scale destination;\r
+src: YUV content in opencl buf;\r
+hf_Y: Horizontal filter coefficients for Y planes;\r
+hf_UV: Horizontal filter coefficients for UV planes;\r
+hi_Y: Horizontal filter index for Y planes;\r
+hi_UV: Horizontal filter index for UV planes;\r
+stride: Src width;\r
+filter_len: Length of filter;\r
+********************************************************************************************************/\r
+ kernel void frame_h_scale (\r
+ global fixed8 *src,\r
+ global float *hf_Y,\r
+ global float *hf_UV,\r
+ global int *hi_Y,\r
+ global int *hi_UV,\r
+ global fixed8 *dst,\r
+ int stride, //src_width\r
+ int filter_len\r
+ )\r
+ {\r
+ int x = get_global_id( 0 );\r
+ int y = get_global_id( 1 );\r
+ int width = get_global_size( 0 );\r
+ int height = get_global_size( 1 );\r
+ float result_Y = 0, result_U = 0, result_V = 0;\r
+ int i = 0;\r
+\r
+ global fixed8 *src_Y = src;\r
+ global fixed8 *src_U = src_Y + stride * height;\r
+ global fixed8 *src_V = src_U + (stride >> 1) * (height >> 1);\r
+\r
+ global fixed8 *dst_Y = dst;\r
+ global fixed8 *dst_U = dst_Y + width * height;\r
+ global fixed8 *dst_V = dst_U + (width >> 1) * (height >> 1);\r
+\r
+ int xy = y * width + x;\r
+ global fixed8 *rowdata_Y = src_Y + (y * stride);\r
+ for( int i = 0; i < filter_len; i++ )\r
+ {\r
+ result_Y += ( hf_Y[x + i * width] * rowdata_Y[hi_Y[x] + i]);\r
+ }\r
+ dst_Y[xy] = result_Y;\r
+\r
+ if( y < (height >> 1) && x < (width >> 1) )\r
+ {\r
+ int xy = y * (width >> 1) + x;\r
+ global fixed8 *rowdata_U = src_U + (y * (stride >> 1));\r
+ global fixed8 *rowdata_V = src_V + (y * (stride >> 1));\r
+ for( i = 0; i < filter_len; i++ )\r
+ {\r
+ result_U += ( hf_UV[x + i * (width >> 1)] * rowdata_U[hi_UV[x] + i]);\r
+ result_V += ( hf_UV[x + i * (width >> 1)] * rowdata_V[hi_UV[x] + i]);\r
+ }\r
+ dst_U[xy] = result_U;\r
+ dst_V[xy] = result_V;\r
+ }\r
+ }\r
+ );\r
+\r
+/*******************************************************************************************************\r
+dst: Vertical scale destination;\r
+src: YUV content in opencl buf;\r
+hf_Y: Vertical filter coefficients for Y planes;\r
+hf_UV: Vertical filter coefficients for UV planes;\r
+hi_Y: Vertical filter index for Y planes;\r
+hi_UV: Vertical filter index for UV planes;\r
+stride: Src height;\r
+filter_len: Length of filter;\r
+********************************************************************************************************/\r
+char *kernel_src_vscale = KERNEL (\r
+\r
+ kernel void frame_v_scale (\r
+ global fixed8 *src,\r
+ global float *vf_Y,\r
+ global float *vf_UV,\r
+ global int *vi_Y,\r
+ global int *vi_UV,\r
+ global fixed8 *dst,\r
+ int src_height,\r
+ int filter_len\r
+ )\r
+ {\r
+ int x = get_global_id( 0 );\r
+ int y = get_global_id( 1 );\r
+ int width = get_global_size( 0 );\r
+ int height = get_global_size( 1 );\r
+ float result_Y = 0, result_U = 0, result_V = 0;\r
+ int i = 0;\r
+\r
+ global fixed8 *src_Y = src;\r
+ global fixed8 *src_U = src_Y + src_height * width;\r
+ global fixed8 *src_V = src_U + (src_height >> 1) * (width >> 1);\r
+\r
+ global fixed8 *dst_Y = dst;\r
+ global fixed8 *dst_U = dst_Y + height * width;\r
+ global fixed8 *dst_V = dst_U + (height >> 1) * (width >> 1);\r
+\r
+ int xy = y * width + x;\r
+ for( i = 0; i < filter_len; i++ )\r
+ {\r
+ result_Y += vf_Y[y + i * height] * src_Y[(vi_Y[y] + i) * width + x];\r
+ }\r
+ dst_Y[xy] = result_Y;\r
+\r
+ if( y < (height >> 1) && x < (width >> 1) )\r
+ {\r
+ int xy = y * (width >> 1) + x;\r
+ for( i = 0; i < filter_len; i++ )\r
+ {\r
+ result_U += vf_UV[y + i * (height >> 1)] * src_U[(vi_UV[y] + i) * (width >> 1) + x];\r
+ result_V += vf_UV[y + i * (height >> 1)] * src_V[(vi_UV[y] + i) * (width >> 1) + x];\r
+ }\r
+ dst_U[xy] = result_U;\r
+ dst_V[xy] = result_V;\r
+ }\r
+ }\r
+ );\r
+\r
+/*******************************************************************************************************\r
+input: Input buffer;\r
+output: Output buffer;\r
+w: Width of frame;\r
+h: Height of frame;\r
+********************************************************************************************************/\r
+char *kernel_src_nvtoyuv = KERNEL (\r
+\r
+ kernel void nv12toyuv ( global char *input, global char* output, int w, int h )\r
+ {\r
+ int x = get_global_id( 0 );\r
+ int y = get_global_id( 1 );\r
+ int idx = y * (w >> 1) + x;\r
+ vstore4((vload4( 0, input + (idx << 2))), 0, output + (idx << 2)); //Y\r
+ char2 uv = vload2( 0, input + (idx << 1) + w * h );\r
+ output[idx + w * h] = uv.s0;\r
+ output[idx + w * h + ((w * h) >> 2)] = uv.s1;\r
+ }\r
+ );\r
+\r
+/*******************************************************************************************************\r
+dst: Horizontal scale destination;\r
+src: YUV content in opencl buf;\r
+yfilter: Opencl memory of horizontal filter coefficients for luma/alpha planes;\r
+yfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for luma/alpha planes;\r
+yfilterSize: Horizontal filter size for luma/alpha pixels;\r
+cfilter: Opencl memory of horizontal filter coefficients for chroma planes;\r
+cfilterPos: Opencl memory of horizontal filter starting positions for each dst[i] for chroma planes;\r
+cfilterSize: Horizontal filter size for chroma pixels;\r
+dstStride: Width of destination luma/alpha planes;\r
+dstChrStride: Width of destination chroma planes;\r
+********************************************************************************************************/\r
+\r
+char *kernel_src_hscaleall = KERNEL (\r
+\r
+ kernel void hscale_all_opencl (\r
+ global short *dst,\r
+ const global unsigned char *src,\r
+ const global short *yfilter,\r
+ const global int *yfilterPos,\r
+ int yfilterSize,\r
+ const global short *cfilter,\r
+ const global int *cfilterPos,\r
+ int cfilterSize,\r
+ int dstWidth,\r
+ int dstHeight,\r
+ int srcWidth,\r
+ int srcHeight,\r
+ int dstStride,\r
+ int dstChrStride,\r
+ int srcStride,\r
+ int srcChrStride)\r
+ {\r
+ int w = get_global_id(0);\r
+ int h = get_global_id(1);\r
+\r
+ int chrWidth = get_global_size(0);\r
+ int chrHeight = get_global_size(1);\r
+\r
+ int srcPos1 = h * srcStride + yfilterPos[w];\r
+ int srcPos2 = h * srcStride + yfilterPos[w + chrWidth];\r
+ int srcPos3 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w];\r
+ int srcPos4 = (h + (srcHeight >> 1)) * srcStride + yfilterPos[w + chrWidth];\r
+ int srcc1Pos = srcStride * srcHeight + (h) * (srcChrStride) + cfilterPos[w];\r
+ int srcc2Pos = srcc1Pos + ((srcChrStride)*(chrHeight));\r
+\r
+ int val1 = 0;\r
+ int val2 = 0;\r
+ int val3 = 0;\r
+ int val4 = 0;\r
+ int val5 = 0;\r
+ int val6 = 0;\r
+\r
+ int filterPos1 = yfilterSize * w;\r
+ int filterPos2 = yfilterSize * (w + chrWidth);\r
+ int cfilterPos1 = cfilterSize * w;\r
+\r
+ int j;\r
+ for (j = 0; j < yfilterSize; j++)\r
+ {\r
+ val1 += src[srcPos1 + j] * yfilter[filterPos1+ j];\r
+ val2 += src[srcPos2 + j] * yfilter[filterPos2 + j];\r
+ val3 += src[srcPos3 + j] * yfilter[filterPos1 + j];\r
+ val4 += src[srcPos4 + j] * yfilter[filterPos2 + j];\r
+ val5 += src[srcc1Pos+j] * cfilter[cfilterPos1 + j];\r
+ val6 += src[srcc2Pos+j] * cfilter[cfilterPos1 + j];\r
+ }\r
+ int dstPos1 = h *dstStride;\r
+ int dstPos2 = (h + chrHeight) * dstStride;\r
+\r
+ dst[dstPos1 + w] = ((val1 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val1 >> 7));\r
+ dst[dstPos1 + w + chrWidth] = ((val2 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val2 >> 7));\r
+ dst[dstPos2 + w] = ((val3 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val3 >> 7));\r
+ dst[dstPos2 + w + chrWidth] = ((val4 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val4 >> 7));\r
+\r
+ int dstPos3 = h * (dstChrStride) + w + dstStride * dstHeight;\r
+ int dstPos4 = h * (dstChrStride) + w + dstStride * dstHeight + ((dstChrStride) * chrHeight);\r
+ dst[dstPos3] = ((val5 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val5 >> 7));\r
+ dst[dstPos4] = ((val6 >> 7) > ((1 << 15) - 1) ? ((1 << 15) - 1) : (val6 >> 7));\r
+ }\r
+ );\r
+\r
+char *kernel_src_hscalefast = KERNEL (\r
+\r
+ kernel void hscale_fast_opencl (\r
+ global short *dst,\r
+ const global unsigned char *src,\r
+ int xInc,\r
+ int chrXInc,\r
+ int dstWidth,\r
+ int dstHeight,\r
+ int srcWidth,\r
+ int srcHeight,\r
+ int dstStride,\r
+ int dstChrStride,\r
+ int srcStride,\r
+ int srcChrStride)\r
+ {\r
+\r
+ int w = get_global_id(0);\r
+ int h = get_global_id(1);\r
+\r
+ int chrWidth = get_global_size(0);\r
+ int chrHeight = get_global_size(1);\r
+ int xpos1 = 0;\r
+ int xpos2 = 0;\r
+ int xx = xpos1 >> 16;\r
+ int xalpha = (xpos1 & 0xFFFF) >> 9;\r
+ dst[h * dstStride + w] = (src[h * srcStride + xx] << 7) + (src[h * srcStride + xx + 1] -src[h * srcStride + xx]) * xalpha;\r
+ int lowpart = h + (chrHeight);\r
+ dst[lowpart * dstStride + w] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;\r
+\r
+ int inv_i = w * xInc >> 16;\r
+ if( inv_i >= srcWidth - 1)\r
+ {\r
+ dst[h*dstStride + w] = src[h*srcStride + srcWidth-1]*128;\r
+ dst[lowpart*dstStride + w] = src[lowpart*srcStride + srcWidth - 1] * 128;\r
+ }\r
+\r
+ int rightpart = w + (chrWidth);\r
+ xx = xpos2 >> 16;\r
+ xalpha = (xpos2 & 0xFFFF) >> 9;\r
+ dst[h * dstStride + rightpart] = (src[h *srcStride + xx] << 7) + (src[h * srcStride + xx + 1] - src[h * srcStride + xx]) * xalpha;\r
+ dst[lowpart * dstStride + rightpart] = (src[lowpart * srcStride + xx] << 7) + (src[lowpart * srcStride + xx + 1] - src[lowpart * srcStride + xx]) * xalpha;\r
+ inv_i = rightpart * xInc >> 16;\r
+ if( inv_i >= srcWidth - 1)\r
+ {\r
+ dst[h * dstStride + rightpart] = src[h * srcStride + srcWidth - 1] * 128;\r
+ dst[lowpart * dstStride + rightpart] = src[lowpart * srcStride + srcWidth - 1] * 128;\r
+ }\r
+\r
+ int xpos = 0;\r
+ xpos = chrXInc * w;\r
+ xx = xpos >> 16;\r
+ xalpha = (xpos & 0xFFFF) >> 9;\r
+ src += srcStride * srcHeight;\r
+ dst += dstStride * dstHeight;\r
+ dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1] * xalpha);\r
+ inv_i = w * xInc >> 16;\r
+ if( inv_i >= (srcWidth >> 1) - 1)\r
+ {\r
+ dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1]*128;\r
+ }\r
+\r
+ xpos = chrXInc * (w);\r
+ xx = xpos >> 16;\r
+ src += srcChrStride * srcHeight >> 1;\r
+ dst += (dstChrStride * chrHeight);\r
+ dst[h * (dstChrStride) + w] = (src[h * (srcChrStride) + xx] * (xalpha^127) + src[h * (srcChrStride) + xx + 1 ] * xalpha);\r
+\r
+ if( inv_i >= (srcWidth >> 1) - 1)\r
+ {\r
+ //v channel:\r
+ dst[h * (dstChrStride) + w] = src[h * (srcChrStride) + (srcWidth >> 1) -1] * 128;\r
+ }\r
+ }\r
+ );\r
+\r
+char *kernel_src_vscalealldither = KERNEL (\r
+\r
+ kernel void vscale_all_dither_opencl (\r
+ global unsigned char *dst,\r
+ const global short *src,\r
+ const global short *yfilter,\r
+ int yfilterSize,\r
+ const global short *cfilter,\r
+ int cfilterSize,\r
+ const global int *yfilterPos,\r
+ const global int *cfilterPos,\r
+ int dstWidth,\r
+ int dstHeight,\r
+ int srcWidth,\r
+ int srcHeight,\r
+ int dstStride,\r
+ int dstChrStride,\r
+ int srcStride,\r
+ int srcChrStride)\r
+ {\r
+ const unsigned char hb_dither_8x8_128[8][8] = {\r
+ { 36, 68, 60, 92, 34, 66, 58, 90, },\r
+ { 100, 4, 124, 28, 98, 2, 122, 26, },\r
+ { 52, 84, 44, 76, 50, 82, 42, 74, },\r
+ { 116, 20, 108, 12, 114, 18, 106, 10, },\r
+ { 32, 64, 56, 88, 38, 70, 62, 94, },\r
+ { 96, 0, 120, 24, 102, 6, 126, 30, },\r
+ { 48, 80, 40, 72, 54, 86, 46, 78, },\r
+ { 112, 16, 104, 8, 118, 22, 110, 14, },\r
+ };\r
+\r
+\r
+ int w = get_global_id(0);\r
+ int h = get_global_id(1);\r
+\r
+ int chrWidth = get_global_size(0);\r
+ int chrHeight = get_global_size(1);\r
+ const unsigned char *local_up_dither;\r
+ const unsigned char *local_down_dither;\r
+\r
+ local_up_dither = hb_dither_8x8_128[h & 7];\r
+ local_down_dither = hb_dither_8x8_128[(h + chrHeight) & 7];\r
+\r
+ //yscale;\r
+ int srcPos1 = (yfilterPos[h]) * srcStride + w;\r
+ int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);\r
+ int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;\r
+ int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;\r
+ int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);\r
+ int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;\r
+\r
+ int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;\r
+ int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);\r
+ int val3 = (local_down_dither[w &7] << 12);\r
+ int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);\r
+ int val5 = (local_up_dither[w & 7] << 12);\r
+ int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is offset of the chrome channel.\r
+\r
+ int j;\r
+ int filterPos1 = h * yfilterSize;\r
+ int filterPos2 = ( h + chrHeight ) * yfilterSize;\r
+ for(j = 0; j < yfilterSize; j++)\r
+ {\r
+ val1 += src[srcPos1] * yfilter[filterPos1 + j];\r
+ srcPos1 += srcStride;\r
+ val2 += src[srcPos2] * yfilter[filterPos1 + j];\r
+ srcPos2 += srcStride;\r
+ val3 += src[srcPos3] * yfilter[filterPos2 + j];\r
+ srcPos3 += srcStride;\r
+ val4 += src[srcPos4] * yfilter[filterPos2 + j];\r
+ srcPos4 += srcStride;\r
+ val5 += src[src1Pos] * cfilter[filterPos1 + j];\r
+ val6 += src[src2Pos] * cfilter[filterPos1 + j];\r
+ src1Pos += dstChrStride;\r
+ src2Pos += dstChrStride;\r
+ }\r
+ dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));\r
+ dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));\r
+ dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));\r
+ dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));\r
+\r
+ int dst1Pos = dstStride * dstHeight + h*(dstChrStride)+(w);\r
+ int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;\r
+ dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));\r
+ dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));\r
+ }\r
+ );\r
+\r
+char *kernel_src_vscaleallnodither = KERNEL (\r
+\r
+ kernel void vscale_all_nodither_opencl (\r
+ global unsigned char *dst,\r
+ const global short *src,\r
+ const global short *yfilter,\r
+ int yfilterSize,\r
+ const global short *cfilter,\r
+ int cfilterSize,\r
+ const global int *yfilterPos,\r
+ const global int *cfilterPos,\r
+ int dstWidth,\r
+ int dstHeight,\r
+ int srcWidth,\r
+ int srcHeight,\r
+ int dstStride,\r
+ int dstChrStride,\r
+ int srcStride,\r
+ int srcChrStride)\r
+ {\r
+ const unsigned char hb_sws_pb_64[8] = {\r
+ 64, 64, 64, 64, 64, 64, 64, 64\r
+ };\r
+\r
+ int w = get_global_id(0);\r
+ int h = get_global_id(1);\r
+\r
+ int chrWidth = get_global_size(0);\r
+ int chrHeight = get_global_size(1);\r
+ const unsigned char *local_up_dither;\r
+ const unsigned char *local_down_dither;\r
+\r
+ local_up_dither = hb_sws_pb_64;\r
+ local_down_dither = hb_sws_pb_64;\r
+\r
+\r
+ //yscale;\r
+ int srcPos1 = (yfilterPos[h]) * srcStride + w;\r
+ int srcPos2 = (yfilterPos[h]) * srcStride + w + (chrWidth);\r
+ int srcPos3 = (yfilterPos[h + chrHeight]) * srcStride + w;\r
+ int srcPos4 = (yfilterPos[h + chrHeight]) * srcStride + w + chrWidth;\r
+ int src1Pos = dstStride * srcHeight + (cfilterPos[h]) * dstChrStride + (w);\r
+ int src2Pos = dstStride * srcHeight + (dstChrStride*(srcHeight>>1)) + (cfilterPos[h]) * dstChrStride + w;\r
+\r
+ int val1 = (local_up_dither[w & 7] << 12); //y offset is 0;\r
+ int val2 = (local_up_dither[(w + chrWidth) & 7] << 12);\r
+ int val3 = (local_down_dither[w &7] << 12);\r
+ int val4 = (local_down_dither[(w + chrWidth) & 7] << 12);\r
+ int val5 = (local_up_dither[w & 7] << 12);\r
+ int val6 = (local_up_dither[(w + 3) & 7] << 12); // 3 is offset of the chrome channel.\r
+\r
+\r
+ int j;\r
+ int filterPos1 = h * yfilterSize;\r
+ int filterPos2 = ( h + chrHeight ) * yfilterSize;\r
+ for(j = 0; j < yfilterSize; j++)\r
+ {\r
+ val1 += src[srcPos1] * yfilter[filterPos1 + j];\r
+ srcPos1 += srcStride;\r
+ val2 += src[srcPos2] * yfilter[filterPos1 + j];\r
+ srcPos2 += srcStride;\r
+ val3 += src[srcPos3] * yfilter[filterPos2 + j];\r
+ srcPos3 += srcStride;\r
+ val4 += src[srcPos4] * yfilter[filterPos2 + j];\r
+ srcPos4 += srcStride;\r
+ val5 += src[src1Pos] * cfilter[filterPos1 + j];\r
+ val6 += src[src2Pos] * cfilter[filterPos1 + j];\r
+ src1Pos += dstChrStride;\r
+ src2Pos += dstChrStride;\r
+ }\r
+ dst[h * dstStride + w] = (((val1 >> 19)&(~0xFF)) ? ((-(val1 >> 19)) >> 31) : (val1 >> 19));\r
+ dst[h * dstStride + w + chrWidth] = (((val2 >> 19)&(~0xFF)) ? ((-(val2 >> 19)) >> 31) : (val2 >> 19));\r
+ dst[(h + chrHeight) * dstStride + w] = (((val3 >> 19)&(~0xFF)) ? ((-(val3 >> 19)) >> 31) : (val3 >> 19));\r
+ dst[(h + chrHeight) * dstStride + w + chrWidth] = (((val4 >> 19)&(~0xFF)) ? ((-(val4 >> 19)) >> 31) : (val4 >> 19));;\r
+\r
+ int dst1Pos = dstStride * dstHeight + h * (dstChrStride) + (w);\r
+ int dst2Pos = (dstChrStride * chrHeight) + dst1Pos;\r
+ dst[dst1Pos] = (((val5 >> 19)&(~0xFF)) ? ((-(val5 >> 19)) >> 31) : (val5 >> 19));\r
+ dst[dst2Pos] = (((val6 >> 19)&(~0xFF)) ? ((-(val6 >> 19)) >> 31) : (val6 >> 19));\r
+ }\r
+ );\r
+\r
+char *kernel_src_vscalefast = KERNEL (\r
+\r
+ kernel void vscale_fast_opencl (\r
+ global unsigned char *dst,\r
+ const global short *src,\r
+ const global int *yfilterPos,\r
+ const global int *cfilterPos,\r
+ int dstWidth,\r
+ int dstHeight,\r
+ int srcWidth,\r
+ int srcHeight,\r
+ int dstStride,\r
+ int dstChrStride,\r
+ int srcStride,\r
+ int srcChrStride)\r
+ {\r
+ const unsigned char hb_sws_pb_64[8] = {\r
+ 64, 64, 64, 64, 64, 64, 64, 64\r
+ };\r
+\r
+ int w = get_global_id(0);\r
+ int h = get_global_id(1);\r
+\r
+ int chrWidth = get_global_size(0);\r
+ int chrHeight = get_global_size(1);\r
+\r
+ const unsigned char *local_up_dither;\r
+ const unsigned char *local_down_dither;\r
+\r
+ local_up_dither = hb_sws_pb_64;\r
+ local_down_dither = hb_sws_pb_64;\r
+\r
+\r
+ int rightpart = w + chrWidth;\r
+ int bh = h + chrHeight; // bottom part\r
+ short val1 = (src[(yfilterPos[h]) * dstStride + w] + local_up_dither[(w + 0) & 7]) >> 7; //lum offset is 0;\r
+ short val2 = (src[(yfilterPos[h]) * dstStride + rightpart] + local_up_dither[rightpart & 7]) >> 7;\r
+ short val3 = (src[(yfilterPos[bh]) * dstStride + w] + local_down_dither[w & 7]) >> 7;\r
+ short val4 = (src[(yfilterPos[bh]) * dstStride + rightpart] + local_down_dither[rightpart & 7]) >> 7;\r
+ dst[h * dstStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));\r
+ dst[h * dstStride + rightpart] = ((val2&(~0xFF)) ? ((-val2) >> 31) : (val2));\r
+ dst[bh * dstStride + w] = ((val3&(~0xFF)) ? ((-val3) >> 31) : (val3));\r
+ dst[bh * dstStride + rightpart] = ((val4&(~0xFF)) ? ((-val4) >> 31) : (val4));\r
+\r
+ src += dstStride * srcHeight;\r
+ dst += dstStride * dstHeight;\r
+ val1 = (src[cfilterPos[h] * (dstChrStride) + w] + local_up_dither[ w & 7]) >> 7;\r
+ dst[h * (dstChrStride) + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));\r
+\r
+ src += dstChrStride * (srcHeight >> 1);\r
+ dst += dstChrStride * chrHeight;\r
+ val1 = (src[cfilterPos[h] * dstChrStride + w] + local_up_dither[ (w + 3) & 7] ) >> 7;\r
+ dst[h * dstChrStride + w] = ((val1&(~0xFF)) ? ((-val1) >> 31) : (val1));\r
+\r
+ }
+ );
+
+char *kernel_src_scale = KERNEL (
+
+__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void frame_scale(__global uchar *dst,
+ __global const uchar *src,
+ const float xscale,
+ const float yscale,
+ const int srcPlaneOffset0,
+ const int srcPlaneOffset1,
+ const int srcPlaneOffset2,
+ const int dstPlaneOffset0,
+ const int dstPlaneOffset1,
+ const int dstPlaneOffset2,
+ const int srcRowWords0,
+ const int srcRowWords1,
+ const int srcRowWords2,
+ const int dstRowWords0,
+ const int dstRowWords1,
+ const int dstRowWords2,
+ const int srcWidth,
+ const int srcHeight,
+ const int dstWidth,
+ const int dstHeight,
+ __global const float4* restrict xweights,
+ __global const float4* restrict yweights
+ )
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Abort work items outside the dst image bounds.
+
+ if ((get_group_id(0) * 64 >= (dstWidth >> ((z == 0) ? 0 : 1))) || (get_group_id(1) * 16 >= (dstHeight >> ((z == 0) ? 0 : 1))))
+ return;
+
+ const int srcPlaneOffset = (z == 0) ? srcPlaneOffset0 : ((z == 1) ? srcPlaneOffset1 : srcPlaneOffset2);
+ const int dstPlaneOffset = (z == 0) ? dstPlaneOffset0 : ((z == 1) ? dstPlaneOffset1 : dstPlaneOffset2);
+ const int srcRowWords = (z == 0) ? srcRowWords0: ((z == 1) ? srcRowWords1 : srcRowWords2);
+ const int dstRowWords = (z == 0) ? dstRowWords0: ((z == 1) ? dstRowWords1 : dstRowWords2);
+
+ __local uchar pixels[64 * 36];
+ const int localRowPixels = 64;
+ const int groupHeight = 16; // src pixel height output by the workgroup
+ const int ypad = 2;
+ const int localx = get_local_id(0);
+
+ const int globalStartRow = floor((get_group_id(1) * groupHeight) / yscale);
+ const int globalRowCount = ceil(groupHeight / yscale) + 2 * ypad;
+
+ float4 weights = xweights[x];
+ int4 woffs = floor(x / xscale);
+ woffs += (int4)(-1, 0, 1, 2);
+ woffs = clamp(woffs, 0, (srcWidth >> ((z == 0) ? 0 : 1)) - 1);
+ const int maxy = (srcHeight >> ((z == 0) ? 0 : 1)) - 1;
+
+ // Scale x from global into LDS
+
+ for (int i = 0; i <= globalRowCount; ++i) {
+ int4 offs = srcPlaneOffset + clamp(globalStartRow - ypad + i, 0, maxy) * srcRowWords;
+ offs += woffs;
+ pixels[localx + i * localRowPixels] = convert_uchar(clamp(round(dot(weights,
+ (float4)(src[offs.x], src[offs.y], src[offs.z], src[offs.w]))), 0.0f, 255.0f));
+ }
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Scale y from LDS into global
+
+ if (x >= dstWidth >> ((z == 0) ? 0 : 1))
+ return;
+
+ int off = dstPlaneOffset + x + (get_group_id(1) * groupHeight) * dstRowWords;
+
+ for (int i = 0; i < groupHeight; ++i) {
+ if (y >= dstHeight >> ((z == 0) ? 0 : 1))
+ break;
+ int localy = floor((get_group_id(1) * groupHeight + i) / yscale);
+ localy = localy - globalStartRow + ypad;
+ int loff = localx + localy * localRowPixels;
+ dst[off] = convert_uchar(clamp(round(dot(yweights[get_group_id(1) * groupHeight + i],
+ (float4)(pixels[loff - localRowPixels], pixels[loff], pixels[loff + localRowPixels]
+ , pixels[loff + localRowPixels * 2]))), 0.0f, 255.0f));
+ off += dstRowWords;
+ }
+}
+);
+
+
+char *kernel_src_yadif_filter = KERNEL(
+ void filter_v6(
+ global unsigned char *dst,
+ global unsigned char *prev, \r
+ global unsigned char *cur, \r
+ global unsigned char *next,\r
+ int x,\r
+ int y,\r
+ int width,\r
+ int height,\r
+ int parity,\r
+ int inlinesize,\r
+ int outlinesize,\r
+ int inmode,\r
+ int uvflag\r
+ )\r
+ {\r
+\r
+ int flag = uvflag * (y >=height) * height; \r
+ int prefs = select(-(inlinesize), inlinesize,((y+1) - flag) <height);\r
+ int mrefs = select(inlinesize, -(inlinesize),y - flag);\r
+ int mode = select(inmode,2,(y - flag==1) || (y - flag + 2==height));\r
+ int score;\r
+ \r
+ global unsigned char *prev2 = parity ? prev : cur ;\r
+ global unsigned char *next2 = parity ? cur : next;\r
+ int index = x + y * inlinesize;\r
+ int outindex = x + y * outlinesize;\r
+ int c = cur[index + mrefs]; \r
+ int d = (prev2[index] + next2[index])>>1; \r
+ int e = cur[index + prefs]; \r
+ int temporal_diff0 = abs((prev2[index]) - (next2[index])); \r
+ int temporal_diff1 =(abs(prev[index + mrefs] - c) + abs(prev[index + prefs] - e) )>>1; \r
+ int temporal_diff2 =(abs(next[index + mrefs] - c) + abs(next[index + prefs] - e) )>>1; \r
+ int diff = max(max(temporal_diff0>>1, temporal_diff1), temporal_diff2); \r
+ int spatial_pred = (c+e)>>1; \r
+ int spatial_score = abs(cur[index + mrefs-1] - cur[index + prefs-1]) + abs(c-e) + abs(cur[index + mrefs+1] - cur[index + prefs+1]) - 1; \r
+ //check -1\r
+ score = abs(cur[index + mrefs-2] - cur[index + prefs])\r
+ + abs(cur[index + mrefs-1] - cur[index + prefs+1])\r
+ + abs(cur[index + mrefs] - cur[index + prefs+2]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs-1] + cur[index + prefs+1])>>1;\r
+ }\r
+ //check -2\r
+ score = abs(cur[index + mrefs-3] - cur[index + prefs+1])\r
+ + abs(cur[index + mrefs-2] - cur[index + prefs+2])\r
+ + abs(cur[index + mrefs-1] - cur[index + prefs+3]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs-2] + cur[index + prefs+2])>>1;\r
+ }\r
+ //check 1\r
+ score = abs(cur[index + mrefs] - cur[index + prefs-2])\r
+ + abs(cur[index + mrefs+1] - cur[index + prefs-1])\r
+ + abs(cur[index + mrefs+2] - cur[index + prefs]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs+1] + cur[index + prefs-1])>>1;\r
+ }\r
+ //check 2\r
+ score = abs(cur[index + mrefs+1] - cur[index + prefs-3])\r
+ + abs(cur[index + mrefs+2] - cur[index + prefs-2])\r
+ + abs(cur[index + mrefs+3] - cur[index + prefs-1]);\r
+ if (score < spatial_score)\r
+ {\r
+ spatial_score= score;\r
+ spatial_pred= (cur[index + mrefs+2] + cur[index + prefs-2])>>1;\r
+ }\r
+ if (mode < 2)\r
+ { \r
+ int b = (prev2[index + (mrefs<<1)] + next2[index + (mrefs<<1)])>>1; \r
+ int f = (prev2[index + (prefs<<1)] + next2[index + (prefs<<1)])>>1; \r
+ int diffmax = max(max(d-e, d-c), min(b-c, f-e)); \r
+ int diffmin = min(min(d-e, d-c), max(b-c, f-e)); \r
+\r
+ diff = max(max(diff, diffmin), -diffmax); \r
+ } \r
+ if (spatial_pred > d + diff) \r
+ {\r
+ spatial_pred = d + diff; \r
+ }\r
+ else if (spatial_pred < d - diff) \r
+ {\r
+ spatial_pred = d - diff; \r
+ }\r
+\r
+ dst[outindex] = spatial_pred; \r
+ }\r
+\r
+ kernel void yadif_filter(\r
+ global unsigned char *dst,\r
+ global unsigned char *prev,\r
+ global unsigned char *cur,\r
+ global unsigned char *next,\r
+ int parity,\r
+ int inlinesizeY,\r
+ int inlinesizeUV,\r
+ int outlinesizeY,\r
+ int outlinesizeUV,\r
+ int mode)\r
+ {\r
+ int x=get_global_id(0);\r
+ int y=(get_global_id(1)<<1) + (!parity);\r
+ int width=(get_global_size(0)<<1)/3;\r
+ int height=get_global_size(1)<<1;\r
+ \r
+\r
+ global unsigned char *dst_Y=dst;\r
+ global unsigned char *dst_U=dst_Y+height*outlinesizeY;\r
+\r
+ global unsigned char *prev_Y=prev;\r
+ global unsigned char *prev_U=prev_Y+height*inlinesizeY;\r
+\r
+ global unsigned char *cur_Y=cur;\r
+ global unsigned char *cur_U=cur_Y+height*inlinesizeY;\r
+\r
+ global unsigned char *next_Y=next;\r
+ global unsigned char *next_U=next_Y+height*inlinesizeY;\r
+\r
+ if(x < width)\r
+ {\r
+ filter_v6(dst_Y,prev_Y,cur_Y,next_Y,x,y,width,height,parity,inlinesizeY,outlinesizeY,mode,0);\r
+ }\r
+ else\r
+ {\r
+ x = x - width;\r
+ filter_v6(dst_U,prev_U,cur_U,next_U,x,y,width>>1,height>>1,parity,inlinesizeUV,outlinesizeUV,mode,1);\r
+ }\r
+ }\r
+ );\r
+\r
+#endif\r
--- /dev/null
+/* openclwrapper.c\r
+\r
+ Copyright (c) 2003-2012 HandBrake Team\r
+ This file is part of the HandBrake source code\r
+ Homepage: <http://handbrake.fr/>.\r
+ It may be used under the terms of the GNU General Public License v2.\r
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html\r
+\r
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ */\r
+ \r
+#ifdef USE_OPENCL\r
+\r
+#include <stdio.h>\r
+#include <stdlib.h>\r
+#include <string.h>\r
+#include "openclwrapper.h"\r
+#include "openclkernels.h"\r
+\r
+//#define USE_EXTERNAL_KERNEL\r
+#ifdef SYS_MINGW\r
+#include <windows.h>\r
+#endif\r
+\r
+#if defined(__APPLE__)\r
+#include <OpenCL/cl.h>\r
+#else\r
+#include <CL/cl.h>\r
+#endif\r
+\r
+#if defined(_MSC_VER)\r
+#define strcasecmp strcmpi\r
+#endif\r
+\r
+#define MAX_KERNEL_STRING_LEN 64\r
+#define MAX_CLFILE_NUM 50\r
+#define MAX_CLKERNEL_NUM 200\r
+#define MAX_CLFILE_PATH 255\r
+#define MAX_KERNEL_NUM 50\r
+#define MAX_KERNEL_NAME_LEN 64\r
+\r
+#ifndef INVALID_HANDLE_VALUE\r
+#define INVALID_HANDLE_VALUE NULL\r
+#endif\r
+\r
+//#define THREAD_PRIORITY_TIME_CRITICAL 15\r
+\r
+enum VENDOR\r
+{\r
+ AMD = 0,\r
+ Intel,\r
+ NVIDIA,\r
+ others\r
+};\r
+typedef struct _GPUEnv\r
+{\r
+ //share vb in all modules in hb library\r
+ cl_platform_id platform;\r
+ cl_device_type dType;\r
+ cl_context context;\r
+ cl_device_id * devices;\r
+ cl_device_id dev;\r
+ cl_command_queue command_queue;\r
+ cl_kernel kernels[MAX_CLFILE_NUM];\r
+ cl_program programs[MAX_CLFILE_NUM]; //one program object maps one kernel source file\r
+ char kernelSrcFile[MAX_CLFILE_NUM][256]; //the max len of kernel file name is 256\r
+ int file_count; // only one kernel file\r
+\r
+ char kernel_names[MAX_CLKERNEL_NUM][MAX_KERNEL_STRING_LEN+1];\r
+ cl_kernel_function kernel_functions[MAX_CLKERNEL_NUM];\r
+ int kernel_count;\r
+ int isUserCreated; // 1: created , 0:no create and needed to create by opencl wrapper\r
+ enum VENDOR vendor;\r
+}GPUEnv;\r
+\r
+typedef struct\r
+{\r
+ char kernelName[MAX_KERNEL_NAME_LEN+1];\r
+ char * kernelStr;\r
+}hb_kernel_node;\r
+
+static GPUEnv gpu_env;
+static int isInited = 0;
+static int useBuffers = 0;
+static hb_kernel_node gKernels[MAX_KERNEL_NUM];
+
+#define ADD_KERNEL_CFG( idx, s, p ){\
+ strcpy( gKernels[idx].kernelName, s );\\r
+ gKernels[idx].kernelStr = p;\\r
+ strcpy( gpu_env.kernel_names[idx], s );\\r
+ gpu_env.kernel_count++; }\r
+\r
+\r
+/**\r
+ * hb_confirm_gpu_type\r
+ */\r
+int hb_confirm_gpu_type()\r
+{\r
+ int status = 1;\r
+ unsigned int i, j;\r
+ cl_uint numPlatforms = 0; \r
+ status = clGetPlatformIDs(0,NULL,&numPlatforms); \r
+ if(status != 0) \r
+ { \r
+ goto end; \r
+ } \r
+ if(numPlatforms > 0) \r
+ { \r
+ cl_platform_id* platforms = (cl_platform_id* )malloc (numPlatforms * sizeof(cl_platform_id)); \r
+ status = clGetPlatformIDs (numPlatforms, platforms, NULL); \r
+ if (status != 0) \r
+ { \r
+ goto end; \r
+ } \r
+ for (i=0; i < numPlatforms; i++)\r
+ { \r
+ char pbuff[100];\r
+ cl_uint numDevices;\r
+ status = clGetPlatformInfo( platforms[i], \r
+ CL_PLATFORM_VENDOR, \r
+ sizeof (pbuff), \r
+ pbuff,\r
+ NULL); \r
+ if (status)\r
+ continue;\r
+ status = clGetDeviceIDs( platforms[i], \r
+ CL_DEVICE_TYPE_GPU , \r
+ 0 , \r
+ NULL , \r
+ &numDevices); \r
+ \r
+ cl_device_id *devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));\r
+ status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);\r
+ for (j = 0; j < numDevices; j++)\r
+ {\r
+ char dbuff[100];\r
+ status = clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(dbuff), dbuff, NULL); \r
+ if (!strcmp(dbuff, "Advanced Micro Devices, Inc.") ||\r
+ !strcmp(dbuff, "Intel(R) Corporation") ||\r
+#ifdef __APPLE__\r
+ !strcmp(dbuff, "AMD") ||\r
+ /* MacBook Pro, AMD ATI Radeon HD 6750M, OS X 10.8.3 */\r
+ !strcmp(dbuff, "NVIDIA") ||\r
+ /* MacBook Pro, NVIDIA GeForce GT 330M, OS X 10.7.4 */\r
+#endif\r
+ !strcmp(dbuff, "NVIDIA Corporation"))\r
+ {\r
+ return 0;\r
+ }\r
+ }\r
+\r
+ if ( status != CL_SUCCESS )\r
+ continue;\r
+ if( numDevices ) \r
+ break; \r
+ } \r
+ free( platforms ); \r
+ } \r
+ end:\r
+ return -1;\r
+}\r
+\r
+/**\r
+ * hb_regist_opencl_kernel\r
+ */
+int hb_regist_opencl_kernel()
+{
+ //if( !gpu_env.isUserCreated )
+ // memset( &gpu_env, 0, sizeof(gpu_env) );
+ //Comment for posterity: When in doubt just zero out a structure full of pointers to allocated resources.
+
+ gpu_env.file_count = 0; //argc;
+ gpu_env.kernel_count = 0UL;
+
+ ADD_KERNEL_CFG( 0, "frame_scale", NULL )
+ ADD_KERNEL_CFG( 1, "yadif_filter", NULL )
+
+ return 0;
+}
+\r
+/**\r
+ * hb_regist_opencl_kernel\r
+ * @param filename -\r
+ * @param source -\r
+ * @param gpu_info -\r
+ * @param int idx -\r
+ */\r
+int hb_convert_to_string( const char *filename, char **source, GPUEnv *gpu_info, int idx )\r
+{\r
+ int file_size;\r
+ size_t result;\r
+ FILE * file = NULL;\r
+ file_size = 0;\r
+ result = 0;\r
+ file = fopen( filename, "rb+" );\r
+\r
+ if( file!=NULL )\r
+ {\r
+ fseek( file, 0, SEEK_END );\r
+\r
+ file_size = ftell( file );\r
+ rewind( file );\r
+ *source = (char*)malloc( sizeof(char) * file_size + 1 );\r
+ if( *source == (char*)NULL )\r
+ {\r
+ return(0);\r
+ }\r
+ result = fread( *source, 1, file_size, file );\r
+ if( result != file_size )\r
+ {\r
+ free( *source );\r
+ return(0);\r
+ }\r
+ (*source)[file_size] = '\0';\r
+ fclose( file );\r
+\r
+ return(1);\r
+ }\r
+ return(0);\r
+}\r
+\r
+/**\r
+ * hb_binary_generated\r
+ * @param context -\r
+ * @param cl_file_name -\r
+ * @param fhandle -\r
+ */\r
+int hb_binary_generated( cl_context context, const char * cl_file_name, FILE ** fhandle )\r
+{\r
+ int i = 0;\r
+ cl_int status;\r
+ cl_uint numDevices;\r
+ cl_device_id *devices;\r
+ char * str = NULL;\r
+ FILE * fd = NULL;\r
+\r
+ status = clGetContextInfo( context,\r
+ CL_CONTEXT_NUM_DEVICES,\r
+ sizeof(numDevices),\r
+ &numDevices,\r
+ NULL );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log( "OpenCL: Get context info failed" );\r
+ return 0;\r
+ }\r
+\r
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );\r
+ if( devices == NULL )\r
+ {\r
+ hb_log( "OpenCL: No device found" );\r
+ return 0;\r
+ }\r
+\r
+ /* grab the handles to all of the devices in the context. */\r
+ status = clGetContextInfo( context,\r
+ CL_CONTEXT_DEVICES,\r
+ sizeof(cl_device_id) * numDevices,\r
+ devices,\r
+ NULL );\r
+\r
+ status = 0;\r
+ /* dump out each binary into its own separate file. */\r
+ for (i = 0; i < numDevices; i++)\r
+ {\r
+ char fileName[256] = { 0 };\r
+ char cl_name[128] = { 0 };\r
+ if (devices[i])\r
+ {\r
+ char deviceName[1024];\r
+ status = clGetDeviceInfo(devices[i],\r
+ CL_DEVICE_NAME,\r
+ sizeof(deviceName),\r
+ deviceName,\r
+ NULL);\r
+\r
+ str = (char*)strstr(cl_file_name, ".cl");\r
+ memcpy(cl_name, cl_file_name, str - cl_file_name);\r
+ cl_name[str - cl_file_name] = '\0';\r
+ sprintf(fileName, "./%s - %s.bin", cl_name, deviceName);\r
+ fd = fopen(fileName, "rb");\r
+ status = fd != NULL;\r
+ }\r
+ }\r
+\r
+ if( devices != NULL )\r
+ {\r
+ free( devices );\r
+ devices = NULL;\r
+ }\r
+\r
+ if( fd != NULL )\r
+ *fhandle = fd;\r
+\r
+ return status;\r
+}\r
+\r
+/**\r
+ * hb_write_binary_to_file\r
+ * @param fileName -\r
+ * @param birary -\r
+ * @param numBytes -\r
+ */\r
+int hb_write_binary_to_file( const char* fileName, const char* birary, size_t numBytes )\r
+{\r
+ FILE *output = NULL;\r
+ output = fopen( fileName, "wb" );\r
+ if( output == NULL )\r
+ return 0;\r
+\r
+ fwrite( birary, sizeof(char), numBytes, output );\r
+ fclose( output );\r
+\r
+ return 1;\r
+}\r
+\r
+/**\r
+ * hb_generat_bin_from_kernel_source\r
+ * @param program -\r
+ * @param cl_file_name -\r
+ */\r
+int hb_generat_bin_from_kernel_source( cl_program program, const char * cl_file_name )\r
+{\r
+ int i = 0;\r
+ cl_int status;\r
+ cl_uint numDevices;\r
+ size_t *binarySizes;\r
+ cl_device_id *devices;\r
+ char **binaries;\r
+ char *str = NULL;\r
+\r
+ status = clGetProgramInfo( program,\r
+ CL_PROGRAM_NUM_DEVICES,\r
+ sizeof(numDevices),\r
+ &numDevices,\r
+ NULL );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_NUM_DEVICES failed");\r
+ return 0;\r
+ }\r
+\r
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );\r
+ if( devices == NULL )\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: no device found");\r
+ return 0;\r
+ }\r
+\r
+ /* grab the handles to all of the devices in the program. */\r
+ status = clGetProgramInfo( program,\r
+ CL_PROGRAM_DEVICES,\r
+ sizeof(cl_device_id) * numDevices,\r
+ devices,\r
+ NULL );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_DEVICES failed");\r
+ return 0;\r
+ }\r
+\r
+ /* figure out the sizes of each of the binaries. */\r
+ binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices );\r
+\r
+ status = clGetProgramInfo( program,\r
+ CL_PROGRAM_BINARY_SIZES,\r
+ sizeof(size_t) * numDevices,\r
+ binarySizes, NULL );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARY_SIZES failed");\r
+ return 0;\r
+ }\r
+\r
+ /* copy over all of the generated binaries. */\r
+ binaries = (char**)malloc( sizeof(char *) * numDevices );\r
+ if( binaries == NULL )\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries failed");\r
+ return 0;\r
+ }\r
+\r
+ for( i = 0; i < numDevices; i++ )\r
+ {\r
+ if( binarySizes[i] != 0 )\r
+ {\r
+ binaries[i] = (char*)malloc( sizeof(char) * binarySizes[i] );\r
+ if( binaries[i] == NULL )\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: malloc for binaries[%d] failed", i);\r
+ return 0;\r
+ }\r
+ }\r
+ else\r
+ {\r
+ binaries[i] = NULL;\r
+ }\r
+ }\r
+\r
+ status = clGetProgramInfo( program,\r
+ CL_PROGRAM_BINARIES,\r
+ sizeof(char *) * numDevices,\r
+ binaries,\r
+ NULL );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: clGetProgramInfo for CL_PROGRAM_BINARIES failed");\r
+ return 0;\r
+ }\r
+\r
+ /* dump out each binary into its own separate file. */\r
+ for (i = 0; i < numDevices; i++)\r
+ {\r
+ char fileName[256] = {0};\r
+ char cl_name[128] = {0};\r
+ if (binarySizes[i])\r
+ {\r
+ char deviceName[1024];\r
+ status = clGetDeviceInfo(devices[i],\r
+ CL_DEVICE_NAME,\r
+ sizeof(deviceName),\r
+ deviceName,\r
+ NULL);\r
+\r
+ str = (char*)strstr( cl_file_name, (char*)".cl" );\r
+ memcpy(cl_name, cl_file_name, str - cl_file_name);\r
+ cl_name[str - cl_file_name] = '\0';\r
+ sprintf(fileName, "./%s - %s.bin", cl_name, deviceName);\r
+\r
+ if (!hb_write_binary_to_file(fileName, binaries[i], binarySizes[i]))\r
+ {\r
+ hb_log("OpenCL: hb_generat_bin_from_kernel_source: unable to write kernel, writing to temporary directory instead.");\r
+ return 0;\r
+ }\r
+ }\r
+ }\r
+\r
+ // Release all resouces and memory\r
+ for( i = 0; i < numDevices; i++ )\r
+ {\r
+ if( binaries[i] != NULL )\r
+ {\r
+ free( binaries[i] );\r
+ binaries[i] = NULL;\r
+ }\r
+ }\r
+\r
+ if( binaries != NULL )\r
+ {\r
+ free( binaries );\r
+ binaries = NULL;\r
+ }\r
+\r
+ if( binarySizes != NULL )\r
+ {\r
+ free( binarySizes );\r
+ binarySizes = NULL;\r
+ }\r
+\r
+ if( devices != NULL )\r
+ {\r
+ free( devices );\r
+ devices = NULL;\r
+ }\r
+ return 1;\r
+}\r
+\r
+\r
+/**\r
+ * hb_init_opencl_attr\r
+ * @param env -\r
+ */\r
+int hb_init_opencl_attr( OpenCLEnv * env )\r
+{\r
+ if( gpu_env.isUserCreated )\r
+ return 1;\r
+\r
+ gpu_env.context = env->context;\r
+ gpu_env.platform = env->platform;\r
+ gpu_env.dev = env->devices;\r
+ gpu_env.command_queue = env->command_queue;\r
+\r
+ gpu_env.isUserCreated = 1;\r
+\r
+ return 0;\r
+}\r
+\r
+/**\r
+ * hb_create_kernel\r
+ * @param kernelname -\r
+ * @param env -\r
+ */\r
+int hb_create_kernel( char * kernelname, KernelEnv * env )\r
+{\r
+ int status;\r
+ env->kernel = clCreateKernel( gpu_env.programs[0], kernelname, &status );\r
+ env->context = gpu_env.context;\r
+ env->command_queue = gpu_env.command_queue;\r
+ return status != CL_SUCCESS ? 1 : 0;\r
+}\r
+\r
+/**\r
+ * hb_release_kernel\r
+ * @param env -\r
+ */\r
+int hb_release_kernel( KernelEnv * env )\r
+{\r
+ int status = clReleaseKernel( env->kernel );\r
+ return status != CL_SUCCESS ? 1 : 0;\r
+}\r
+\r
+/**\r
+ * hb_init_opencl_env
+ * @param gpu_info -
+ */
+
+static int init_once = 0;
+int hb_init_opencl_env( GPUEnv *gpu_info )
+{
+ size_t length;
+ cl_int status;\r
+ cl_uint numPlatforms, numDevices;\r
+ cl_platform_id *platforms;\r
+ cl_context_properties cps[3];\r
+ char platformName[100];\r
+ unsigned int i;
+ void *handle = INVALID_HANDLE_VALUE;
+
+
+ if (init_once != 0)
+ return 0;
+ else
+ init_once = 1;
+ /*
+ * Have a look at the available platforms.
+ */
+ if( !gpu_info->isUserCreated )\r
+ {\r
+ status = clGetPlatformIDs( 0, NULL, &numPlatforms );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log( "OpenCL: OpenCL device platform not found." );\r
+ return(1);\r
+ }\r
+\r
+ gpu_info->platform = NULL;\r
+ if( 0 < numPlatforms )\r
+ {\r
+ platforms = (cl_platform_id*)malloc(\r
+ numPlatforms * sizeof(cl_platform_id));\r
+ if( platforms == (cl_platform_id*)NULL )\r
+ {\r
+ return(1);\r
+ }\r
+ status = clGetPlatformIDs( numPlatforms, platforms, NULL );\r
+\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log( "OpenCL: Specific opencl platform not found." );\r
+ return(1);\r
+ }\r
+\r
+ for( i = 0; i < numPlatforms; i++ )\r
+ {\r
+ status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,\r
+ sizeof(platformName), platformName,\r
+ NULL );\r
+\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ continue;\r
+ }\r
+ gpu_info->platform = platforms[i];\r
+\r
+ if (!strcmp(platformName, "Advanced Micro Devices, Inc.") ||\r
+ !strcmp(platformName, "AMD"))\r
+ gpu_info->vendor = AMD;\r
+ else \r
+ gpu_info->vendor = others;\r
+ \r
+ gpu_info->platform = platforms[i];\r
+\r
+ status = clGetDeviceIDs( gpu_info->platform /* platform */,\r
+ CL_DEVICE_TYPE_GPU /* device_type */,\r
+ 0 /* num_entries */,\r
+ NULL /* devices */,\r
+ &numDevices );\r
+\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ continue;\r
+ }\r
+\r
+ if( numDevices )\r
+ break;\r
+ \r
+ }\r
+ free( platforms );\r
+ }\r
+\r
+ if( NULL == gpu_info->platform )\r
+ {\r
+ hb_log( "OpenCL: No OpenCL-compatible GPU found." );\r
+ return(1);\r
+ }\r
+\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log( "OpenCL: No OpenCL-compatible GPU found." );\r
+ return(1);\r
+ }\r
+\r
+ /*\r
+ * Use available platform.\r
+ */\r
+ cps[0] = CL_CONTEXT_PLATFORM;\r
+ cps[1] = (cl_context_properties)gpu_info->platform;\r
+ cps[2] = 0;\r
+ /* Check for GPU. */\r
+ gpu_info->dType = CL_DEVICE_TYPE_GPU;\r
+ gpu_info->context = clCreateContextFromType(\r
+ cps, gpu_info->dType, NULL, NULL, &status );\r
+\r
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )\r
+ {\r
+ gpu_info->dType = CL_DEVICE_TYPE_CPU;\r
+ gpu_info->context = clCreateContextFromType(\r
+ cps, gpu_info->dType, NULL, NULL, &status );\r
+ }\r
+\r
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )\r
+ {\r
+ gpu_info->dType = CL_DEVICE_TYPE_DEFAULT;\r
+ gpu_info->context = clCreateContextFromType(\r
+ cps, gpu_info->dType, NULL, NULL, &status );\r
+ }\r
+\r
+ if( (gpu_info->context == (cl_context)NULL) || (status != CL_SUCCESS) )\r
+ {\r
+ hb_log( "OpenCL: Unable to create opencl context." );\r
+ return(1);\r
+ }\r
+\r
+ /* Detect OpenCL devices. */\r
+ /* First, get the size of device list data */\r
+ status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES,\r
+ 0, NULL, &length );\r
+ if((status != CL_SUCCESS) || (length == 0))\r
+ {\r
+ hb_log( "OpenCL: Unable to get the list of devices in context." );\r
+ return(1);\r
+ }\r
+\r
+ /* Now allocate memory for device list based on the size we got earlier */\r
+ gpu_info->devices = (cl_device_id*)malloc( length );\r
+ if( gpu_info->devices == (cl_device_id*)NULL )\r
+ {\r
+ return(1);\r
+ }\r
+\r
+ /* Now, get the device list data */\r
+ status = clGetContextInfo( gpu_info->context, CL_CONTEXT_DEVICES, length,\r
+ gpu_info->devices, NULL );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log( "OpenCL: Unable to get the device list data in context." );\r
+ return(1);\r
+ }\r
+\r
+ /* Create OpenCL command queue. */\r
+ gpu_info->command_queue = clCreateCommandQueue( gpu_info->context,\r
+ gpu_info->devices[0],\r
+ 0, &status );\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log( "OpenCL: Unable to create opencl command queue." );\r
+ return(1);\r
+ }\r
+ }\r
+\r
+ if( clGetCommandQueueInfo( gpu_info->command_queue,\r
+ CL_QUEUE_THREAD_HANDLE_AMD, sizeof(handle),\r
+ &handle, NULL ) == CL_SUCCESS && handle != INVALID_HANDLE_VALUE )\r
+ {\r
+#ifdef SYS_MINGW \r
+ SetThreadPriority( handle, THREAD_PRIORITY_TIME_CRITICAL );\r
+#endif\r
+ }\r
+\r
+ return 0;\r
+}\r
+\r
+\r
+/**\r
+ * hb_release_opencl_env\r
+ * @param gpu_info -\r
+ */\r
+int hb_release_opencl_env( GPUEnv *gpu_info )\r
+{\r
+ if( !isInited )\r
+ return 1;\r
+ int i;\r
+\r
+ for( i = 0; i<gpu_env.file_count; i++ )\r
+ {\r
+ if( gpu_env.programs[i] ) ;\r
+ {\r
+ clReleaseProgram( gpu_env.programs[i] );\r
+ gpu_env.programs[i] = NULL;\r
+ }\r
+ }\r
+\r
+ if( gpu_env.command_queue )\r
+ {\r
+ clReleaseCommandQueue( gpu_env.command_queue );\r
+ gpu_env.command_queue = NULL;\r
+ }\r
+\r
+ if( gpu_env.context )\r
+ {\r
+ clReleaseContext( gpu_env.context );\r
+ gpu_env.context = NULL;\r
+ }\r
+\r
+ isInited = 0;\r
+ gpu_info->isUserCreated = 0;\r
+ return 1;\r
+}\r
+\r
+\r
+/**\r
+ * hb_register_kernel_wrapper\r
+ * @param kernel_name -\r
+ * @param function -\r
+ */\r
+int hb_register_kernel_wrapper( const char *kernel_name, cl_kernel_function function )\r
+{\r
+ int i;\r
+ for( i = 0; i < gpu_env.kernel_count; i++ )\r
+ {\r
+ if( strcasecmp( kernel_name, gpu_env.kernel_names[i] ) == 0 )\r
+ {\r
+ gpu_env.kernel_functions[i] = function;\r
+ return(1);\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+/**\r
+ * hb_cached_of_kerner_prg\r
+ * @param gpu_env -\r
+ * @param cl_file_name -\r
+ */\r
+int hb_cached_of_kerner_prg( const GPUEnv *gpu_env, const char * cl_file_name )\r
+{\r
+ int i;\r
+ for( i = 0; i < gpu_env->file_count; i++ )\r
+ {\r
+ if( strcasecmp( gpu_env->kernelSrcFile[i], cl_file_name ) == 0 )\r
+ {\r
+ if( gpu_env->programs[i] != NULL )\r
+ return(1);\r
+ }\r
+ }\r
+\r
+ return(0);\r
+}\r
+\r
+/**\r
+ * hb_compile_kernel_file\r
+ * @param filename -\r
+ * @param gpu_info -\r
+ * @param indx -\r
+ * @param build_option -\r
+ */\r
+int hb_compile_kernel_file( const char *filename, GPUEnv *gpu_info,\r
+ int indx, const char *build_option )\r
+{\r
+ cl_int status;\r
+ size_t length;\r
+ char *source_str;\r
+ const char *source;\r
+ size_t source_size[1];\r
+ char *buildLog = NULL;\r
+ int b_error, binary_status, binaryExisted;\r
+ char * binary;\r
+ cl_uint numDevices;\r
+ cl_device_id *devices;\r
+ FILE * fd;\r
+ FILE * fd1;\r
+ int idx;\r
+\r
+ if( hb_cached_of_kerner_prg( gpu_info, filename ) == 1 )\r
+ return (1);\r
+\r
+ idx = gpu_info->file_count;\r
+\r
+#ifdef USE_EXTERNAL_KERNEL\r
+ status = hb_convert_to_string( filename, &source_str, gpu_info, idx );\r
+ if( status == 0 )
+ return(0);
+#else
+ int kernel_src_size = strlen(kernel_src_scale) + strlen(kernel_src_yadif_filter);
+
+// char *scale_src;
+// status = hb_convert_to_string("./scale_kernels.cl", &scale_src, gpu_info, idx);
+// if (status != 0)
+// kernel_src_size += strlen(scale_src);
+
+ source_str = (char*)malloc( kernel_src_size + 2 );
+ strcpy( source_str, kernel_src_scale );
+// strcat( source_str, scale_src ); //
+ strcat( source_str, kernel_src_yadif_filter );
+#endif
+
+ source = source_str;\r
+ source_size[0] = strlen( source );\r
+\r
+ if ((binaryExisted = hb_binary_generated(gpu_info->context, filename, &fd)) == 1)\r
+ {\r
+ status = clGetContextInfo(gpu_info->context,\r
+ CL_CONTEXT_NUM_DEVICES,\r
+ sizeof(numDevices),\r
+ &numDevices,\r
+ NULL);\r
+ if (status != CL_SUCCESS)\r
+ {\r
+ hb_log("OpenCL: Unable to get the number of devices in context.");\r
+ return 0;\r
+ }\r
+\r
+ devices = (cl_device_id*)malloc(sizeof(cl_device_id) * numDevices);\r
+ if (devices == NULL)\r
+ return 0;\r
+\r
+ length = 0;\r
+ b_error = 0;\r
+ b_error |= fseek(fd, 0, SEEK_END) < 0;\r
+ b_error |= (length = ftell(fd)) <= 0;\r
+ b_error |= fseek(fd, 0, SEEK_SET) < 0;\r
+ if (b_error)\r
+ return 0;\r
+\r
+ binary = (char*)calloc(length + 2, sizeof(char));\r
+ if (binary == NULL)\r
+ return 0;\r
+\r
+ b_error |= fread(binary, 1, length, fd) != length;\r
+#if 0 // this doesn't work under OS X and/or with some non-AMD GPUs\r
+ if (binary[length-1] != '\n')\r
+ binary[length++] = '\n;\r
+#endif\r
+\r
+ if (b_error)\r
+ return 0;\r
+\r
+ /* grab the handles to all of the devices in the context. */\r
+ status = clGetContextInfo(gpu_info->context,\r
+ CL_CONTEXT_DEVICES,\r
+ sizeof(cl_device_id) * numDevices,\r
+ devices,\r
+ NULL);\r
+\r
+ gpu_info->programs[idx] = clCreateProgramWithBinary(gpu_info->context,\r
+ numDevices,\r
+ devices,\r
+ &length,\r
+ (const unsigned char**)&binary,\r
+ &binary_status,\r
+ &status);\r
+\r
+ fclose(fd);\r
+ free(devices);\r
+ fd = NULL;\r
+ devices = NULL;\r
+ }\r
+ else\r
+ {\r
+ /* create a CL program using the kernel source */\r
+ gpu_info->programs[idx] = clCreateProgramWithSource(\r
+ gpu_info->context, 1, &source, source_size, &status );\r
+ }\r
+\r
+ if((gpu_info->programs[idx] == (cl_program)NULL) || (status != CL_SUCCESS)){\r
+ hb_log( "OpenCL: Unable to get list of devices in context." );\r
+ return(0);\r
+ }\r
+\r
+ /* create a cl program executable for all the devices specified */\r
+ if( !gpu_info->isUserCreated ) \r
+ {\r
+ status = clBuildProgram( gpu_info->programs[idx], 1, gpu_info->devices,\r
+ build_option, NULL, NULL );\r
+ }\r
+ else\r
+ {\r
+ status = clBuildProgram( gpu_info->programs[idx], 1, &(gpu_info->dev),\r
+ build_option, NULL, NULL );\r
+ }\r
+\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ if( !gpu_info->isUserCreated ) \r
+ {\r
+ status = clGetProgramBuildInfo( gpu_info->programs[idx],\r
+ gpu_info->devices[0],\r
+ CL_PROGRAM_BUILD_LOG, 0, NULL, &length );\r
+ }\r
+ else\r
+ {\r
+ status = clGetProgramBuildInfo( gpu_info->programs[idx],\r
+ gpu_info->dev,\r
+ CL_PROGRAM_BUILD_LOG, 0, NULL, &length );\r
+ }\r
+\r
+ if( status != CL_SUCCESS )\r
+ {\r
+ hb_log( "OpenCL: Unable to get GPU build information." );\r
+ return(0);\r
+ }\r
+\r
+ buildLog = (char*)malloc( length );\r
+ if( buildLog == (char*)NULL )\r
+ {\r
+ return(0);\r
+ }\r
+\r
+ if( !gpu_info->isUserCreated )\r
+ {\r
+ status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->devices[0],\r
+ CL_PROGRAM_BUILD_LOG, length, buildLog, &length );\r
+ }\r
+ else\r
+ {\r
+ status = clGetProgramBuildInfo( gpu_info->programs[idx], gpu_info->dev,\r
+ CL_PROGRAM_BUILD_LOG, length, buildLog, &length );\r
+ }\r
+\r
+ fd1 = fopen( "kernel-build.log", "w+" );\r
+ if( fd1 != NULL ) {\r
+ fwrite( buildLog, sizeof(char), length, fd1 );\r
+ fclose( fd1 );\r
+ }\r
+\r
+ free( buildLog );\r
+ return(0);\r
+ }\r
+\r
+ strcpy( gpu_env.kernelSrcFile[idx], filename );\r
+
+ if (binaryExisted != 1)
+ {
+ //hb_generat_bin_from_kernel_source(gpu_env.programs[idx], filename);
+ }
+
+ gpu_info->file_count += 1;
+\r
+ return(1);\r
+}\r
+\r
+\r
+/**\r
+ * hb_get_kernel_env_and_func\r
+ * @param kernel_name -\r
+ * @param env -\r
+ * @param function -\r
+ */\r
+int hb_get_kernel_env_and_func( const char *kernel_name,\r
+ KernelEnv *env,\r
+ cl_kernel_function *function )\r
+{\r
+ int i;\r
+ for( i = 0; i < gpu_env.kernel_count; i++ )\r
+ {\r
+ if( strcasecmp( kernel_name, gpu_env.kernel_names[i] ) == 0 )\r
+ {\r
+ env->context = gpu_env.context;\r
+ env->command_queue = gpu_env.command_queue;\r
+ env->program = gpu_env.programs[0];\r
+ env->kernel = gpu_env.kernels[i];\r
+ env->isAMD = ( gpu_env.vendor == AMD ) ? 1 : 0;\r
+ *function = gpu_env.kernel_functions[i];\r
+ return(1);\r
+ }\r
+ }\r
+ return(0);\r
+}\r
+\r
+/**\r
+ * hb_get_kernel_env_and_func\r
+ * @param kernel_name -\r
+ * @param userdata -\r
+ */\r
+int hb_run_kernel( const char *kernel_name, void **userdata )\r
+{\r
+ KernelEnv env;\r
+ cl_kernel_function function;\r
+ int status;\r
+ memset( &env, 0, sizeof(KernelEnv));\r
+ status = hb_get_kernel_env_and_func( kernel_name, &env, &function );\r
+ strcpy( env.kernel_name, kernel_name );\r
+ if( status == 1 ) \r
+ {\r
+ return(function( userdata, &env ));\r
+ }\r
+\r
+ return(0);\r
+}\r
+\r
+/**\r
+ * hb_init_opencl_run_env\r
+ * @param argc -\r
+ * @param argv -\r
+ * @param build_option -\r
+ */\r
+int hb_init_opencl_run_env( int argc, char **argv, const char *build_option )\r
+{\r
+ int status = 0;\r
+ if( MAX_CLKERNEL_NUM <= 0 )\r
+ {\r
+ return 1;\r
+ }\r
+\r
+ if((argc > MAX_CLFILE_NUM) || (argc<0))\r
+ {\r
+ return 1;\r
+ }\r
+\r
+ if( !isInited )\r
+ {\r
+ hb_regist_opencl_kernel();\r
+\r
+ /*initialize devices, context, comand_queue*/\r
+ status = hb_init_opencl_env( &gpu_env );\r
+ if( status )\r
+ return(1);\r
+\r
+ /*initialize program, kernel_name, kernel_count*/\r
+ status = hb_compile_kernel_file("hb-opencl-kernels.cl",\r
+ &gpu_env, 0, build_option);\r
+\r
+ if( status == 0 || gpu_env.kernel_count == 0 )\r
+ {\r
+ return(1);\r
+
+ }
+
+ useBuffers = 1;
+ isInited = 1;
+ }
+
+ return(0);\r
+}\r
+\r
+/**\r
+ * hb_release_opencl_run_env\r
+ */\r
+int hb_release_opencl_run_env()\r
+{\r
+ return hb_release_opencl_env( &gpu_env );\r
+}\r
+\r
+/**\r
+ * hb_opencl_stats\r
+ */\r
+int hb_opencl_stats()\r
+{\r
+ return isInited;\r
+}\r
+\r
+/**\r
+ * hb_get_opencl_env\r
+ */\r
+int hb_get_opencl_env()\r
+{\r
+ int i = 0;\r
+ cl_int status;\r
+ cl_uint numDevices;\r
+ cl_device_id *devices;\r
+\r
+ /*initialize devices, context, comand_queue*/\r
+ status = hb_init_opencl_env( &gpu_env );\r
+ if( status )\r
+ return(1);\r
+ status = clGetContextInfo( gpu_env.context,\r
+ CL_CONTEXT_NUM_DEVICES,\r
+ sizeof(numDevices),\r
+ &numDevices,\r
+ NULL );\r
+ if( status != CL_SUCCESS )\r
+ return 0;\r
+\r
+ devices = (cl_device_id*)malloc( sizeof(cl_device_id) * numDevices );\r
+ if( devices == NULL )\r
+ return 0;\r
+\r
+ /* grab the handles to all of the devices in the context. */\r
+ status = clGetContextInfo( gpu_env.context,\r
+ CL_CONTEXT_DEVICES,\r
+ sizeof(cl_device_id) * numDevices,\r
+ devices,\r
+ NULL );\r
+\r
+ for (i = 0; i < numDevices; i++)\r
+ {\r
+ if (devices[i] != NULL)\r
+ {\r
+ char deviceVendor[100], deviceName[1024], driverVersion[1024];\r
+ clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(deviceVendor),\r
+ deviceVendor, NULL);\r
+ clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(deviceName),\r
+ deviceName, NULL);\r
+ clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(driverVersion),\r
+ driverVersion, NULL);\r
+ hb_log("hb_get_opencl_env: GPU #%d, Device Vendor: %s", i + 1, deviceVendor);\r
+ hb_log("hb_get_opencl_env: GPU #%d, Device Name: %s", i + 1, deviceName);\r
+ hb_log("hb_get_opencl_env: GPU #%d, Driver Version: %s", i + 1, driverVersion);\r
+ }\r
+ }\r
+\r
+ if( devices != NULL )\r
+ {\r
+ free( devices );\r
+ devices = NULL;\r
+ }\r
+\r
+ return status;\r
+}\r
+\r
+/**\r
+ * hb_create_buffer\r
+ * @param cl_inBuf -\r
+ * @param flags -\r
+ * @param size -\r
+ */\r
+int hb_create_buffer( cl_mem *cl_Buf, int flags, int size )\r
+{\r
+ int status;\r
+ *cl_Buf = clCreateBuffer( gpu_env.context, (flags), (size), NULL, &status );\r
+ \r
+ if( status != CL_SUCCESS )\r
+ { \r
+ hb_log( "OpenCL: clCreateBuffer error '%d'", status );\r
+ return 0; \r
+ }\r
+\r
+ return 1;\r
+}\r
+\r
+\r
+/**\r
+ * hb_read_opencl_buffer\r
+ * @param cl_inBuf -\r
+ * @param outbuf -\r
+ * @param size -\r
+ */\r
+int hb_read_opencl_buffer( cl_mem cl_inBuf, unsigned char *outbuf, int size )\r
+{\r
+ int status;\r
+\r
+ status = clEnqueueReadBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE, 0, size, outbuf, 0, 0, 0 );\r
+ if( status != CL_SUCCESS )\r
+ { \r
+ hb_log( "OpenCL: av_read_opencl_buffer error '%d'", status );\r
+ return 0; \r
+ }\r
+\r
+ return 1;
+}
+
+int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size)
+{
+ int status;
+ int flags = CL_MEM_ALLOC_HOST_PTR;
+ //cl_event event;
+ *mem = clCreateBuffer(gpu_env.context, flags, size, NULL, &status);
+ *addr = clEnqueueMapBuffer(gpu_env.command_queue, *mem, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, size, 0, NULL, NULL/*&event*/, &status);
+
+ //hb_log("\t **** context: %.8x cmdqueue: %.8x cl_mem: %.8x mapaddr: %.8x size: %d status: %d", gpu_env.context, gpu_env.command_queue, mem, addr, size, status);
+
+ return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr)
+{
+ cl_event event;
+ int status = clEnqueueUnmapMemObject(gpu_env.command_queue, mem, addr, 0, NULL, &event);
+ if (status == CL_SUCCESS)
+ clWaitForEvents(1, &event);
+ else
+ hb_log("hb_free_mapped_buffer: error %d", status);
+ return (status == CL_SUCCESS) ? 1 : 0;
+}
+
+void hb_opencl_init()
+{
+ hb_get_opencl_env();
+}
+
+int hb_use_buffers()
+{
+ return useBuffers;
+}
+
+int hb_copy_buffer(cl_mem src_buffer,cl_mem dst_buffer,size_t src_offset,size_t dst_offset,size_t cb)
+{
+ int status = clEnqueueCopyBuffer(gpu_env.command_queue,
+ src_buffer,\r
+ dst_buffer,\r
+ src_offset, dst_offset, cb,\r
+ 0, 0, 0);\r
+ if( status != CL_SUCCESS )\r
+ { \r
+ av_log(NULL,AV_LOG_ERROR, "hb_read_opencl_buffer error '%d'\n", status ); \r
+ return 0; \r
+ }\r
+ return 1;\r
+}\r
+\r
+int hb_read_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height)\r
+{\r
+\r
+ int chrH = -(-height >> 1);\r
+ unsigned char *temp = (unsigned char *)av_malloc(sizeof(uint8_t) * (linesize0 * height + linesize1 * chrH * 2));\r
+ if(hb_read_opencl_buffer(cl_inBuf,temp,sizeof(uint8_t)*(linesize0 + linesize1)*height))\r
+ {\r
+ memcpy(Ybuf,temp,linesize0 * height);\r
+ memcpy(Ubuf,temp + linesize0 * height,linesize1 *chrH);\r
+ memcpy(Vbuf,temp + linesize0 * height + linesize1 * chrH,linesize2 * chrH);\r
+ \r
+ }\r
+ av_free(temp);\r
+\r
+ return 1;\r
+}\r
+\r
+int hb_write_opencl_frame_buffer(cl_mem cl_inBuf,unsigned char *Ybuf,unsigned char *Ubuf,unsigned char *Vbuf,int linesize0,int linesize1,int linesize2,int height,int offset)\r
+{\r
+ int status;\r
+ void *mapped = clEnqueueMapBuffer( gpu_env.command_queue, cl_inBuf, CL_TRUE,CL_MAP_WRITE, 0, sizeof(uint8_t) * (linesize0 + linesize1)*height + offset, 0, NULL, NULL, NULL );\r
+ uint8_t *temp = (uint8_t *)mapped;\r
+ temp += offset;\r
+ memcpy(temp,Ybuf,sizeof(uint8_t) * linesize0 * height);\r
+ memcpy(temp + sizeof(uint8_t) * linesize0 * height,Ubuf,sizeof(uint8_t) * linesize1 * height/2);\r
+ memcpy(temp + sizeof(uint8_t) * (linesize0 * height + linesize1 * height/2),Vbuf,sizeof(uint8_t) * linesize2 * height/2);\r
+ clEnqueueUnmapMemObject(gpu_env.command_queue, cl_inBuf, mapped, 0, NULL, NULL );\r
+ return 1;\r
+}\r
+\r
+cl_command_queue hb_get_command_queue()\r
+{\r
+ return gpu_env.command_queue;\r
+}\r
+\r
+cl_context hb_get_context()\r
+{\r
+ return gpu_env.context;\r
+}\r
+#endif\r
--- /dev/null
+/* openclwrapper.h\r
+\r
+ Copyright (c) 2003-2012 HandBrake Team\r
+ This file is part of the HandBrake source code\r
+ Homepage: <http://handbrake.fr/>.\r
+ It may be used under the terms of the GNU General Public License v2.\r
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html\r
+\r
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+\r
+\r
+ */\r
+#ifndef __OPENCL_WRAPPER_H\r
+#define __OPENCL_WRAPPER_H\r
+#ifdef USE_OPENCL\r
+#include "common.h"\r
+\r
+//support AMD opencl\r
+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E\r
+#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2)\r
+\r
+typedef struct _KernelEnv\r
+{\r
+ cl_context context;\r
+ cl_command_queue command_queue;\r
+ cl_program program;\r
+ cl_kernel kernel;\r
+ char kernel_name[150];\r
+ int isAMD;\r
+}KernelEnv;\r
+\r
+typedef struct _OpenCLEnv\r
+{\r
+ cl_platform_id platform;\r
+ cl_context context;\r
+ cl_device_id devices;\r
+ cl_command_queue command_queue;\r
+}OpenCLEnv;\r
+\r
+\r
+//user defined, this is function wrapper which is used to set the input parameters ,\r
+//luanch kernel and copy data from GPU to CPU or CPU to GPU.\r
+typedef int (*cl_kernel_function)( void **userdata, KernelEnv *kenv );\r
+\r
+// registe a wapper for running the kernel specified by the kernel name\r
+int hb_register_kernel_wrapper( const char *kernel_name, cl_kernel_function function );\r
+\r
+// run kernel , user call this function to luanch kernel.\r
+// kernel_name: this kernel name is used to find the kernel in opencl runtime environment\r
+// userdata: this userdata is the all parameters for running the kernel specified by kernel name\r
+int hb_run_kernel( const char *kernel_name, void **userdata );\r
+\r
+// init the run time environment , this function must be called befor calling any function related to opencl\r
+// the argc must be set zero , argv must be set NULL, build_option is the options for build the kernel.\r
+int hb_init_opencl_run_env( int argc, char **argv, const char *build_option );\r
+\r
+//relase all resource about the opencl , this function must be called after calling any functions related to opencl\r
+int hb_release_opencl_run_env();\r
+\r
+// get the opencl status , 0: not init ; 1, inited; this function is used the check whether or not the opencl run time has been created\r
+int hb_opencl_stats();\r
+\r
+// update opencl run time environments , such as commandqueue , platforme, context. program\r
+int hb_init_opencl_attr( OpenCLEnv * env );\r
+\r
+// create kernel object by a kernel name on the specified opencl run time indicated by env parameter\r
+int hb_create_kernel( char * kernelname, KernelEnv * env );\r
+\r
+// release kernel object which is generated by calling the hb_create_kernel api
+int hb_release_kernel( KernelEnv * env );
+
+void hb_opencl_init();
+
+int hb_get_opencl_env();
+
+int hb_create_buffer(cl_mem *cl_Buf,int flags,int size);
+
+int hb_read_opencl_buffer(cl_mem cl_inBuf,unsigned char *outbuf,int size);
+
+int hb_cl_create_mapped_buffer(cl_mem *mem, unsigned char **addr, int size);
+
+int hb_cl_free_mapped_buffer(cl_mem mem, unsigned char *addr);
+
+int hb_use_buffers();
+
+int hb_confirm_gpu_type();
+#endif
+#endif
--- /dev/null
+/* scale.c\r
+\r
+ Copyright (c) 2003-2012 HandBrake Team\r
+ This file is part of the HandBrake source code\r
+ Homepage: <http://handbrake.fr/>.\r
+ It may be used under the terms of the GNU General Public License v2.\r
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html\r
+\r
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+\r
+\r
+ */\r
+ \r
+#ifdef USE_OPENCL\r
+#include <assert.h>\r
+#include <stdio.h>\r
+#include <stdlib.h>\r
+#include <string.h>\r
+\r
+#include "hb.h"\r
+#include "scale.h"\r
+#include "scale_kernel.h"\r
+#include "libavutil/pixdesc.h"\r
+\r
+#define isScaleRGBinInt(x) \\r
+ ( \\r
+ (x)==AV_PIX_FMT_RGB48BE || \\r
+ (x)==AV_PIX_FMT_RGB48LE || \\r
+ (x)==AV_PIX_FMT_RGB32 || \\r
+ (x)==AV_PIX_FMT_RGB32_1 || \\r
+ (x)==AV_PIX_FMT_RGB24 || \\r
+ (x)==AV_PIX_FMT_RGB565BE || \\r
+ (x)==AV_PIX_FMT_RGB565LE || \\r
+ (x)==AV_PIX_FMT_RGB555BE || \\r
+ (x)==AV_PIX_FMT_RGB555LE || \\r
+ (x)==AV_PIX_FMT_RGB444BE || \\r
+ (x)==AV_PIX_FMT_RGB444LE || \\r
+ (x)==AV_PIX_FMT_RGB8 || \\r
+ (x)==AV_PIX_FMT_RGB4 || \\r
+ (x)==AV_PIX_FMT_RGB4_BYTE || \\r
+ (x)==AV_PIX_FMT_MONOBLACK || \\r
+ (x)==AV_PIX_FMT_MONOWHITE \\r
+ )\r
+#define isScaleBGRinInt(x) \\r
+ ( \\r
+ (x)==AV_PIX_FMT_BGR48BE || \\r
+ (x)==AV_PIX_FMT_BGR48LE || \\r
+ (x)==AV_PIX_FMT_BGR32 || \\r
+ (x)==AV_PIX_FMT_BGR32_1 || \\r
+ (x)==AV_PIX_FMT_BGR24 || \\r
+ (x)==AV_PIX_FMT_BGR565BE || \\r
+ (x)==AV_PIX_FMT_BGR565LE || \\r
+ (x)==AV_PIX_FMT_BGR555BE || \\r
+ (x)==AV_PIX_FMT_BGR555LE || \\r
+ (x)==AV_PIX_FMT_BGR444BE || \\r
+ (x)==AV_PIX_FMT_BGR444LE || \\r
+ (x)==AV_PIX_FMT_BGR8 || \\r
+ (x)==AV_PIX_FMT_BGR4 || \\r
+ (x)==AV_PIX_FMT_BGR4_BYTE|| \\r
+ (x)==AV_PIX_FMT_MONOBLACK|| \\r
+ (x)==AV_PIX_FMT_MONOWHITE \\r
+ )\r
+\r
+#define isScaleAnyRGB(x) \\r
+ ( \\r
+ isScaleRGBinInt(x) || \\r
+ isScaleBGRinInt(x) \\r
+ )\r
+\r
+#define isScaleGray(x) \\r
+ ((x) == AV_PIX_FMT_GRAY8 || \\r
+ (x) == AV_PIX_FMT_Y400A || \\r
+ (x) == AV_PIX_FMT_GRAY16BE || \\r
+ (x) == AV_PIX_FMT_GRAY16LE)\r
+\r
+static ScaleContext *g_scale;\r
+\r
+static double getScaleSplineCoeff(double a, double b, double c, double d, double dist)\r
+{\r
+ if (dist <= 1.0)\r
+ return ((d * dist + c) * dist + b) * dist + a;\r
+ else\r
+ return getScaleSplineCoeff(0.0,\r
+ b + 2.0 * c + 3.0 * d,\r
+ c + 3.0 * d,\r
+ -b - 3.0 * c - 6.0 * d,\r
+ dist - 1.0);\r
+}\r
+\r
+static int initScaleFilter(int16_t **outFilter, int32_t **filterPos,\r
+ int *outFilterSize, int xInc, int srcW, int dstW,\r
+ int filterAlign, int one, int flags, int cpu_flags,\r
+ ScaleVector *srcFilter, ScaleVector *dstFilter,\r
+ double param[2])\r
+{\r
+ int i;\r
+ int filterSize;\r
+ int filter2Size;\r
+ int minFilterSize;\r
+ int64_t *filter = NULL;\r
+ int64_t *filter2 = NULL;\r
+ const int64_t fone = 1LL << 54;\r
+ int ret = -1;\r
+\r
+ *filterPos = (int32_t *)av_malloc((dstW + 3) * sizeof(**filterPos));\r
+ if (*filterPos == NULL && ((dstW + 3) * sizeof(**filterPos)) != 0)\r
+ {\r
+ hb_log("Cannot allocate memory."); \r
+ goto fail;\r
+ }\r
+\r
+ if (FFABS(xInc - 0x10000) < 10) \r
+ { // unscaled\r
+ int i;\r
+ filterSize = 1;\r
+ filter = (int64_t *)av_mallocz(dstW * sizeof(*filter) * filterSize);\r
+ if (filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0) \r
+ {\r
+ hb_log("Cannot allocate memory."); \r
+ goto fail;\r
+ }\r
+\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ filter[i * filterSize] = fone;\r
+ (*filterPos)[i] = i;\r
+ }\r
+ } \r
+ else if (flags & SWS_POINT) \r
+ { // lame looking point sampling mode\r
+ int i;\r
+ int64_t xDstInSrc;\r
+ filterSize = 1;\r
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);\r
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)\r
+ {\r
+ hb_log("Cannot allocate memory."); \r
+ goto fail;\r
+ }\r
+\r
+ xDstInSrc = xInc / 2 - 0x8000;\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;\r
+\r
+ (*filterPos)[i] = xx;\r
+ filter[i] = fone;\r
+ xDstInSrc += xInc;\r
+ }\r
+ } \r
+ else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||\r
+ (flags & SWS_FAST_BILINEAR)) \r
+ { // bilinear upscale\r
+ int i;\r
+ int64_t xDstInSrc;\r
+ filterSize = 2;\r
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);\r
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)\r
+ {\r
+ hb_log("Cannot allocate memory."); \r
+ goto fail;\r
+ }\r
+\r
+ xDstInSrc = xInc / 2 - 0x8000;\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;\r
+ int j;\r
+\r
+ (*filterPos)[i] = xx;\r
+ // bilinear upscale / linear interpolate / area averaging\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);\r
+ if (coeff < 0)\r
+ coeff = 0;\r
+ filter[i * filterSize + j] = coeff;\r
+ xx++;\r
+ }\r
+ xDstInSrc += xInc;\r
+ }\r
+ } \r
+ else \r
+ {\r
+ int64_t xDstInSrc;\r
+ int sizeFactor;\r
+\r
+ if (flags & SWS_BICUBIC)\r
+ sizeFactor = 4;\r
+ else if (flags & SWS_X)\r
+ sizeFactor = 8;\r
+ else if (flags & SWS_AREA)\r
+ sizeFactor = 1; // downscale only, for upscale it is bilinear\r
+ else if (flags & SWS_GAUSS)\r
+ sizeFactor = 8; // infinite ;)\r
+ else if (flags & SWS_LANCZOS)\r
+ sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6;\r
+ else if (flags & SWS_SINC)\r
+ sizeFactor = 20; // infinite ;)\r
+ else if (flags & SWS_SPLINE)\r
+ sizeFactor = 20; // infinite ;)\r
+ else if (flags & SWS_BILINEAR)\r
+ sizeFactor = 2;\r
+ else \r
+ {\r
+ sizeFactor = 0; // GCC warning killer\r
+ assert(0);\r
+ }\r
+\r
+ if (xInc <= 1 << 16)\r
+ filterSize = 1 + sizeFactor; // upscale\r
+ else\r
+ filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;\r
+\r
+\r
+ filterSize = FFMIN(filterSize, srcW - 2);\r
+ filterSize = FFMAX(filterSize, 1);\r
+\r
+ filter = (int64_t *)av_malloc(dstW * sizeof(*filter) * filterSize);\r
+ if(filter == NULL && (dstW * sizeof(*filter) * filterSize) != 0)\r
+ {\r
+ hb_log("Cannot allocate memory."); \r
+ goto fail;\r
+ }\r
+\r
+ xDstInSrc = xInc - 0x10000;\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int xx = (xDstInSrc - ((filterSize - 2) << 16)) / (1 << 17);\r
+ int j;\r
+ (*filterPos)[i] = xx;\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ int64_t d = (FFABS(((int64_t)xx << 17) - xDstInSrc)) << 13;\r
+ double floatd;\r
+ int64_t coeff;\r
+\r
+ if (xInc > 1 << 16)\r
+ d = d * dstW / srcW;\r
+ floatd = d * (1.0 / (1 << 30));\r
+\r
+ if (flags & SWS_BICUBIC) \r
+ {\r
+ int64_t B = (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1 << 24);\r
+ int64_t C = (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1 << 24);\r
+\r
+ if (d >= 1LL << 31) \r
+ {\r
+ coeff = 0.0;\r
+ }\r
+ else \r
+ {\r
+ int64_t dd = (d * d) >> 30;\r
+ int64_t ddd = (dd * d) >> 30;\r
+\r
+ if (d < 1LL << 30)\r
+ coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd +\r
+ (-18 * (1 << 24) + 12 * B + 6 * C) * dd +\r
+ (6 * (1 << 24) - 2 * B) * (1 << 30);\r
+ else\r
+ coeff = (-B - 6 * C) * ddd +\r
+ (6 * B + 30 * C) * dd +\r
+ (-12 * B - 48 * C) * d +\r
+ (8 * B + 24 * C) * (1 << 30);\r
+ }\r
+ coeff *= fone >> (30 + 24);\r
+ }\r
+#if 0\r
+ else if (flags & SWS_X) \r
+ {\r
+ double p = param ? param * 0.01 : 0.3;\r
+ coeff = d ? sin(d * M_PI) / (d * M_PI) : 1.0;\r
+ coeff *= pow (2.0, -p * d * d);\r
+ }\r
+#endif\r
+ else if (flags & SWS_X) \r
+ {\r
+ double A = param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;\r
+ double c;\r
+\r
+ if (floatd < 1.0)\r
+ c = cos(floatd * M_PI);\r
+ else\r
+ c = -1.0;\r
+ if (c < 0.0)\r
+ c = -pow(-c, A);\r
+ else\r
+ c = pow(c, A);\r
+ coeff = (c * 0.5 + 0.5) * fone;\r
+ } \r
+ else if (flags & SWS_AREA) \r
+ {\r
+ int64_t d2 = d - (1 << 29);\r
+ if (d2 * xInc < -(1LL << (29 + 16)))\r
+ coeff = 1.0 * (1LL << (30 + 16));\r
+ else if (d2 * xInc < (1LL << (29 + 16)))\r
+ coeff = -d2 * xInc + (1LL << (29 + 16));\r
+ else\r
+ coeff = 0.0;\r
+ coeff *= fone >> (30 + 16);\r
+ } \r
+ else if (flags & SWS_GAUSS) \r
+ {\r
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;\r
+ coeff = (pow(2.0, -p * floatd * floatd)) * fone;\r
+ }\r
+ else if (flags & SWS_SINC) \r
+ {\r
+ coeff = (d ? sin(floatd * M_PI) / (floatd * M_PI) : 1.0) * fone;\r
+ } \r
+ else if (flags & SWS_LANCZOS) \r
+ {\r
+ double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;\r
+ coeff = (d ? sin(floatd * M_PI) * sin(floatd * M_PI / p) /\r
+ (floatd * floatd * M_PI * M_PI / p) : 1.0) * fone;\r
+ if (floatd > p)\r
+ coeff = 0;\r
+ }\r
+ else if (flags & SWS_BILINEAR) \r
+ {\r
+ coeff = (1 << 30) - d;\r
+ if (coeff < 0)\r
+ coeff = 0;\r
+ coeff *= fone >> 30;\r
+ }\r
+ else if (flags & SWS_SPLINE) \r
+ {\r
+ double p = -2.196152422706632;\r
+ coeff = getScaleSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;\r
+ }\r
+ else \r
+ {\r
+ coeff = 0.0; // GCC warning killer\r
+ assert(0);\r
+ }\r
+\r
+ filter[i * filterSize + j] = coeff;\r
+ xx++;\r
+ }\r
+ xDstInSrc += 2 * xInc;\r
+ }\r
+ }\r
+\r
+ assert(filterSize > 0);\r
+ filter2Size = filterSize;\r
+ if (srcFilter)\r
+ filter2Size += srcFilter->length - 1;\r
+ if (dstFilter)\r
+ filter2Size += dstFilter->length - 1;\r
+ assert(filter2Size > 0);\r
+ filter2 = (int64_t *)av_mallocz(filter2Size * dstW * sizeof(*filter2));\r
+ if (filter2 == NULL && (filter2Size * dstW * sizeof(*filter2)) != 0)\r
+ {\r
+ hb_log("Can't alloc memory.");\r
+ goto fail;\r
+ }\r
+\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j, k;\r
+\r
+ if (srcFilter) \r
+ {\r
+ for (k = 0; k < srcFilter->length; k++) \r
+ {\r
+ for (j = 0; j < filterSize; j++)\r
+ filter2[i * filter2Size + k + j] +=\r
+ srcFilter->coeff[k] * filter[i * filterSize + j];\r
+ }\r
+ } \r
+ else \r
+ {\r
+ for (j = 0; j < filterSize; j++)\r
+ filter2[i * filter2Size + j] = filter[i * filterSize + j];\r
+ }\r
+ // FIXME dstFilter\r
+\r
+ (*filterPos)[i] += (filterSize - 1) / 2 - (filter2Size - 1) / 2;\r
+ }\r
+ av_freep(&filter);\r
+\r
+ // Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).\r
+ minFilterSize = 0;\r
+ for (i = dstW - 1; i >= 0; i--) \r
+ {\r
+ int min = filter2Size;\r
+ int j;\r
+ int64_t cutOff = 0.0;\r
+\r
+ for (j = 0; j < filter2Size; j++) \r
+ {\r
+ int k;\r
+ cutOff += FFABS(filter2[i * filter2Size]);\r
+\r
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)\r
+ break;\r
+\r
+ if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])\r
+ break;\r
+\r
+ // move filter coefficients left\r
+ for (k = 1; k < filter2Size; k++)\r
+ filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];\r
+ filter2[i * filter2Size + k - 1] = 0;\r
+ (*filterPos)[i]++;\r
+ }\r
+\r
+ cutOff = 0;\r
+ for (j = filter2Size - 1; j > 0; j--) \r
+ {\r
+ cutOff += FFABS(filter2[i * filter2Size + j]);\r
+\r
+ if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)\r
+ break;\r
+ min--;\r
+ }\r
+\r
+ if (min > minFilterSize)\r
+ minFilterSize = min;\r
+ }\r
+\r
+\r
+ assert(minFilterSize > 0);\r
+ filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));\r
+ assert(filterSize > 0);\r
+ filter = (int64_t *)av_malloc(filterSize * dstW * sizeof(*filter));\r
+ if (filterSize >= MAX_FILTER_SIZE * 16 /\r
+ ((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16) || !filter)\r
+ goto fail;\r
+ *outFilterSize = filterSize;\r
+\r
+ if (flags & SWS_PRINT_INFO)\r
+ hb_log("SwScaler: reducing / aligning filtersize %d -> %d",filter2Size,filterSize);\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j;\r
+\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ if (j >= filter2Size)\r
+ filter[i * filterSize + j] = 0;\r
+ else\r
+ filter[i * filterSize + j] = filter2[i * filter2Size + j];\r
+ if ((flags & SWS_BITEXACT) && j >= minFilterSize)\r
+ filter[i * filterSize + j] = 0;\r
+ }\r
+ }\r
+\r
+ // FIXME try to align filterPos if possible\r
+\r
+ // fix borders\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j;\r
+ if ((*filterPos)[i] < 0) \r
+ {\r
+ // move filter coefficients left to compensate for filterPos\r
+ for (j = 1; j < filterSize; j++) \r
+ {\r
+ int left = FFMAX(j + (*filterPos)[i], 0);\r
+ filter[i * filterSize + left] += filter[i * filterSize + j];\r
+ filter[i * filterSize + j] = 0;\r
+ }\r
+ (*filterPos)[i]= 0;\r
+ }\r
+\r
+ if ((*filterPos)[i] + filterSize > srcW) \r
+ {\r
+ int shift = (*filterPos)[i] + filterSize - srcW;\r
+ // move filter coefficients right to compensate for filterPos\r
+ for (j = filterSize - 2; j >= 0; j--) \r
+ {\r
+ int right = FFMIN(j + shift, filterSize - 1);\r
+ filter[i * filterSize + right] += filter[i * filterSize + j];\r
+ filter[i * filterSize + j] = 0;\r
+ }\r
+ (*filterPos)[i]= srcW - filterSize;\r
+ }\r
+ }\r
+\r
+ // Note the +1 is for the MMX scaler which reads over the end\r
+ // FF_ALLOCZ_OR_GOTO(NULL, *outFilter,\r
+ // *outFilterSize * (dstW + 3) * sizeof(int16_t), fail);\r
+ *outFilter = (int16_t *)av_mallocz(*outFilterSize * (dstW + 3) * sizeof(int16_t));\r
+ if( *outFilter == NULL && (*outFilterSize * (dstW + 3) * sizeof(int16_t)) != 0)\r
+ {\r
+ hb_log("Can't alloc memory");\r
+ goto fail;\r
+ }\r
+\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j;\r
+ int64_t error = 0;\r
+ int64_t sum = 0;\r
+\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ sum += filter[i * filterSize + j];\r
+ }\r
+ sum = (sum + one / 2) / one;\r
+ for (j = 0; j < *outFilterSize; j++) \r
+ {\r
+ int64_t v = filter[i * filterSize + j] + error;\r
+ int intV = ROUNDED_DIV(v, sum);\r
+ (*outFilter)[i * (*outFilterSize) + j] = intV;\r
+ error = v - intV * sum;\r
+ }\r
+ }\r
+\r
+ (*filterPos)[dstW + 0] =\r
+ (*filterPos)[dstW + 1] =\r
+ (*filterPos)[dstW + 2] = (*filterPos)[dstW - 1]; \r
+ for (i = 0; i < *outFilterSize; i++) \r
+ {\r
+ int k = (dstW - 1) * (*outFilterSize) + i;\r
+ (*outFilter)[k + 1 * (*outFilterSize)] =\r
+ (*outFilter)[k + 2 * (*outFilterSize)] =\r
+ (*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];\r
+ }\r
+\r
+ ret = 0;\r
+\r
+fail:\r
+ av_free(filter);\r
+ av_free(filter2);\r
+ return ret;\r
+}\r
+\r
+static int handle_scale_jpeg(enum PixelFormat *format)\r
+{\r
+ switch (*format) \r
+ { \r
+ case AV_PIX_FMT_YUVJ420P:\r
+ *format = AV_PIX_FMT_YUV420P;\r
+ return 1;\r
+ case AV_PIX_FMT_YUVJ422P:\r
+ *format = AV_PIX_FMT_YUV422P;\r
+ return 1;\r
+ case AV_PIX_FMT_YUVJ444P:\r
+ *format = AV_PIX_FMT_YUV444P;\r
+ return 1;\r
+ case AV_PIX_FMT_YUVJ440P:\r
+ *format = AV_PIX_FMT_YUV440P;\r
+ return 1;\r
+ default:\r
+ return 0;\r
+ } \r
+}\r
+\r
+static void scaleGetSubSampleFactors(int *h, int *v, enum PixelFormat format)\r
+{\r
+ *h = av_pix_fmt_descriptors[format].log2_chroma_w;\r
+ *v = av_pix_fmt_descriptors[format].log2_chroma_h;\r
+}\r
+\r
+typedef struct FormatEntry {\r
+ int is_supported_in, is_supported_out;\r
+} FormatEntry;\r
+\r
+static const FormatEntry format_entries[AV_PIX_FMT_NB] = {\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 0 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 0, 0 }, { 1, 1 },\r
+ { 0, 1 }, { 1, 1 }, { 1, 1 }, { 0, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 0 }, { 1, 0 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 0 }, { 1, 1 }, { 1, 1 }, { 0, 0 },\r
+ { 0, 0 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },\r
+ { 1, 1 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 0 }, { 1, 0 },\r
+ { 1, 0 }, { 1, 0 },\r
+};\r
+\r
+int scale_isSupportedInput( enum PixelFormat pix_fmt )\r
+{\r
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?\r
+ format_entries[pix_fmt].is_supported_in : 0;\r
+}\r
+\r
+int scale_isSupportedOutput( enum PixelFormat pix_fmt )\r
+{\r
+ return (unsigned)pix_fmt < AV_PIX_FMT_NB ?\r
+ format_entries[pix_fmt].is_supported_out : 0;\r
+}\r
+\r
+static void hcscale_fast_c( ScaleContext *c, int16_t *dst1, int16_t *dst2,\r
+ int dstWidth, const uint8_t *src1,\r
+ const uint8_t *src2, int srcW, int xInc )\r
+{\r
+ int i;\r
+ unsigned int xpos = 0;\r
+ for (i = 0; i < dstWidth; i++) \r
+ {\r
+ register unsigned int xx = xpos >> 16;\r
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;\r
+ dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);\r
+ dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);\r
+ xpos += xInc;\r
+ }\r
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) \r
+ {\r
+ dst1[i] = src1[srcW-1]*128;\r
+ dst2[i] = src2[srcW-1]*128;\r
+ }\r
+}\r
+\r
+static void hyscale_fast_c(ScaleContext *c, int16_t *dst, int dstWidth,\r
+ const uint8_t *src, int srcW, int xInc)\r
+{\r
+ int i;\r
+ unsigned int xpos = 0;\r
+ for (i = 0; i < dstWidth; i++) \r
+ {\r
+ register unsigned int xx = xpos >> 16;\r
+ register unsigned int xalpha = (xpos & 0xFFFF) >> 9;\r
+ dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;\r
+ xpos += xInc;\r
+ }\r
+ for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)\r
+ dst[i] = src[srcW-1]*128;\r
+}\r
+\r
+static void hScale16To19_c(ScaleContext *c, int16_t *_dst, int dstW,\r
+ const uint8_t *_src, const int16_t *filter,\r
+ const int32_t *filterPos, int filterSize)\r
+{\r
+ int i;\r
+ int32_t *dst = (int32_t *) _dst;\r
+ const uint16_t *src = (const uint16_t *) _src;\r
+ int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;\r
+ int sh = bits - 4;\r
+\r
+ if((isScaleAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) \r
+ && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)\r
+ sh= 9;\r
+\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j;\r
+ int srcPos = filterPos[i];\r
+ int val = 0;\r
+\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ val += src[srcPos + j] * filter[filterSize * i + j];\r
+ }\r
+ dst[i] = FFMIN(val >> sh, (1 << 19) - 1);\r
+ }\r
+}\r
+\r
+static void hScale16To15_c(ScaleContext *c, int16_t *dst, int dstW,\r
+ const uint8_t *_src, const int16_t *filter,\r
+ const int32_t *filterPos, int filterSize)\r
+{\r
+ int i;\r
+ const uint16_t *src = (const uint16_t *) _src;\r
+ int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;\r
+\r
+ if(sh<15)\r
+ sh= isScaleAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 \r
+ ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;\r
+\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j;\r
+ int srcPos = filterPos[i];\r
+ int val = 0;\r
+\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ val += src[srcPos + j] * filter[filterSize * i + j];\r
+ }\r
+ // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit\r
+ dst[i] = FFMIN(val >> sh, (1 << 15) - 1);\r
+ }\r
+}\r
+\r
+static void hScale8To15_c(ScaleContext *c, int16_t *dst, int dstW,\r
+ const uint8_t *src, const int16_t *filter,\r
+ const int32_t *filterPos, int filterSize)\r
+{\r
+ int i;\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j;\r
+ int srcPos = filterPos[i];\r
+ int val = 0;\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];\r
+ }\r
+ dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // the cubic equation does overflow ...\r
+ }\r
+}\r
+\r
+static void hScale8To19_c(ScaleContext *c, int16_t *_dst, int dstW,\r
+ const uint8_t *src, const int16_t *filter,\r
+ const int32_t *filterPos, int filterSize)\r
+{\r
+ int i;\r
+ int32_t *dst = (int32_t *) _dst;\r
+ for (i = 0; i < dstW; i++) \r
+ {\r
+ int j;\r
+ int srcPos = filterPos[i];\r
+ int val = 0;\r
+ for (j = 0; j < filterSize; j++) \r
+ {\r
+ val += ((int)src[srcPos + j]) * filter[filterSize * i + j];\r
+ }\r
+ dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...\r
+ }\r
+}\r
+\r
+static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)\r
+{\r
+ int i;\r
+ for (i = 0; i < width; i++) \r
+ {\r
+ dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264\r
+ dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264\r
+ }\r
+}\r
+\r
+static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)\r
+{\r
+ int i;\r
+ for (i = 0; i < width; i++) \r
+ {\r
+ dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469\r
+ dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469\r
+ }\r
+}\r
+\r
+static void lumRangeToJpeg_c(int16_t *dst, int width)\r
+{\r
+ int i;\r
+ for (i = 0; i < width; i++)\r
+ dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;\r
+}\r
+\r
+static void lumRangeFromJpeg_c(int16_t *dst, int width)\r
+{\r
+ int i;\r
+ for (i = 0; i < width; i++)\r
+ dst[i] = (dst[i] * 14071 + 33561947) >> 14;\r
+}\r
+\r
+static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)\r
+{\r
+ int i;\r
+ int32_t *dstU = (int32_t *) _dstU;\r
+ int32_t *dstV = (int32_t *) _dstV;\r
+ for (i = 0; i < width; i++) \r
+ {\r
+ dstU[i] = (FFMIN(dstU[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264\r
+ dstV[i] = (FFMIN(dstV[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264\r
+ }\r
+}\r
+\r
+static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)\r
+{\r
+ int i;\r
+ int32_t *dstU = (int32_t *) _dstU;\r
+ int32_t *dstV = (int32_t *) _dstV;\r
+ for (i = 0; i < width; i++) \r
+ {\r
+ dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469\r
+ dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469\r
+ }\r
+}\r
+\r
+static void lumRangeToJpeg16_c(int16_t *_dst, int width)\r
+{\r
+ int i;\r
+ int32_t *dst = (int32_t *) _dst;\r
+ for (i = 0; i < width; i++)\r
+ dst[i] = (FFMIN(dst[i], 30189 << 4) * 4769 - (39057361 << 2)) >> 12;\r
+}\r
+\r
+static void lumRangeFromJpeg16_c(int16_t *_dst, int width)\r
+{\r
+ int i;\r
+ int32_t *dst = (int32_t *) _dst;\r
+ for (i = 0; i < width; i++)\r
+ dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;\r
+}\r
+\r
+static av_cold void sws_init_swScale_c(ScaleContext *c)\r
+{\r
+ enum PixelFormat srcFormat = c->srcFormat;\r
+\r
+ ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,\r
+ &c->yuv2nv12cX, &c->yuv2packed1,\r
+ &c->yuv2packed2, &c->yuv2packedX);\r
+\r
+ ff_sws_init_input_funcs(c);\r
+\r
+ if (c->srcBpc == 8) \r
+ {\r
+ if (c->dstBpc <= 10) \r
+ {\r
+ c->hyScale = c->hcScale = hScale8To15_c;\r
+ if (c->flags & SWS_FAST_BILINEAR) \r
+ {\r
+ c->hyscale_fast = hyscale_fast_c;\r
+ c->hcscale_fast = hcscale_fast_c;\r
+ }\r
+ } \r
+ else \r
+ {\r
+ c->hyScale = c->hcScale = hScale8To19_c;\r
+ }\r
+ } \r
+ else \r
+ {\r
+ c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c\r
+ : hScale16To15_c;\r
+ }\r
+\r
+ if (c->srcRange != c->dstRange && !isScaleAnyRGB(c->dstFormat)) \r
+ {\r
+ if (c->dstBpc <= 10) \r
+ {\r
+ if (c->srcRange) \r
+ {\r
+ c->lumConvertRange = lumRangeFromJpeg_c;\r
+ c->chrConvertRange = chrRangeFromJpeg_c;\r
+ } \r
+ else \r
+ {\r
+ c->lumConvertRange = lumRangeToJpeg_c;\r
+ c->chrConvertRange = chrRangeToJpeg_c;\r
+ }\r
+ } \r
+ else \r
+ {\r
+ if (c->srcRange) \r
+ {\r
+ c->lumConvertRange = lumRangeFromJpeg16_c;\r
+ c->chrConvertRange = chrRangeFromJpeg16_c;\r
+ } \r
+ else \r
+ {\r
+ c->lumConvertRange = lumRangeToJpeg16_c;\r
+ c->chrConvertRange = chrRangeToJpeg16_c;\r
+ }\r
+ }\r
+ }\r
+\r
+ if (!(isScaleGray(srcFormat) || isScaleGray(c->dstFormat) ||\r
+ srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))\r
+ c->needs_hcscale = 1;\r
+}\r
+\r
+int scale_init_context(ScaleContext *c, ScaleFilter *srcFilter, ScaleFilter *dstFilter)\r
+{\r
+ ScaleFilter dummyFilter = { NULL, NULL, NULL, NULL };\r
+ int srcW = c->srcW;\r
+ int srcH = c->srcH;\r
+ int dstW = c->dstW;\r
+ int dstH = c->dstH;\r
+ int flags, cpu_flags;\r
+ enum PixelFormat srcFormat = c->srcFormat;\r
+ enum PixelFormat dstFormat = c->dstFormat;\r
+\r
+ cpu_flags = 0;\r
+ flags = c->flags;\r
+\r
+ if(srcFormat != c->srcFormat || dstFormat != c->dstFormat)\r
+ {\r
+ hb_log("deprecated pixel format used, make sure you did set range correctly.");\r
+ c->srcFormat = srcFormat;\r
+ c->dstFormat = dstFormat;\r
+ }\r
+\r
+ if (srcW < 4 || srcH < 1 || dstW < 8 || dstH < 1) \r
+ {\r
+ hb_log("%dx%d -> %dx%d is invalid scaling dimension.",srcW,srcH,dstW,dstH);\r
+ return -1;\r
+ }\r
+\r
+ if (!dstFilter)\r
+ dstFilter = &dummyFilter;\r
+ if (!srcFilter)\r
+ srcFilter = &dummyFilter;\r
+\r
+ c->lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;\r
+ c->lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;\r
+ c->dstFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[dstFormat]);\r
+ c->srcFormatBpp = av_get_bits_per_pixel(&av_pix_fmt_descriptors[srcFormat]);\r
+ c->vRounder = 4 * 0x0001000100010001ULL;\r
+\r
+ scaleGetSubSampleFactors(&c->chrSrcHSubSample, &c->chrSrcVSubSample, srcFormat);\r
+ scaleGetSubSampleFactors(&c->chrDstHSubSample, &c->chrDstVSubSample, dstFormat);\r
+\r
+ // drop some chroma lines if the user wants it\r
+ c->vChrDrop = (flags & SWS_SRC_V_CHR_DROP_MASK) >> SWS_SRC_V_CHR_DROP_SHIFT;\r
+ c->chrSrcVSubSample += c->vChrDrop;\r
+ c->chrSrcW = -((-srcW) >> c->chrSrcHSubSample);\r
+ c->chrSrcH = -((-srcH) >> c->chrSrcVSubSample);\r
+ c->chrDstW = -((-dstW) >> c->chrDstHSubSample);\r
+ c->chrDstH = -((-dstH) >> c->chrDstVSubSample);\r
+ c->chrXInc = (((int64_t)c->chrSrcW << 16) + (c->chrDstW >> 1)) / c->chrDstW;\r
+ c->chrYInc = (((int64_t)c->chrSrcH << 16) + (c->chrDstH >> 1)) / c->chrDstH;\r
+\r
+ const int filterAlign = 1;\r
+\r
+ if (initScaleFilter(&c->hLumFilter, &c->hLumFilterPos,\r
+ &c->hLumFilterSize, c->lumXInc,\r
+ srcW, dstW, filterAlign, 1 << 14,\r
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,\r
+ cpu_flags, srcFilter->lumH, dstFilter->lumH,\r
+ c->param) < 0)\r
+ goto fail;\r
+\r
+ if (initScaleFilter(&c->hChrFilter, &c->hChrFilterPos,\r
+ &c->hChrFilterSize, c->chrXInc,\r
+ c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,\r
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,\r
+ cpu_flags, srcFilter->chrH, dstFilter->chrH,\r
+ c->param) < 0)\r
+ goto fail;\r
+\r
+ if (initScaleFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize,\r
+ c->lumYInc, srcH, dstH, filterAlign, (1 << 12),\r
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,\r
+ cpu_flags, srcFilter->lumV, dstFilter->lumV,\r
+ c->param) < 0)\r
+ goto fail;\r
+\r
+ if (initScaleFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize,\r
+ c->chrYInc, c->chrSrcH, c->chrDstH,\r
+ filterAlign, (1 << 12),\r
+ (flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,\r
+ cpu_flags, srcFilter->chrV, dstFilter->chrV,\r
+ c->param) < 0)\r
+ goto fail;\r
+ return 0;\r
+fail:\r
+ return -1;\r
+}\r
+\r
+ScaleContext *scale_getContext(int srcW, int srcH, enum PixelFormat srcFormat,\r
+ int dstW, int dstH, enum PixelFormat dstFormat,\r
+ int flags, ScaleFilter *srcFilter,\r
+ ScaleFilter *dstFilter, const double *param)\r
+{\r
+ ScaleContext *sc = (ScaleContext*)malloc(sizeof(ScaleContext));\r
+ sc->flags = flags;\r
+ sc->srcW = srcW;\r
+ sc->srcH = srcH;\r
+ sc->dstW = dstW;\r
+ sc->dstH = dstH;\r
+ sc->srcRange = handle_scale_jpeg(&srcFormat);\r
+ sc->dstRange = handle_scale_jpeg(&dstFormat);\r
+ sc->srcFormat = srcFormat;\r
+ sc->dstFormat = dstFormat;\r
+ sc->hyscale_fast = 0;\r
+ sc->hcscale_fast = 0;\r
+\r
+ if (param) \r
+ {\r
+ sc->param[0] = param[0];\r
+ sc->param[1] = param[1];\r
+ } \r
+\r
+ if (scale_init_context(sc, srcFilter, dstFilter) < 0) \r
+ { \r
+ sws_freeContext(sc);\r
+ return NULL;\r
+ }\r
+ return sc;\r
+}\r
+\r
+int scale_opencl(ScaleContext *c, void *cl_inbuf, void *cl_outbuf, int *srcStride, int *dstStride)\r
+{\r
+ int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);\r
+ av_scale_frame(c,cl_outbuf,cl_inbuf,srcStride,dstStride,&should_dither);\r
+ return 1;\r
+}\r
+\r
+void scale_init( int width, int height, int dstwidth, int dstheight )\r
+{\r
+ int srcW = width;\r
+ int srcH = height;\r
+ int dstW = dstwidth;\r
+ int dstH = dstheight;\r
+ enum PixelFormat inputfmt = AV_PIX_FMT_YUV420P;\r
+ enum PixelFormat outputfmt = AV_PIX_FMT_YUV420P;\r
+ int flags = SWS_BILINEAR;\r
+ g_scale = scale_getContext(srcW,srcH,inputfmt,dstW,dstH,outputfmt,flags,NULL,NULL,NULL);\r
+}\r
+\r
+void scale_release()\r
+{\r
+ sws_freeContext( g_scale );\r
+}\r
+#ifdef USE_OPENCL\r
+int scale_run( cl_mem inbuf, cl_mem outbuf, int linesizey, int linesizeuv, int height )\r
+{\r
+ g_scale->cl_src = inbuf;\r
+ g_scale->cl_dst = outbuf;\r
+\r
+ int src_stride[4] = { linesizey, linesizeuv, linesizeuv, 0 };\r
+ int dst_stride[4] = { g_scale->dstW, g_scale->chrDstW, g_scale->chrDstW, 0 };\r
+ int ret = -1;\r
+\r
+ ret = scale_opencl( g_scale, inbuf, outbuf, src_stride, dst_stride );\r
+\r
+ return ret;\r
+}\r
+#endif\r
+#endif\r
--- /dev/null
+/* scale.h\r
+\r
+ Copyright (c) 2003-2012 HandBrake Team\r
+ This file is part of the HandBrake source code\r
+ Homepage: <http://handbrake.fr/>.\r
+ It may be used under the terms of the GNU General Public License v2.\r
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html\r
+\r
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+\r
+\r
+ */\r
+ \r
+#ifndef SCALE_H\r
+#define SCALE_H\r
+#ifdef USE_OPENCL\r
+#include <stdint.h>\r
+#include "vadxva2.h"\r
+#include "libavutil/pixfmt.h"\r
+#include "hbffmpeg.h"\r
+\r
+#define YUVRGB_TABLE_HEADROOM 128\r
+#define MAX_FILTER_SIZE 256\r
+#define is16BPS(x) \\r
+ (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 15)\r
+\r
+#define is9_OR_10BPS(x) \\r
+ (av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 8 || \\r
+ av_pix_fmt_descriptors[x].comp[0].depth_minus1 == 9)\r
+\r
+#if ARCH_X86_64\r
+# define APCK_PTR2 8\r
+# define APCK_COEF 16\r
+# define APCK_SIZE 24\r
+#else\r
+# define APCK_PTR2 4\r
+# define APCK_COEF 8\r
+# define APCK_SIZE 16\r
+#endif\r
+\r
+typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,\r
+ const uint8_t *dither, int offset);\r
+\r
+typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,\r
+ const int16_t **src, uint8_t *dest, int dstW,\r
+ const uint8_t *dither, int offset);\r
+\r
+typedef void (*yuv2interleavedX_fn)(struct ScaleContext *c,\r
+ const int16_t *chrFilter,\r
+ int chrFilterSize,\r
+ const int16_t **chrUSrc,\r
+ const int16_t **chrVSrc,\r
+ uint8_t *dest, int dstW);\r
+\r
+typedef void (*yuv2packed1_fn)(struct ScaleContext *c, const int16_t *lumSrc,\r
+ const int16_t *chrUSrc[2],\r
+ const int16_t *chrVSrc[2],\r
+ const int16_t *alpSrc, uint8_t *dest,\r
+ int dstW, int uvalpha, int y);\r
+\r
+typedef void (*yuv2packed2_fn)(struct SCaleContext *c, const int16_t *lumSrc[2],\r
+ const int16_t *chrUSrc[2],\r
+ const int16_t *chrVSrc[2],\r
+ const int16_t *alpSrc[2],\r
+ uint8_t *dest,\r
+ int dstW, int yalpha, int uvalpha, int y);\r
+\r
+typedef void (*yuv2packedX_fn)(struct SCaleContext *c, const int16_t *lumFilter,\r
+ const int16_t **lumSrc, int lumFilterSize,\r
+ const int16_t *chrFilter,\r
+ const int16_t **chrUSrc,\r
+ const int16_t **chrVSrc, int chrFilterSize,\r
+ const int16_t **alpSrc, uint8_t *dest,\r
+ int dstW, int y);\r
+\r
+typedef int (*SwsFunc)(struct ScaleContext *context, const uint8_t *src[],\r
+ int srcStride[], int srcSliceY, int srcSliceH,\r
+ uint8_t *dst[], int dstStride[]);\r
+\r
+typedef struct {\r
+ double *coeff; ///< pointer to the list of coefficients\r
+ int length; ///< number of coefficients in the vector\r
+} ScaleVector;\r
+\r
+typedef struct {\r
+ ScaleVector *lumH;\r
+ ScaleVector *lumV;\r
+ ScaleVector *chrH;\r
+ ScaleVector *chrV;\r
+} ScaleFilter;\r
+\r
+typedef struct ScaleContext {\r
+ SwsFunc swScale;\r
+ int srcW; ///< Width of source luma/alpha planes.\r
+ int srcH; ///< Height of source luma/alpha planes.\r
+ int dstH; ///< Height of destination luma/alpha planes.\r
+ int chrSrcW; ///< Width of source chroma planes.\r
+ int chrSrcH; ///< Height of source chroma planes.\r
+ int chrDstW; ///< Width of destination chroma planes.\r
+ int chrDstH; ///< Height of destination chroma planes.\r
+ int lumXInc, chrXInc;\r
+ int lumYInc, chrYInc;\r
+ enum PixelFormat dstFormat; ///< Destination pixel format.\r
+ enum PixelFormat srcFormat; ///< Source pixel format.\r
+ int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format.\r
+ int srcFormatBpp; ///< Number of bits per pixel of the source pixel format.\r
+ int dstBpc, srcBpc;\r
+ int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image.\r
+ int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image.\r
+ int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.\r
+ int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image.\r
+ int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.\r
+ int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).\r
+ double param[2]; ///< Input parameters for scaling algorithms that need them.\r
+\r
+ uint32_t pal_yuv[256];\r
+ uint32_t pal_rgb[256];\r
+\r
+ int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.\r
+ int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.\r
+ int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.\r
+ int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.\r
+ int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.\r
+ int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.\r
+ int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.\r
+ int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.\r
+ int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.\r
+ int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.\r
+\r
+ uint8_t *formatConvBuffer;\r
+ int16_t *hLumFilter; ///< Array of horizontal filter coefficients for luma/alpha planes.\r
+ int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.\r
+ int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.\r
+ int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.\r
+ int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.\r
+ int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.\r
+ int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.\r
+ int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.\r
+ int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.\r
+ int hChrFilterSize; ///< Horizontal filter size for chroma pixels.\r
+ int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.\r
+ int vChrFilterSize; ///< Vertical filter size for chroma pixels.\r
+\r
+ int lumMmx2FilterCodeSize; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code size for luma/alpha planes.\r
+ int chrMmx2FilterCodeSize; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code size for chroma planes.\r
+ uint8_t *lumMmx2FilterCode; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code for luma/alpha planes.\r
+ uint8_t *chrMmx2FilterCode; ///< Runtime-generated MMX2 horizontal fast bilinear scaler code for chroma planes.\r
+\r
+ int canMMX2BeUsed;\r
+\r
+ unsigned char *dest;\r
+ unsigned char *source;\r
+\r
+ int dstY; ///< Last destination vertical line output from last slice.\r
+ int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...\r
+ void *yuvTable; ///<s pointer to the yuv->rgb table start so it can be freed()\r
+ uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];\r
+ uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];\r
+ int table_gV[256 + 2*YUVRGB_TABLE_HEADROOM];\r
+ uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];\r
+\r
+ //Colorspace stuff\r
+ int contrast, brightness, saturation; // for sws_getColorspaceDetails\r
+ int srcColorspaceTable[4];\r
+ int dstColorspaceTable[4];\r
+ int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image).\r
+ int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).\r
+ int src0Alpha;\r
+ int dst0Alpha;\r
+ int yuv2rgb_y_offset;\r
+ int yuv2rgb_y_coeff;\r
+ int yuv2rgb_v2r_coeff;\r
+ int yuv2rgb_v2g_coeff;\r
+ int yuv2rgb_u2g_coeff;\r
+ int yuv2rgb_u2b_coeff;\r
+\r
+#define RED_DITHER "0*8"\r
+#define GREEN_DITHER "1*8"\r
+#define BLUE_DITHER "2*8"\r
+#define Y_COEFF "3*8"\r
+#define VR_COEFF "4*8"\r
+#define UB_COEFF "5*8"\r
+#define VG_COEFF "6*8"\r
+#define UG_COEFF "7*8"\r
+#define Y_OFFSET "8*8"\r
+#define U_OFFSET "9*8"\r
+#define V_OFFSET "10*8"\r
+#define LUM_MMX_FILTER_OFFSET "11*8"\r
+#define CHR_MMX_FILTER_OFFSET "11*8+4*4*256"\r
+#define DSTW_OFFSET "11*8+4*4*256*2" //do not change, it is hardcoded in the ASM\r
+#define ESP_OFFSET "11*8+4*4*256*2+8"\r
+#define VROUNDER_OFFSET "11*8+4*4*256*2+16"\r
+#define U_TEMP "11*8+4*4*256*2+24"\r
+#define V_TEMP "11*8+4*4*256*2+32"\r
+#define Y_TEMP "11*8+4*4*256*2+40"\r
+#define ALP_MMX_FILTER_OFFSET "11*8+4*4*256*2+48"\r
+#define UV_OFF_PX "11*8+4*4*256*3+48"\r
+#define UV_OFF_BYTE "11*8+4*4*256*3+56"\r
+#define DITHER16 "11*8+4*4*256*3+64"\r
+#define DITHER32 "11*8+4*4*256*3+80"\r
+\r
+ DECLARE_ALIGNED(8, uint64_t, redDither);\r
+ DECLARE_ALIGNED(8, uint64_t, greenDither);\r
+ DECLARE_ALIGNED(8, uint64_t, blueDither);\r
+\r
+ DECLARE_ALIGNED(8, uint64_t, yCoeff);\r
+ DECLARE_ALIGNED(8, uint64_t, vrCoeff);\r
+ DECLARE_ALIGNED(8, uint64_t, ubCoeff);\r
+ DECLARE_ALIGNED(8, uint64_t, vgCoeff);\r
+ DECLARE_ALIGNED(8, uint64_t, ugCoeff);\r
+ DECLARE_ALIGNED(8, uint64_t, yOffset);\r
+ DECLARE_ALIGNED(8, uint64_t, uOffset);\r
+ DECLARE_ALIGNED(8, uint64_t, vOffset);\r
+ int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];\r
+ int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];\r
+ int dstW; ///< Width of destination luma/alpha planes.\r
+ DECLARE_ALIGNED(8, uint64_t, esp);\r
+ DECLARE_ALIGNED(8, uint64_t, vRounder);\r
+ DECLARE_ALIGNED(8, uint64_t, u_temp);\r
+ DECLARE_ALIGNED(8, uint64_t, v_temp);\r
+ DECLARE_ALIGNED(8, uint64_t, y_temp);\r
+ int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];\r
+\r
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes\r
+ DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes\r
+ DECLARE_ALIGNED(8, uint16_t, dither16)[8];\r
+ DECLARE_ALIGNED(8, uint32_t, dither32)[8];\r
+\r
+ const uint8_t *chrDither8, *lumDither8;\r
+\r
+#if HAVE_ALTIVEC\r
+ vector signed short CY;\r
+ vector signed short CRV;\r
+ vector signed short CBU;\r
+ vector signed short CGU;\r
+ vector signed short CGV;\r
+ vector signed short OY;\r
+ vector unsigned short CSHIFT;\r
+ vector signed short *vYCoeffsBank, *vCCoeffsBank;\r
+#endif\r
+\r
+#if ARCH_BFIN\r
+ DECLARE_ALIGNED(4, uint32_t, oy);\r
+ DECLARE_ALIGNED(4, uint32_t, oc);\r
+ DECLARE_ALIGNED(4, uint32_t, zero);\r
+ DECLARE_ALIGNED(4, uint32_t, cy);\r
+ DECLARE_ALIGNED(4, uint32_t, crv);\r
+ DECLARE_ALIGNED(4, uint32_t, rmask);\r
+ DECLARE_ALIGNED(4, uint32_t, cbu);\r
+ DECLARE_ALIGNED(4, uint32_t, bmask);\r
+ DECLARE_ALIGNED(4, uint32_t, cgu);\r
+ DECLARE_ALIGNED(4, uint32_t, cgv);\r
+ DECLARE_ALIGNED(4, uint32_t, gmask);\r
+#endif\r
+\r
+#if HAVE_VIS\r
+ DECLARE_ALIGNED(8, uint64_t, sparc_coeffs)[10];\r
+#endif\r
+ int use_mmx_vfilter;\r
+\r
+ /* function pointers for swScale() */\r
+ yuv2planar1_fn yuv2plane1;\r
+ yuv2planarX_fn yuv2planeX;\r
+ yuv2interleavedX_fn yuv2nv12cX;\r
+ yuv2packed1_fn yuv2packed1;\r
+ yuv2packed2_fn yuv2packed2;\r
+ yuv2packedX_fn yuv2packedX;\r
+\r
+ /// Unscaled conversion of luma plane to YV12 for horizontal scaler.\r
+ void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,\r
+ int width, uint32_t *pal);\r
+ /// Unscaled conversion of alpha plane to YV12 for horizontal scaler.\r
+ void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,\r
+ int width, uint32_t *pal);\r
+ /// Unscaled conversion of chroma planes to YV12 for horizontal scaler.\r
+ void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,\r
+ const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,\r
+ int width, uint32_t *pal);\r
+\r
+ void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width);\r
+ void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],\r
+ int width);\r
+\r
+ void (*hyscale_fast)(struct SwsContext *c,\r
+ int16_t *dst, int dstWidth,\r
+ const uint8_t *src, int srcW, int xInc);\r
+ void (*hcscale_fast)(struct SwsContext *c,\r
+ int16_t *dst1, int16_t *dst2, int dstWidth,\r
+ const uint8_t *src1, const uint8_t *src2,\r
+ int srcW, int xInc);\r
+\r
+ void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,\r
+ const uint8_t *src, const int16_t *filter,\r
+ const int32_t *filterPos, int filterSize);\r
+ void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,\r
+ const uint8_t *src, const int16_t *filter,\r
+ const int32_t *filterPos, int filterSize);\r
+\r
+ void (*lumConvertRange)(int16_t *dst, int width);\r
+ void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);\r
+\r
+ int needs_hcscale; ///< Set if there are chroma planes to be converted.\r
+\r
+ cl_mem cl_hLumFilter;\r
+ cl_mem cl_hLumFilterPos;\r
+ cl_mem cl_hChrFilter;\r
+ cl_mem cl_hChrFilterPos;\r
+ cl_mem cl_vLumFilter;\r
+ cl_mem cl_vLumFilterPos;\r
+ cl_mem cl_vChrFilter;\r
+ cl_mem cl_vChrFilterPos;\r
+\r
+ cl_mem cl_intermediaBuf;\r
+\r
+ cl_mem cl_src;\r
+ cl_mem cl_dst;\r
+} ScaleContext;\r
+\r
+void scale_init(int, int, int, int);\r
+void scale_release();\r
+int scale_run(cl_mem inbuf, cl_mem outbuf, int linesizey, int linesizeuv, int height);\r
+#endif\r
+#endif\r
--- /dev/null
+/* scale_kernel.h\r
+\r
+ Copyright (c) 2003-2012 HandBrake Team\r
+ This file is part of the HandBrake source code\r
+ Homepage: <http://handbrake.fr/>.\r
+ It may be used under the terms of the GNU General Public License v2.\r
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html\r
+\r
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>\r
+\r
+\r
+ */\r
+ \r
+#ifdef USE_OPENCL\r
+#include <assert.h>\r
+#include <math.h>\r
+#include <stdio.h>\r
+#include <string.h>\r
+#include <time.h>\r
+#include "scale.h"\r
+#include "openclwrapper.h"\r
+\r
+#define OCLCHECK( method, ...) \\r
+ status = method(__VA_ARGS__); if(status != CL_SUCCESS) { \\r
+ hb_log(" error %s %d",# method, status); assert(0); return status; }\r
+ \r
+#define CREATEBUF( out, flags, size, ptr)\\r
+ out = clCreateBuffer( kenv->context, (flags), (size), ptr, &status );\\r
+ if( status != CL_SUCCESS ) { hb_log( "clCreateBuffer faild %d", status ); return -1; }\r
+\r
+ #define CL_PARAM_NUM 20\r
+\r
+/****************************************************************************************************************************/\r
+/*************************Combine the hscale and yuv2plane into scaling******************************************************/\r
+/****************************************************************************************************************************/\r
+static int CreateCLBuffer( ScaleContext *c, KernelEnv *kenv )\r
+{\r
+ cl_int status;\r
+\r
+ if(!c->hyscale_fast || !c->hcscale_fast)\r
+ {\r
+ CREATEBUF(c->cl_hLumFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstW*c->hLumFilterSize*sizeof(cl_short), c->hLumFilter);\r
+ CREATEBUF(c->cl_hLumFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstW*sizeof(cl_int), c->hLumFilterPos);\r
+ CREATEBUF(c->cl_hChrFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstW*c->hChrFilterSize*sizeof(cl_short), c->hChrFilter);\r
+ CREATEBUF(c->cl_hChrFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstW*sizeof(cl_int), c->hChrFilterPos);\r
+ }\r
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )\r
+ {\r
+ CREATEBUF(c->cl_vLumFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstH*c->vLumFilterSize*sizeof(cl_short), c->vLumFilter);\r
+ CREATEBUF(c->cl_vChrFilter, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstH*c->vChrFilterSize*sizeof(cl_short), c->vChrFilter);\r
+ }\r
+ CREATEBUF(c->cl_vLumFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->dstH*sizeof(cl_int), c->vLumFilterPos);\r
+ CREATEBUF(c->cl_vChrFilterPos, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, c->chrDstH*sizeof(cl_int), c->vChrFilterPos);\r
+ \r
+ return 1;\r
+}\r
+\r
+int av_scale_frame_func( void **userdata, KernelEnv *kenv )\r
+{\r
+ ScaleContext *c = (ScaleContext *)userdata[0];\r
+\r
+ c->cl_src = (cl_mem)userdata[2];\r
+ c->cl_dst = (cl_mem)userdata[1];\r
+\r
+ /*frame size*/\r
+ int *tmp = (int *)userdata[3];\r
+ int srcStride = tmp[0];\r
+ int srcChrStride = tmp[1];\r
+ int srcW = c->srcW;\r
+ int srcH = c->srcH;\r
+ \r
+ tmp = (int *)userdata[4];\r
+ int dstStride = tmp[0];\r
+ int dstChrStride = tmp[1];\r
+ int dstW = c->dstW;\r
+ int dstH = c->dstH;\r
+ \r
+ /* local variable */\r
+ cl_int status;\r
+ size_t global_work_size[2];\r
+\r
+ int intermediaSize;\r
+\r
+ int st = CreateCLBuffer(c,kenv);\r
+ if( !st )\r
+ {\r
+ hb_log( "CreateBuffer[%s] faild %d", "scale_opencl",st );\r
+ return -1;\r
+ }\r
+\r
+ intermediaSize = dstStride * srcH + dstChrStride * srcH;\r
+\r
+ CREATEBUF(c->cl_intermediaBuf, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, intermediaSize*sizeof(cl_short), NULL);\r
+\r
+ static int init_chr_status = 0;\r
+ static cl_kernel chr_kernel;\r
+\r
+ if(init_chr_status == 0){\r
+\r
+ if(!(c->flags & 1))\r
+ {\r
+ chr_kernel = clCreateKernel( kenv->program, "hscale_all_opencl", NULL );\r
+ //Set the Kernel Argument;\r
+ OCLCHECK(clSetKernelArg,chr_kernel, 2, sizeof(cl_mem), (void*)&c->cl_hLumFilter);\r
+ OCLCHECK(clSetKernelArg,chr_kernel, 3, sizeof(cl_mem), (void*)&c->cl_hLumFilterPos);\r
+ OCLCHECK(clSetKernelArg,chr_kernel, 4, sizeof(int), (void*)&c->hLumFilterSize);\r
+ OCLCHECK(clSetKernelArg,chr_kernel, 5, sizeof(cl_mem), (void*)&c->cl_hChrFilter);\r
+ OCLCHECK(clSetKernelArg,chr_kernel, 6, sizeof(cl_mem), (void*)&c->cl_hChrFilterPos);\r
+ OCLCHECK(clSetKernelArg,chr_kernel, 7, sizeof(int), (void*)&c->hChrFilterSize);\r
+ }\r
+ \r
+ /*Set the arguments*/\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 8, sizeof(dstW), (void*)&dstW);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 9, sizeof(srcH), (void*)&srcH);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 10, sizeof(srcW), (void*)&srcW);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 11, sizeof(srcH), (void*)&srcH);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 12, sizeof(dstStride), (void*)&dstStride);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 13, sizeof(dstChrStride), (void*)&dstChrStride);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 14, sizeof(srcStride), (void*)&srcStride);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 15, sizeof(srcChrStride), (void*)&srcChrStride);\r
+ init_chr_status = 1;\r
+ }\r
+\r
+ kenv->kernel = chr_kernel;\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 0, sizeof(cl_mem), (void*)&c->cl_intermediaBuf);\r
+ OCLCHECK(clSetKernelArg, chr_kernel, 1, sizeof(cl_mem), (void*)&c->cl_src);\r
+ /*Run the Kernel*/\r
+ global_work_size[0] = c->chrDstW;//dstW >> 1; //must times 256;\r
+ global_work_size[1] = c->chrSrcH;\r
+\r
+ OCLCHECK(clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);\r
+\r
+ static int init_lum_status = 0;\r
+ static cl_kernel lum_kernel;\r
+\r
+ if( init_lum_status == 0 ){\r
+ //Vertical:\r
+ /*Create Kernel*/\r
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )\r
+ lum_kernel = clCreateKernel( kenv->program, "vscale_all_nodither_opencl", NULL );\r
+ else\r
+ lum_kernel = clCreateKernel( kenv->program, "vscale_fast_opencl", NULL );\r
+\r
+ if( c->vLumFilterSize > 1 && c->vChrFilterSize > 1 )\r
+ {\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 2, sizeof(cl_mem), (void*)&c->cl_vLumFilter);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 3, sizeof(int), (void*)&c->vLumFilterSize);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 4, sizeof(cl_mem), (void*)&c->cl_vChrFilter);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 5, sizeof(int), (void*)&c->vChrFilterSize);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 6, sizeof(cl_mem), (void*)&c->cl_vLumFilterPos);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 7, sizeof(cl_mem), (void*)&c->cl_vChrFilterPos);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 8, sizeof(dstW), (void*)&dstW);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 9, sizeof(dstH), (void*)&dstH);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 10, sizeof(srcW), (void*)&srcW);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 11, sizeof(srcH), (void*)&srcH);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 12, sizeof(dstStride), (void*)&dstStride);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 13, sizeof(dstChrStride), (void*)&dstChrStride);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 14, sizeof(dstStride), (void*)&dstStride);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 15, sizeof(dstChrStride), (void*)&dstChrStride);\r
+ }\r
+ else\r
+ {\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 2, sizeof(cl_mem), (void*)&c->cl_vLumFilterPos);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 3, sizeof(cl_mem), (void*)&c->cl_vChrFilterPos);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 4, sizeof(dstW), (void*)&dstW);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 5, sizeof(dstH), (void*)&dstH);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 6, sizeof(srcW), (void*)&srcW);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 7, sizeof(srcH), (void*)&srcH);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 8, sizeof(dstStride), (void*)&dstStride);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 9, sizeof(dstChrStride), (void*)&dstChrStride);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 10, sizeof(dstStride), (void*)&dstStride);\r
+ OCLCHECK(clSetKernelArg, lum_kernel, 11, sizeof(dstChrStride), (void*)&dstChrStride);\r
+ }\r
+ init_lum_status = 1;\r
+ }\r
+ \r
+ kenv->kernel = lum_kernel;\r
+ OCLCHECK(clSetKernelArg, kenv->kernel, 0, sizeof(cl_mem), (void*)&c->cl_dst);\r
+ OCLCHECK(clSetKernelArg, kenv->kernel, 1, sizeof(cl_mem), (void*)&c->cl_intermediaBuf);\r
+ \r
+ /*Run the Kernel*/\r
+ global_work_size[0] = c->chrDstW;\r
+ global_work_size[1] = c->chrDstH;\r
+\r
+ OCLCHECK(clEnqueueNDRangeKernel, kenv->command_queue, kenv->kernel, 2, NULL,global_work_size, NULL, 0, NULL, NULL);\r
+\r
+ clReleaseMemObject( c->cl_intermediaBuf );\r
+ \r
+ return 1;\r
+}\r
+\r
+void av_scale_frame(ScaleContext *c, void *dst, void *src, int *srcStride, int *dstStride, int *should_dither)\r
+{\r
+ \r
+ static int regflg = 0;\r
+ void *userdata[CL_PARAM_NUM];\r
+ userdata[0] = (void *)c;\r
+ userdata[1] = (void *)dst;\r
+ userdata[2] = (void *)src;\r
+ userdata[3] = (void *)srcStride;\r
+ userdata[4] = (void *)dstStride;\r
+ userdata[5] = (void *)should_dither;\r
+\r
+ if( regflg==0 )\r
+ {\r
+ int st = hb_register_kernel_wrapper( "scale_opencl", av_scale_frame_func);\r
+ if( !st )\r
+ {\r
+ hb_log( "register kernel[%s] faild %d", "scale_opencl",st );\r
+ return;\r
+ }\r
+ regflg++;\r
+ }\r
+ \r
+ if( !hb_run_kernel( "scale_opencl", userdata ))\r
+ {\r
+ hb_log("run kernel function[%s] faild", "scale_opencl_func" );\r
+ return;\r
+ } \r
+}\r
+\r
+#endif\r
--- /dev/null
+/* scale_kernel.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
+
+
+ */
+
+#ifndef _H_SCALE_KERNEL_H
+#define _H_SCALE_KERNEL_H
+#ifdef USE_OPENCL
+void av_scale_frame(ScaleContext *c, void *dst, void *src, int *srcStride, int *dstStride, int *should_dither);
+#endif
+#endif
#include "lang.h"
#include "a52dec/a52.h"
#include "libbluray/bluray.h"
+#include "vadxva2.h"
#define min(a, b) a < b ? a : b
#define HB_MAX_PROBE_SIZE (1*1024*1024)
static const stream2codec_t st2codec[256] = {
st(0x00, U, 0, 0, NULL),
st(0x01, V, WORK_DECMPEG2, 0, "MPEG1"),
- st(0x02, V, WORK_DECMPEG2, 0, "MPEG2"),
+ st(0x02, V, WORK_DECMPEG2, AV_CODEC_ID_MPEG2VIDEO, "MPEG2"),
st(0x03, A, HB_ACODEC_FFMPEG, AV_CODEC_ID_MP2, "MPEG1"),
st(0x04, A, HB_ACODEC_FFMPEG, AV_CODEC_ID_MP2, "MPEG2"),
st(0x05, N, 0, 0, "ISO 13818-1 private section"),
if ( fread(buf, 1, sizeof(buf), stream->file_handle) == sizeof(buf) )
{
+#ifdef USE_HWD
+ if ( hb_gui_use_hwd_flag == 1 )
+ return 0;
+#endif
int psize;
if ( ( psize = hb_stream_check_for_ts(buf) ) != 0 )
{
{
hb_log( "transport stream missing PCRs - using video DTS instead" );
}
-
+#ifdef USE_HWD
+ hb_va_dxva2_t * dxva2 = NULL;
+ dxva2 = hb_va_create_dxva2( dxva2, title->video_codec_param );
+ if ( dxva2 )
+ {
+ title->hwd_support = 1;
+ hb_va_close(dxva2);
+ dxva2 = NULL;
+ }
+ else
+ title->hwd_support = 0;
+#else
+ title->hwd_support = 0;
+#endif
+#ifdef USE_OPENCL
+ if ( hb_confirm_gpu_type() == 0 )
+ title->opencl_support = 1;
+ else
+ title->opencl_support = 0;
+#else
+ title->opencl_support = 0;
+#endif
// Height, width, rate and aspect ratio information is filled in
// when the previews are built
return title;
title->demuxer = HB_NULL_DEMUXER;
title->video_codec = 0;
int i;
+ int pix_fmt = -1;
for (i = 0; i < ic->nb_streams; ++i )
{
if ( ic->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO &&
title->video_codec == 0 )
{
AVCodecContext *context = ic->streams[i]->codec;
+ pix_fmt = context->pix_fmt;
if ( context->pix_fmt != AV_PIX_FMT_YUV420P &&
!sws_isSupportedInput( context->pix_fmt ) )
{
chapter->seconds = title->seconds;
hb_list_add( title->list_chapter, chapter );
}
+#ifdef USE_HWD
+ hb_va_dxva2_t * dxva2 = NULL;
+ dxva2 = hb_va_create_dxva2( dxva2, title->video_codec_param );
+ if (dxva2)
+ {
+ title->hwd_support = 1;
+ hb_va_close(dxva2);
+ dxva2 = NULL;
+ }
+ else
+ title->hwd_support = 0;
+ if ( hb_check_hwd_fmt(pix_fmt) == 0)
+ title->hwd_support = 0;
+#else
+ title->hwd_support = 0;
+#endif
+#ifdef USE_OPENCL
+ if (hb_confirm_gpu_type() == 0)
+ title->opencl_support = 1;
+ else
+ title->opencl_support = 0;
+#else
+ title->opencl_support = 0;
+#endif
return title;
}
--- /dev/null
+/* vadxva2.c
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
+
+ */
+#include "vadxva2.h"
+
+#ifdef USE_OPENCL
+#if defined(__APPLE__)
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#include "oclnv12toyuv.h"
+#include "scale.h"
+#endif
+
+#ifdef USE_HWD
+
+static int hb_va_setup( hb_va_dxva2_t *dxva2, void **hw, int width, int height );
+static int hb_va_get( hb_va_dxva2_t *dxva2, AVFrame *frame );
+static int hb_d3d_create_device( hb_va_dxva2_t *dxva2 );
+static void hb_d3d_destroy_device( hb_va_dxva2_t *dxvva2 );
+static int hb_d3d_create_device_manager( hb_va_dxva2_t *dxva2 );
+static void hb_d3d_destroy_device_manager( hb_va_dxva2_t *dxva2 );
+static int hb_dx_create_video_service( hb_va_dxva2_t *dxva2 );
+static void hb_dx_destroy_video_service( hb_va_dxva2_t *dxva2 );
+static int hb_dx_find_video_service_conversion( hb_va_dxva2_t *dxva2, GUID *input, D3DFORMAT *output );
+static int hb_dx_create_video_decoder( hb_va_dxva2_t *dxva2, int codec_id, const hb_title_t* fmt );
+static void hb_dx_create_video_conversion( hb_va_dxva2_t *dxva2 );
+static const hb_d3d_format_t *hb_d3d_find_format( D3DFORMAT format );
+static const hb_dx_mode_t *hb_dx_find_mode( const GUID *guid );
+static void hb_dx_destroy_video_decoder( hb_va_dxva2_t *dxva2 );
+/**
+ * It destroys a Direct3D device manager
+ */
+static void hb_d3d_destroy_device_manager( hb_va_dxva2_t *dxva2 )
+{
+ if( dxva2->devmng )
+ IDirect3DDeviceManager9_Release( dxva2->devmng );
+}
+/**
+ * It releases a Direct3D device and its resources.
+ */
+static void hb_d3d_destroy_device( hb_va_dxva2_t *dxva2 )
+{
+ if( dxva2->d3ddev )
+ IDirect3DDevice9_Release( dxva2->d3ddev );
+ if( dxva2->d3dobj )
+ IDirect3D9_Release( dxva2->d3dobj );
+}
+/**
+ * It destroys a DirectX video service
+ */
+static void hb_dx_destroy_video_service( hb_va_dxva2_t *dxva2 )
+{
+ if( dxva2->device )
+ IDirect3DDeviceManager9_CloseDeviceHandle( dxva2->devmng, dxva2->device );
+
+ if( dxva2->vs )
+ IDirectXVideoDecoderService_Release( dxva2->vs );
+}
+
+static const hb_d3d_format_t *hb_d3d_find_format( D3DFORMAT format )
+{
+ unsigned i;
+ for( i = 0; d3d_formats[i].name; i++ )
+ {
+ if( d3d_formats[i].format == format )
+ return &d3d_formats[i];
+ }
+ return NULL;
+}
+
+static void hb_dx_create_video_conversion( hb_va_dxva2_t *dxva2 )
+{
+ switch( dxva2->render )
+ {
+ case MAKEFOURCC( 'N', 'V', '1', '2' ):
+ dxva2->output = MAKEFOURCC( 'Y', 'V', '1', '2' );
+ break;
+ default:
+ dxva2->output = dxva2->render;
+ break;
+ }
+}
+
+void hb_va_release( hb_va_dxva2_t *dxva2, AVFrame *frame )
+{
+ LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)frame->data[3];
+ unsigned i;
+ for( i = 0; i < dxva2->surface_count; i++ )
+ {
+ hb_va_surface_t *surface = &dxva2->surface[i];
+ if( surface->d3d == d3d )
+ surface->refcount--;
+ }
+}
+
+
+void hb_va_close( hb_va_dxva2_t *dxva2 )
+{
+ hb_dx_destroy_video_decoder( dxva2 );
+ hb_dx_destroy_video_service( dxva2 );
+ hb_d3d_destroy_device_manager( dxva2 );
+ hb_d3d_destroy_device( dxva2 );
+
+ if( dxva2->hdxva2_dll )
+ FreeLibrary( dxva2->hdxva2_dll );
+ if( dxva2->hd3d9_dll )
+ FreeLibrary( dxva2->hd3d9_dll );
+
+#ifdef USE_OPENCL
+ if ( dxva2->nv12toyuv_tmp_in )
+ free( dxva2->nv12toyuv_tmp_in );
+ if ( dxva2->nv12toyuv_tmp_out )
+ free( dxva2->nv12toyuv_tmp_out );
+#endif
+ dxva2->description = NULL;
+ free( dxva2 );
+}
+
+/**
+ * It creates a DXVA2 decoder using the given video format
+ */
+static int hb_dx_create_video_decoder( hb_va_dxva2_t *dxva2, int codec_id, const hb_title_t* fmt )
+{
+ dxva2->width = fmt->width;
+ dxva2->height = fmt->height;
+ dxva2->surface_width = (fmt->width + 15) & ~15;
+ dxva2->surface_height = (fmt->height + 15) & ~15;
+ switch( codec_id )
+ {
+ case AV_CODEC_ID_H264:
+ dxva2->surface_count = 16 + 1;
+ break;
+ default:
+ dxva2->surface_count = 2 + 1;
+ break;
+ }
+ LPDIRECT3DSURFACE9 surface_list[VA_DXVA2_MAX_SURFACE_COUNT];
+ if( FAILED( IDirectXVideoDecoderService_CreateSurface( dxva2->vs,
+ dxva2->surface_width,
+ dxva2->surface_height,
+ dxva2->surface_count - 1,
+ dxva2->render,
+ D3DPOOL_DEFAULT,
+ 0,
+ DXVA2_VideoDecoderRenderTarget,
+ surface_list, NULL )))
+ {
+ hb_log( "dxva2:IDirectXVideoAccelerationService_CreateSurface failed" );
+ dxva2->surface_count = 0;
+ return HB_WORK_ERROR;
+ }
+
+ unsigned i;
+ for( i = 0; i<dxva2->surface_count; i++ )
+ {
+ hb_va_surface_t *surface = &dxva2->surface[i];
+ surface->d3d = surface_list[i];
+ surface->refcount = 0;
+ surface->order = 0;
+ }
+ hb_log( "dxva2:CreateSurface succeed with %d, fmt (%dx%d) surfaces (%dx%d)", dxva2->surface_count,
+ fmt->width, fmt->height, dxva2->surface_width, dxva2->surface_height );
+ DXVA2_VideoDesc dsc;
+ memset( &dsc, 0, sizeof(dsc));
+ dsc.SampleWidth = fmt->width;
+ dsc.SampleHeight = fmt->height;
+ dsc.Format = dxva2->render;
+
+ if( fmt->rate> 0 && fmt->rate_base> 0 )
+ {
+ dsc.InputSampleFreq.Numerator = fmt->rate;
+ dsc.InputSampleFreq.Denominator = fmt->rate_base;
+ }
+ else
+ {
+ dsc.InputSampleFreq.Numerator = 0;
+ dsc.InputSampleFreq.Denominator = 0;
+ }
+
+ dsc.OutputFrameFreq = dsc.InputSampleFreq;
+ dsc.UABProtectionLevel = FALSE;
+ dsc.Reserved = 0;
+
+ /* FIXME I am unsure we can let unknown everywhere */
+ DXVA2_ExtendedFormat *ext = &dsc.SampleFormat;
+ ext->SampleFormat = 0; //DXVA2_SampleUnknown;
+ ext->VideoChromaSubsampling = 0; //DXVA2_VideoChromaSubsampling_Unknown;
+ ext->NominalRange = 0; //DXVA2_NominalRange_Unknown;
+ ext->VideoTransferMatrix = 0; //DXVA2_VideoTransferMatrix_Unknown;
+ ext->VideoLighting = 0; //DXVA2_VideoLighting_Unknown;
+ ext->VideoPrimaries = 0; //DXVA2_VideoPrimaries_Unknown;
+ ext->VideoTransferFunction = 0; //DXVA2_VideoTransFunc_Unknown;
+
+ /* List all configurations available for the decoder */
+ UINT cfg_count = 0;
+ DXVA2_ConfigPictureDecode *cfg_list = NULL;
+ if( FAILED( IDirectXVideoDecoderService_GetDecoderConfigurations( dxva2->vs, &dxva2->input, &dsc, NULL, &cfg_count, &cfg_list )))
+ {
+ hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderConfigurations failed" );
+ return HB_WORK_ERROR;
+ }
+ hb_log( "dxva2:we got %d decoder configurations", cfg_count );
+
+ /* Select the best decoder configuration */
+ int cfg_score = 0;
+ for( i = 0; i < cfg_count; i++ )
+ {
+ const DXVA2_ConfigPictureDecode *cfg = &cfg_list[i];
+ hb_log( "dxva2:configuration[%d] ConfigBitstreamRaw %d", i, cfg->ConfigBitstreamRaw );
+ int score;
+ if( cfg->ConfigBitstreamRaw == 1 )
+ score = 1;
+ else if( codec_id == AV_CODEC_ID_H264 && cfg->ConfigBitstreamRaw == 2 )
+ score = 2;
+ else
+ continue;
+ if( IsEqualGUID( &cfg->guidConfigBitstreamEncryption, &DXVA_NoEncrypt ))
+ score += 16;
+ if( cfg_score < score )
+ {
+ dxva2->cfg = *cfg;
+ cfg_score = score;
+ }
+ }
+ //my_release(cfg_list);
+ if( cfg_score <= 0 )
+ {
+ hb_log( "dxva2:Failed to find a supported decoder configuration" );
+ return HB_WORK_ERROR;
+ }
+
+ /* Create the decoder */
+ IDirectXVideoDecoder *decoder;
+ if( FAILED( IDirectXVideoDecoderService_CreateVideoDecoder( dxva2->vs, &dxva2->input, &dsc, &dxva2->cfg, surface_list, dxva2->surface_count, &decoder )))
+ {
+ hb_log( "dxva2:IDirectXVideoDecoderService_CreateVideoDecoder failed" );
+ return HB_WORK_ERROR;
+ }
+ dxva2->decoder = decoder;
+ hb_log( "dxva2:IDirectXVideoDecoderService_CreateVideoDecoder succeed" );
+ return HB_WORK_OK;
+}
+
+typedef HWND (WINAPI *PROCGETSHELLWND)();
+/**
+ * It creates a DirectX video service
+ */
+static int hb_d3d_create_device( hb_va_dxva2_t *dxva2 )
+{
+ LPDIRECT3D9 (WINAPI *Create9)( UINT SDKVersion );
+ Create9 = (void*)GetProcAddress( dxva2->hd3d9_dll, TEXT( "Direct3DCreate9" ));
+ if( !Create9 )
+ {
+ hb_log( "dxva2:Cannot locate reference to Direct3DCreate9 ABI in DLL" );
+ return HB_WORK_ERROR;
+ }
+ LPDIRECT3D9 d3dobj;
+ d3dobj = Create9( D3D_SDK_VERSION );
+ if( !d3dobj )
+ {
+ hb_log( "dxva2:Direct3DCreate9 failed" );
+ return HB_WORK_ERROR;
+ }
+ dxva2->d3dobj = d3dobj;
+ D3DADAPTER_IDENTIFIER9 *d3dai = &dxva2->d3dai;
+ if( FAILED( IDirect3D9_GetAdapterIdentifier( dxva2->d3dobj, D3DADAPTER_DEFAULT, 0, d3dai )))
+ {
+ hb_log( "dxva2:IDirect3D9_GetAdapterIdentifier failed" );
+ memset( d3dai, 0, sizeof(*d3dai));
+ }
+
+ PROCGETSHELLWND GetShellWindow;
+ HMODULE hUser32 = GetModuleHandle( "user32" );
+ GetShellWindow = (PROCGETSHELLWND)
+ GetProcAddress( hUser32, "GetShellWindow" );
+
+ D3DPRESENT_PARAMETERS *d3dpp = &dxva2->d3dpp;
+ memset( d3dpp, 0, sizeof(*d3dpp));
+ d3dpp->Flags = D3DPRESENTFLAG_VIDEO;
+ d3dpp->Windowed = TRUE;
+ d3dpp->hDeviceWindow = NULL;
+ d3dpp->SwapEffect = D3DSWAPEFFECT_DISCARD;
+ d3dpp->MultiSampleType = D3DMULTISAMPLE_NONE;
+ d3dpp->PresentationInterval = D3DPRESENT_INTERVAL_DEFAULT;
+ d3dpp->BackBufferCount = 0; /* FIXME what to put here */
+ d3dpp->BackBufferFormat = D3DFMT_X8R8G8B8; /* FIXME what to put here */
+ d3dpp->BackBufferWidth = 0;
+ d3dpp->BackBufferHeight = 0;
+ d3dpp->EnableAutoDepthStencil = FALSE;
+
+ LPDIRECT3DDEVICE9 d3ddev;
+ //if (FAILED(IDirect3D9_CreateDevice(d3dobj, D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL, GetShellWindow(), D3DCREATE_SOFTWARE_VERTEXPROCESSING|D3DCREATE_MULTITHREADED, d3dpp, &d3ddev)))
+ if( FAILED( IDirect3D9_CreateDevice( d3dobj,
+ D3DADAPTER_DEFAULT,
+ D3DDEVTYPE_HAL,
+ GetShellWindow(),
+ D3DCREATE_HARDWARE_VERTEXPROCESSING|D3DCREATE_MULTITHREADED,
+ d3dpp,
+ &d3ddev )))
+ {
+ hb_log( "dxva2:IDirect3D9_CreateDevice failed" );
+ return HB_WORK_ERROR;
+ }
+ dxva2->d3ddev = d3ddev;
+
+ return HB_WORK_OK;
+}
+/**
+ * It creates a Direct3D device manager
+ */
+static int hb_d3d_create_device_manager( hb_va_dxva2_t *dxva2 )
+{
+ HRESULT(WINAPI *CreateDeviceManager9)( UINT *pResetToken, IDirect3DDeviceManager9 ** );
+ CreateDeviceManager9 = (void*)GetProcAddress( dxva2->hdxva2_dll, TEXT( "DXVA2CreateDirect3DDeviceManager9" ));
+
+ if( !CreateDeviceManager9 )
+ {
+ hb_log( "dxva2:cannot load function" );
+ return HB_WORK_ERROR;
+ }
+
+ UINT token;
+ IDirect3DDeviceManager9 *devmng;
+ if( FAILED( CreateDeviceManager9( &token, &devmng )))
+ {
+ hb_log( "dxva2:OurDirect3DCreateDeviceManager9 failed" );
+ return HB_WORK_ERROR;
+ }
+ dxva2->token = token;
+ dxva2->devmng = devmng;
+
+ long hr = IDirect3DDeviceManager9_ResetDevice( devmng, dxva2->d3ddev, token );
+ if( FAILED( hr ))
+ {
+ hb_log( "dxva2:IDirect3DDeviceManager9_ResetDevice failed: %08x", (unsigned)hr );
+ return HB_WORK_ERROR;
+ }
+ return HB_WORK_OK;
+}
+/**
+ * It creates a DirectX video service
+ */
+static int hb_dx_create_video_service( hb_va_dxva2_t *dxva2 )
+{
+ HRESULT (WINAPI *CreateVideoService)( IDirect3DDevice9 *, REFIID riid, void **ppService );
+ CreateVideoService = (void*)GetProcAddress( dxva2->hdxva2_dll, TEXT( "DXVA2CreateVideoService" ));
+
+ if( !CreateVideoService )
+ {
+ hb_log( "dxva2:cannot load function" );
+ return HB_WORK_ERROR;
+ }
+
+ HRESULT hr;
+
+ HANDLE device;
+ hr = IDirect3DDeviceManager9_OpenDeviceHandle( dxva2->devmng, &device );
+ if( FAILED( hr ))
+ {
+ hb_log( "dxva2:OpenDeviceHandle failed" );
+ return HB_WORK_ERROR;
+ }
+ dxva2->device = device;
+
+ IDirectXVideoDecoderService *vs;
+ hr = IDirect3DDeviceManager9_GetVideoService( dxva2->devmng, device, &IID_IDirectXVideoDecoderService, (void*)&vs );
+ if( FAILED( hr ))
+ {
+ hb_log( "dxva2:GetVideoService failed" );
+ return HB_WORK_ERROR;
+ }
+ dxva2->vs = vs;
+
+ return HB_WORK_OK;
+}
+/**
+ * Find the best suited decoder mode GUID and render format.
+ */
+static int hb_dx_find_video_service_conversion( hb_va_dxva2_t *dxva2, GUID *input, D3DFORMAT *output )
+{
+ unsigned int input_count = 0;
+ GUID *input_list = NULL;
+ if( FAILED( IDirectXVideoDecoderService_GetDecoderDeviceGuids( dxva2->vs, &input_count, &input_list )))
+ {
+ hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderDeviceGuids failed" );
+ return HB_WORK_ERROR;
+ }
+ unsigned i, j;
+ for( i = 0; i < input_count; i++ )
+ {
+ const GUID *g = &input_list[i];
+ const hb_dx_mode_t *mode = hb_dx_find_mode( g );
+ }
+
+ for( i = 0; dxva2_modes[i].name; i++ )
+ {
+ const hb_dx_mode_t *mode = &dxva2_modes[i];
+ if( !mode->codec || mode->codec != dxva2->codec_id )
+ continue;
+
+ int is_suported = 0;
+ const GUID *g;
+ for( g = &input_list[0]; !is_suported && g < &input_list[input_count]; g++ )
+ {
+ is_suported = IsEqualGUID( mode->guid, g );
+ }
+ if( !is_suported )
+ continue;
+
+ unsigned int output_count = 0;
+ D3DFORMAT *output_list = NULL;
+ if( FAILED( IDirectXVideoDecoderService_GetDecoderRenderTargets( dxva2->vs, mode->guid, &output_count, &output_list )))
+ {
+ hb_log( "dxva2:IDirectXVideoDecoderService_GetDecoderRenderTargets failed" );
+ continue;
+ }
+ for( j = 0; j < output_count; j++ )
+ {
+ const D3DFORMAT f = output_list[j];
+ const hb_d3d_format_t *format = hb_d3d_find_format( f );
+ if( format )
+ {
+ //hb_log( "dxva2:%s is supported for output", format->name );
+ }
+ else
+ {
+ hb_log( "dxvar2:%d is supported for output (%4.4s)", f, (const char*)&f );
+ }
+ }
+
+ for( j = 0; d3d_formats[j].name; j++ )
+ {
+ const hb_d3d_format_t *format = &d3d_formats[j];
+ int is_suported = 0;
+ unsigned k;
+ for( k = 0; !is_suported && k < output_count; k++ )
+ {
+ is_suported = format->format == output_list[k];
+ }
+ if( !is_suported )
+ continue;
+ *input = *mode->guid;
+ *output = format->format;
+ return HB_WORK_OK;
+ }
+ }
+ return HB_WORK_ERROR;
+}
+static const hb_dx_mode_t *hb_dx_find_mode( const GUID *guid )
+{
+ unsigned i;
+ for( i = 0; dxva2_modes[i].name; i++ )
+ {
+ if( IsEqualGUID( dxva2_modes[i].guid, guid ))
+ return &dxva2_modes[i];
+ }
+ return NULL;
+}
+
+
+static void hb_dx_destroy_video_decoder( hb_va_dxva2_t *dxva2 )
+{
+ if( dxva2->decoder )
+ IDirectXVideoDecoder_Release( dxva2->decoder );
+ dxva2->decoder = NULL;
+
+ unsigned i;
+ for( i = 0; i<dxva2->surface_count; i++ )
+ IDirect3DSurface9_Release( dxva2->surface[i].d3d );
+ dxva2->surface_count = 0;
+}
+/**
+ * setup dxva2
+*/
+static int hb_va_setup( hb_va_dxva2_t *dxva2, void **hw, int width, int height )
+{
+ if( dxva2->width == width && dxva2->height == height && dxva2->decoder )
+ goto ok;
+
+ hb_dx_destroy_video_decoder( dxva2 );
+ *hw = NULL;
+ dxva2->i_chroma = 0;
+
+ if( width <= 0 || height <= 0 )
+ return HB_WORK_ERROR;
+
+ hb_title_t fmt;
+ memset( &fmt, 0, sizeof(fmt));
+ fmt.width = width;
+ fmt.height = height;
+
+ if( hb_dx_create_video_decoder( dxva2, dxva2->codec_id, &fmt ) == HB_WORK_ERROR )
+ return HB_WORK_ERROR;
+ dxva2->hw.decoder = dxva2->decoder;
+ dxva2->hw.cfg = &dxva2->cfg;
+ dxva2->hw.surface_count = dxva2->surface_count;
+ dxva2->hw.surface = dxva2->hw_surface;
+
+ unsigned i;
+ for( i = 0; i < dxva2->surface_count; i++ )
+ dxva2->hw.surface[i] = dxva2->surface[i].d3d;
+
+ hb_dx_create_video_conversion( dxva2 );
+
+ok:
+ *hw = &dxva2->hw;
+ const hb_d3d_format_t *output = hb_d3d_find_format( dxva2->output );
+ dxva2->i_chroma = output->codec;
+ return HB_WORK_OK;
+
+}
+
+static int hb_va_get( hb_va_dxva2_t *dxva2, AVFrame *frame )
+{
+ unsigned i, old;
+ for( i = 0, old = 0; i < dxva2->surface_count; i++ )
+ {
+ hb_va_surface_t *surface = &dxva2->surface[i];
+ if( !surface->refcount )
+ break;
+ if( surface->order < dxva2->surface[old].order )
+ old = i;
+ }
+ if( i >= dxva2->surface_count )
+ i = old;
+
+ hb_va_surface_t *surface = &dxva2->surface[i];
+
+ surface->refcount = 1;
+ surface->order = dxva2->surface_order++;
+
+ for( i = 0; i < 4; i++ )
+ {
+ frame->data[i] = NULL;
+ frame->linesize[i] = 0;
+ if( i == 0 || i == 3 )
+ frame->data[i] = (void*)surface->d3d;
+ }
+ return HB_WORK_OK;
+}
+/**
+ * nv12 to yuv of c reference
+ */
+static void hb_copy_from_nv12( uint8_t *dst, uint8_t *src[2], size_t src_pitch[2], unsigned width, unsigned height )
+{
+ unsigned int i, j;
+ uint8_t *dstU, *dstV;
+ dstU = dst + width*height;
+ dstV = dstU + width*height/4;
+ unsigned int heithtUV, widthUV;
+ heithtUV = height/2;
+ widthUV = width/2;
+
+ for( i = 0; i < height; i++ ) //Y
+ {
+ memcpy( dst + i * width, src[0] + i * src_pitch[0], width );
+ }
+ for( i = 0; i < heithtUV; i++ )
+ {
+ for( j = 0; j < widthUV; j++ )
+ {
+ dstU[i * widthUV + j] = *(src[1] + i * src_pitch[1] + 2 * j);
+ dstV[i *widthUV + j] = *(src[1] + i * src_pitch[1] + 2 * j + 1);
+ }
+ }
+}
+
+#ifdef USE_OPENCL
+void hb_init_filter( cl_mem src, int srcwidth, int srcheight, uint8_t* dst, int dstwidth, int dstheight, int *crop )
+{
+ T_FilterLink fl = {0};
+ int STEP = srcwidth * srcheight * 3 / 2;
+ int OUTSTEP = dstwidth * dstheight * 3 / 2;
+ int HEIGHT = srcheight;
+ int LINESIZEY = srcwidth;
+ int LINESIZEUV = srcwidth / 2;
+
+ cl_mem cl_outbuf;
+
+ if( !hb_create_buffer( &(cl_outbuf), CL_MEM_WRITE_ONLY, OUTSTEP ) )
+ {
+ hb_log("av_create_buffer cl_outbuf Error");
+ return;
+ }
+
+ fl.cl_outbuf = cl_outbuf;
+
+ scale_run( src, fl.cl_outbuf, LINESIZEY, LINESIZEUV, HEIGHT );
+
+ hb_read_opencl_buffer( fl.cl_outbuf, dst, OUTSTEP );
+ CL_FREE( cl_outbuf );
+
+ return;
+}
+#endif
+/**
+ * lock frame data form surface.
+ * nv12 to yuv with opencl and with C reference
+ * scale with opencl
+ */
+int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w, int job_h, int *crop, hb_oclscale_t *os, int use_opencl, int use_decomb, int use_detelecine )
+
+{
+ LPDIRECT3DSURFACE9 d3d = (LPDIRECT3DSURFACE9)(uintptr_t)frame->data[3];
+ D3DLOCKED_RECT lock;
+ if( FAILED( IDirect3DSurface9_LockRect( d3d, &lock, NULL, D3DLOCK_READONLY )))
+ {
+ hb_log( "dxva2:Failed to lock surface" );
+ return HB_WORK_ERROR;
+ }
+
+ if( dxva2->render == MAKEFOURCC( 'N', 'V', '1', '2' ))
+ {
+ uint8_t *plane[2] =
+ {
+ lock.pBits,
+ (uint8_t*)lock.pBits + lock.Pitch * dxva2->surface_height
+ };
+ size_t pitch[2] =
+ {
+ lock.Pitch,
+ lock.Pitch,
+ };
+
+ hb_copy_from_nv12( dst, plane, pitch, dxva2->width, dxva2->height );
+ }
+ IDirect3DSurface9_UnlockRect( d3d );
+ return HB_WORK_OK;
+}
+
+/**
+ * create dxva2 service
+ * load library D3D9.dll
+ */
+hb_va_dxva2_t * hb_va_create_dxva2( hb_va_dxva2_t *dxva2, int codec_id )
+{
+ if( dxva2 )
+ {
+ hb_va_close( dxva2 );
+ dxva2 = NULL;
+ }
+
+ hb_va_dxva2_t *dxva = calloc( 1, sizeof(*dxva) );
+ if( !dxva ) return NULL;
+ dxva->codec_id = codec_id;
+
+ dxva->hd3d9_dll = LoadLibrary( TEXT( "D3D9.DLL" ) );
+ if( !dxva->hd3d9_dll )
+ {
+ hb_log( "dxva2:cannot load d3d9.dll" );
+ goto error;
+ }
+ dxva->hdxva2_dll = LoadLibrary( TEXT( "DXVA2.DLL" ) );
+ if( !dxva->hdxva2_dll )
+ {
+ hb_log( "dxva2:cannot load DXVA2.dll" );
+ goto error;
+ }
+
+ if( hb_d3d_create_device( dxva ) == HB_WORK_ERROR )
+ {
+ hb_log( "dxva2:Failed to create Direct3D device" );
+ goto error;
+ }
+
+ if( hb_d3d_create_device_manager( dxva ) == HB_WORK_ERROR )
+ {
+ hb_log( "dxva2:D3dCreateDeviceManager failed" );
+ goto error;
+ }
+
+
+ if( hb_dx_create_video_service( dxva ) == HB_WORK_ERROR )
+ {
+ hb_log( "dxva2:DxCreateVideoService failed" );
+ goto error;
+ }
+
+ if( hb_dx_find_video_service_conversion( dxva, &dxva->input, &dxva->render ) == HB_WORK_ERROR )
+ {
+ hb_log( "dxva2:DxFindVideoServiceConversion failed" );
+ goto error;
+ }
+
+ dxva->do_job = HB_WORK_OK;
+ dxva->description = "DXVA2";
+
+ return dxva;
+
+error:
+ hb_va_close( dxva );
+ return NULL;
+}
+
+void hb_va_new_dxva2( hb_va_dxva2_t *dxva2, AVCodecContext *p_context )
+{
+ if( p_context->width > 0 && p_context->height > 0 )
+ {
+ if( hb_va_setup( dxva2, &p_context->hwaccel_context, p_context->width, p_context->height ) == HB_WORK_ERROR )
+ {
+ hb_log( "dxva2:hb_va_Setup failed" );
+ hb_va_close( dxva2 );
+ dxva2 = NULL;
+ }
+ }
+ if( dxva2 )
+ {
+ dxva2->input_pts[0] = 0;
+ dxva2->input_pts[1] = 0;
+ if( dxva2->description )
+ hb_log( "dxva2:Using %s for hardware decoding", dxva2->description );
+ p_context->draw_horiz_band = NULL;
+ }
+
+}
+
+char* hb_get_pix_fmt_name( int pix_fmt )
+{
+ static const char *ppsz_name[AV_PIX_FMT_NB] =
+ {
+ [AV_PIX_FMT_VDPAU_H264] = "AV_PIX_FMT_VDPAU_H264",
+ [AV_PIX_FMT_VAAPI_IDCT] = "AV_PIX_FMT_VAAPI_IDCT",
+ [AV_PIX_FMT_VAAPI_VLD] = "AV_PIX_FMT_VAAPI_VLD",
+ [AV_PIX_FMT_VAAPI_MOCO] = "AV_PIX_FMT_VAAPI_MOCO",
+ [AV_PIX_FMT_DXVA2_VLD] = "AV_PIX_FMT_DXVA2_VLD",
+ [AV_PIX_FMT_YUYV422] = "AV_PIX_FMT_YUYV422",
+ [AV_PIX_FMT_YUV420P] = "AV_PIX_FMT_YUV420P",
+ };
+
+ return ppsz_name[pix_fmt];
+}
+
+enum PixelFormat hb_ffmpeg_get_format( AVCodecContext *p_context, const enum PixelFormat *pi_fmt )
+{
+ int i;
+ for( i = 0; pi_fmt[i] != AV_PIX_FMT_NONE; i++ )
+ {
+ hb_log( "dxva2:Available decoder output format %d (%s)", pi_fmt[i], hb_get_pix_fmt_name(pi_fmt[i]) ? : "Unknown" );
+ if( pi_fmt[i] == AV_PIX_FMT_DXVA2_VLD )
+ {
+ return pi_fmt[i];
+ }
+ }
+ return avcodec_default_get_format( p_context, pi_fmt );
+}
+
+int hb_va_get_frame_buf( hb_va_dxva2_t *dxva2, AVCodecContext *p_context, AVFrame *frame )
+{
+ frame->type = FF_BUFFER_TYPE_USER;
+ if( hb_va_get( dxva2, frame ) == HB_WORK_ERROR )
+ {
+ hb_log( "VaGrabSurface failed" );
+ return HB_WORK_ERROR;
+ }
+ return HB_WORK_OK;
+
+}
+
+int hb_check_hwd_fmt( int fmt )
+{
+ int result = 1;
+ switch ( fmt )
+ {
+ case AV_PIX_FMT_YUV420P16LE:
+ case AV_PIX_FMT_YUV420P16BE:
+ case AV_PIX_FMT_YUV422P16LE:
+ case AV_PIX_FMT_YUV422P16BE:
+ case AV_PIX_FMT_YUV444P16LE:
+ case AV_PIX_FMT_YUV444P16BE:
+ case AV_PIX_FMT_YUV420P9BE:
+ case AV_PIX_FMT_YUV420P9LE:
+ case AV_PIX_FMT_YUV420P10BE:
+ case AV_PIX_FMT_YUV420P10LE:
+ case AV_PIX_FMT_YUV422P10BE:
+ case AV_PIX_FMT_YUV422P10LE:
+ case AV_PIX_FMT_YUV444P9BE:
+ case AV_PIX_FMT_YUV444P9LE:
+ case AV_PIX_FMT_YUV444P10BE:
+ case AV_PIX_FMT_YUV444P10LE:
+ case AV_PIX_FMT_YUV422P9BE:
+ case AV_PIX_FMT_YUV422P9LE:
+ case AV_PIX_FMT_GBRP9BE:
+ case AV_PIX_FMT_GBRP9LE:
+ case AV_PIX_FMT_GBRP10BE:
+ case AV_PIX_FMT_GBRP10LE:
+ case AV_PIX_FMT_GBRP16BE:
+ case AV_PIX_FMT_GBRP16LE:
+ case AV_PIX_FMT_YUVA420P9BE:
+ case AV_PIX_FMT_YUVA420P9LE:
+ case AV_PIX_FMT_YUVA422P9BE:
+ case AV_PIX_FMT_YUVA422P9LE:
+ case AV_PIX_FMT_YUVA444P9BE:
+ case AV_PIX_FMT_YUVA444P9LE:
+ case AV_PIX_FMT_YUVA420P10BE:
+ case AV_PIX_FMT_YUVA420P10LE:
+ result = 0;
+ }
+ return result;
+}
+#endif
--- /dev/null
+/* vadxva2.h
+
+ Copyright (c) 2003-2012 HandBrake Team
+ This file is part of the HandBrake source code
+ Homepage: <http://handbrake.fr/>.
+ It may be used under the terms of the GNU General Public License v2.
+ For full terms see the file COPYING file or visit http://www.gnu.org/licenses/gpl-2.0.html
+
+ Authors: Peng Gao <peng@multicorewareinc.com> <http://www.multicorewareinc.com/>
+ Li Cao <li@multicorewareinc.com> <http://www.multicorewareinc.com/>
+
+ */
+
+#ifndef VA_DXVA2_H
+#define VA_DXVA2_H
+
+#ifdef USE_HWD
+#include "hbffmpeg.h"
+#include "d3d9.h"
+#include "libavcodec/dxva2.h"
+#include "dxva2api.h"
+#include "common.h"
+#include "openclwrapper.h"
+
+#define HB_FOURCC( a, b, c, d ) ( ((uint32_t)a) | ( ((uint32_t)b) << 8 ) | ( ((uint32_t)c) << 16 ) | ( ((uint32_t)d) << 24 ) )
+#define MAKEFOURCC( a, b, c, d ) ((DWORD)(BYTE)(a) | ((DWORD)(BYTE)(b) << 8) | ((DWORD)(BYTE)(c) << 16) | ((DWORD)(BYTE)(d) << 24 ))
+#define HB_CODEC_YV12 HB_FOURCC( 'Y', 'V', '1', '2' )
+#define HB_CODEC_NV12 HB_FOURCC( 'N', 'V', '1', '2' )
+#define DXVA2_E_NOT_INITIALIZED MAKE_HRESULT( 1, 4, 4096 )
+#define DXVA2_E_NEW_VIDEO_DEVICE MAKE_HRESULT( 1, 4, 4097 )
+#define DXVA2_E_VIDEO_DEVICE_LOCKED MAKE_HRESULT( 1, 4, 4098 )
+#define DXVA2_E_NOT_AVAILABLE MAKE_HRESULT( 1, 4, 4099 )
+#define VA_DXVA2_MAX_SURFACE_COUNT (64)
+
+static const GUID DXVA_NoEncrypt = { 0x1b81bed0, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID IID_IDirectXVideoDecoderService = {0xfc51a551, 0xd5e7, 0x11d9, {0xaf, 0x55, 0x00, 0x05, 0x4e, 0x43, 0xff, 0x02}};
+static const GUID DXVA2_ModeMPEG2_MoComp = { 0xe6a9f44b, 0x61b0, 0x4563, {0x9e, 0xa4, 0x63, 0xd2, 0xa3, 0xc6, 0xfe, 0x66} };
+static const GUID DXVA2_ModeMPEG2_IDCT = { 0xbf22ad00, 0x03ea, 0x4690, {0x80, 0x77, 0x47, 0x33, 0x46, 0x20, 0x9b, 0x7e} };
+static const GUID DXVA2_ModeMPEG2_VLD = { 0xee27417f, 0x5e28, 0x4e65, {0xbe, 0xea, 0x1d, 0x26, 0xb5, 0x08, 0xad, 0xc9} };
+static const GUID DXVA2_ModeH264_A = { 0x1b81be64, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeH264_B = { 0x1b81be65, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeH264_C = { 0x1b81be66, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeH264_D = { 0x1b81be67, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeH264_E = { 0x1b81be68, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeH264_F = { 0x1b81be69, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVADDI_Intel_ModeH264_A = { 0x604F8E64, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} };
+static const GUID DXVADDI_Intel_ModeH264_C = { 0x604F8E66, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} };
+static const GUID DXVADDI_Intel_ModeH264_E = { 0x604F8E68, 0x4951, 0x4c54, {0x88, 0xFE, 0xAB, 0xD2, 0x5C, 0x15, 0xB3, 0xD6} };
+static const GUID DXVA2_ModeWMV8_A = { 0x1b81be80, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeWMV8_B = { 0x1b81be81, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeWMV9_A = { 0x1b81be90, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeWMV9_B = { 0x1b81be91, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeWMV9_C = { 0x1b81be94, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeVC1_A = { 0x1b81beA0, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeVC1_B = { 0x1b81beA1, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeVC1_C = { 0x1b81beA2, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+static const GUID DXVA2_ModeVC1_D = { 0x1b81beA3, 0xa0c7, 0x11d3, {0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5} };
+
+typedef struct
+{
+ int width;
+ int height;
+ int rate;
+ int rate_base;
+
+}hb_dx_format;
+
+typedef struct
+{
+ LPDIRECT3DSURFACE9 d3d;
+ int refcount;
+ unsigned int order;
+
+} hb_va_surface_t;
+
+typedef struct
+{
+ uint8_t *base;
+ uint8_t *buffer;
+ size_t size;
+
+} hb_copy_cache_t;
+
+typedef struct
+{
+ const char *name;
+ D3DFORMAT format;
+ uint32_t codec;
+
+} hb_d3d_format_t;
+
+typedef struct
+{
+ const char *name;
+ const GUID *guid;
+ int codec;
+} hb_dx_mode_t;
+
+typedef struct
+{
+ char *description;
+ int codec_id;
+ uint32_t i_chroma;
+ int width;
+ int height;
+ HINSTANCE hd3d9_dll;
+ HINSTANCE hdxva2_dll;
+ D3DPRESENT_PARAMETERS d3dpp;
+ LPDIRECT3D9 d3dobj;
+ D3DADAPTER_IDENTIFIER9 d3dai;
+ LPDIRECT3DDEVICE9 d3ddev;
+ UINT token;
+ IDirect3DDeviceManager9 *devmng;
+ HANDLE device;
+ IDirectXVideoDecoderService *vs;
+ GUID input;
+ D3DFORMAT render;
+ DXVA2_ConfigPictureDecode cfg;
+ IDirectXVideoDecoder *decoder;
+ D3DFORMAT output;
+ struct dxva_context hw;
+ unsigned surface_count;
+ unsigned surface_order;
+ int surface_width;
+ int surface_height;
+ uint32_t surface_chroma;
+ hb_va_surface_t surface[VA_DXVA2_MAX_SURFACE_COUNT];
+ LPDIRECT3DSURFACE9 hw_surface[VA_DXVA2_MAX_SURFACE_COUNT];
+ IDirectXVideoProcessorService *ps;
+ IDirectXVideoProcessor *vp;
+ int64_t input_pts[2];
+ int64_t input_dts;
+ int do_job;
+
+ // running nv12toyuv kernel.
+#ifdef USE_OPENCL
+ cl_kernel nv12toyuv;
+ cl_mem cl_mem_nv12;
+ cl_mem cl_mem_yuv;
+ uint8_t * nv12toyuv_tmp_in;
+ uint8_t * nv12toyuv_tmp_out;
+#endif
+} hb_va_dxva2_t;
+
+typedef struct FilterLink_T
+{
+#ifdef USE_OPENCL
+ cl_mem cl_inbuf;
+ cl_mem cl_outbuf;
+#endif
+ uint8_t *mem_inbuf;
+ uint8_t *mem_outbuf;
+ int width;
+ int height;
+ int linesizeY;
+ int linesizeUV;
+ int inmemdataflag;
+ int outmemdataflag;
+ int incldataflag;
+ int outcldataflag;
+ int framenum;
+ int outputSize;
+} T_FilterLink;
+
+static const hb_d3d_format_t d3d_formats[] =
+{
+ { "YV12", MAKEFOURCC( 'Y', 'V', '1', '2' ), HB_CODEC_YV12 },
+ { "NV12", MAKEFOURCC( 'N', 'V', '1', '2' ), HB_CODEC_NV12 },
+ { NULL, 0, 0 }
+};
+
+static const hb_dx_mode_t dxva2_modes[] =
+{
+ { "DXVA2_ModeMPEG2_VLD", &DXVA2_ModeMPEG2_VLD, AV_CODEC_ID_MPEG2VIDEO },
+ { "DXVA2_ModeMPEG2_MoComp", &DXVA2_ModeMPEG2_MoComp, 0 },
+ { "DXVA2_ModeMPEG2_IDCT", &DXVA2_ModeMPEG2_IDCT, 0 },
+
+ { "H.264 variable-length decoder (VLD), FGT", &DXVA2_ModeH264_F, AV_CODEC_ID_H264 },
+ { "H.264 VLD, no FGT", &DXVA2_ModeH264_E, AV_CODEC_ID_H264 },
+ { "H.264 VLD, no FGT (Intel)", &DXVADDI_Intel_ModeH264_E, AV_CODEC_ID_H264 },
+ { "H.264 IDCT, FGT", &DXVA2_ModeH264_D, 0 },
+ { "H.264 inverse discrete cosine transform (IDCT), no FGT", &DXVA2_ModeH264_C, 0 },
+ { "H.264 inverse discrete cosine transform (IDCT), no FGT (Intel)", &DXVADDI_Intel_ModeH264_C, 0 },
+ { "H.264 MoComp, FGT", &DXVA2_ModeH264_B, 0 },
+ { "H.264 motion compensation (MoComp), no FGT", &DXVA2_ModeH264_A, 0 },
+ { "H.264 motion compensation (MoComp), no FGT (Intel)", &DXVADDI_Intel_ModeH264_A, 0 },
+
+ { "Windows Media Video 8 MoComp", &DXVA2_ModeWMV8_B, 0 },
+ { "Windows Media Video 8 post processing", &DXVA2_ModeWMV8_A, 0 },
+
+ { "Windows Media Video 9 IDCT", &DXVA2_ModeWMV9_C, 0 },
+ { "Windows Media Video 9 MoComp", &DXVA2_ModeWMV9_B, 0 },
+ { "Windows Media Video 9 post processing", &DXVA2_ModeWMV9_A, 0 },
+
+ { "VC-1 VLD", &DXVA2_ModeVC1_D, AV_CODEC_ID_VC1 },
+ { "VC-1 VLD", &DXVA2_ModeVC1_D, AV_CODEC_ID_WMV3 },
+ { "VC-1 IDCT", &DXVA2_ModeVC1_C, 0 },
+ { "VC-1 MoComp", &DXVA2_ModeVC1_B, 0 },
+ { "VC-1 post processing", &DXVA2_ModeVC1_A, 0 },
+
+ { NULL, NULL, 0 }
+};
+
+int hb_va_get_frame_buf( hb_va_dxva2_t *dxva2, AVCodecContext *p_context, AVFrame *frame );
+int hb_va_extract( hb_va_dxva2_t *dxva2, uint8_t *dst, AVFrame *frame, int job_w, int job_h, int *crop, hb_oclscale_t *os, int use_opencl, int use_decomb, int use_detelecine );
+enum PixelFormat hb_ffmpeg_get_format( AVCodecContext *, const enum PixelFormat * );
+hb_va_dxva2_t *hb_va_create_dxva2( hb_va_dxva2_t *dxva2, int codec_id );
+void hb_va_new_dxva2( hb_va_dxva2_t *dxva2, AVCodecContext *p_context );
+void hb_va_release( hb_va_dxva2_t *dxva2, AVFrame *frame );
+void hb_va_close( hb_va_dxva2_t *dxva2 );
+int hb_check_hwd_fmt( int fmt );
+#endif
+#endif
#include "hb.h"
#include "a52dec/a52.h"
#include "libavformat/avformat.h"
+#include "openclwrapper.h"
#ifdef USE_QSV
#include "qsv_common.h"
job->list_work = hb_list_init();
+#ifdef USE_OPENCL
+ /* init opencl environment */
+ if (job->use_opencl)
+ job->use_opencl = !hb_init_opencl_run_env(0, NULL, "-I.");
+#else
+ job->use_opencl = 0;
+#endif
+
hb_log( "starting job" );
+ if (job->use_opencl || job->use_hwd)
+ {
+ hb_log("Using GPU: Yes.");
+ }
+ else
+ {
+ hb_log("Using GPU: No.");
+ }
/* Look for the scanned subtitle in the existing subtitle list
* select_subtitle implies that we did a scan. */
if( !job->indepth_scan && interjob->select_subtitle )
init.pix_fmt = AV_PIX_FMT_YUV420P;
init.width = title->width;
init.height = title->height;
+#ifdef USE_OPENCL
+ init.use_dxva = hb_use_dxva( title );
+#endif
init.par_width = job->anamorphic.par_width;
init.par_height = job->anamorphic.par_height;
memcpy(init.crop, title->crop, sizeof(int[4]));
title->video_codec_param = AV_CODEC_ID_MPEG2VIDEO;
}
#endif
+
hb_list_add( job->list_work, ( w = hb_get_work( vcodec ) ) );
w->codec_param = title->video_codec_param;
w->fifo_in = job->fifo_mpeg2;
if( src && dst && src->s.start == dst->s.start)
{
// restore log below to debug chapter mark propagation problems
- //hb_log("work %s: Copying Chapter Break @ %"PRId64, w->name, src->s.start);
dst->s.new_chap = src->s.new_chap;
}
}
/* Begin PBXBuildFile section */
226268E01572CC7300477B4E /* libavresample.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 226268DF1572CC7300477B4E /* libavresample.a */; };
226268E11572CC7300477B4E /* libavresample.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 226268DF1572CC7300477B4E /* libavresample.a */; };
+ 22D1C457170B2EB0002A7BD4 /* OpenCL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 22D1C456170B2EB0002A7BD4 /* OpenCL.framework */; };
+ 22D1C458170B2EBD002A7BD4 /* OpenCL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 22D1C456170B2EB0002A7BD4 /* OpenCL.framework */; };
273F202314ADB8650021BE6D /* IOKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 273F202214ADB8650021BE6D /* IOKit.framework */; };
273F202614ADB8A40021BE6D /* libz.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 273F202514ADB8A40021BE6D /* libz.dylib */; };
273F202814ADB8BE0021BE6D /* libbz2.dylib in Frameworks */ = {isa = PBXBuildFile; fileRef = 273F202714ADB8BE0021BE6D /* libbz2.dylib */; };
/* Begin PBXFileReference section */
226268DF1572CC7300477B4E /* libavresample.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libavresample.a; path = external/contrib/lib/libavresample.a; sourceTree = BUILT_PRODUCTS_DIR; };
+ 22D1C456170B2EB0002A7BD4 /* OpenCL.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = OpenCL.framework; path = System/Library/Frameworks/OpenCL.framework; sourceTree = SDKROOT; };
271BA4C014B119F800BC1D2C /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist; name = Info.plist; path = external/macosx/Info.plist; sourceTree = BUILT_PRODUCTS_DIR; };
273F1FFF14ADAE950021BE6D /* HandBrakeCLI */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = HandBrakeCLI; sourceTree = BUILT_PRODUCTS_DIR; };
273F202214ADB8650021BE6D /* IOKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = IOKit.framework; path = System/Library/Frameworks/IOKit.framework; sourceTree = SDKROOT; };
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
+ 22D1C457170B2EB0002A7BD4 /* OpenCL.framework in Frameworks */,
273F203014ADB9790021BE6D /* AudioToolbox.framework in Frameworks */,
273F202314ADB8650021BE6D /* IOKit.framework in Frameworks */,
273F203314ADB9F00021BE6D /* CoreServices.framework in Frameworks */,
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
+ 22D1C458170B2EBD002A7BD4 /* OpenCL.framework in Frameworks */,
A9E1467B16BC2ABD00C307BC /* QuartzCore.framework in Frameworks */,
273F21C114ADE7A20021BE6D /* Growl.framework in Frameworks */,
273F21C214ADE7BC0021BE6D /* Sparkle.framework in Frameworks */,
273F1FDE14AD9DA40021BE6D = {
isa = PBXGroup;
children = (
+ 22D1C456170B2EB0002A7BD4 /* OpenCL.framework */,
273F204114ADBC210021BE6D /* HandBrake */,
273F200214ADAE950021BE6D /* HandBrakeCLI */,
273F200014ADAE950021BE6D /* Products */,
MACOSX.map.g.std = debug
MACOSX.map.g.max = debug
+ifeq (1,$(FEATURE.opencl))
+MACOSX.extra_cflags = OTHER_CFLAGS='-DUSE_OPENCL'
+endif
+
ifeq (1,$(FEATURE.fdk_aac))
extra_libs += $(abspath $(BUILD))/contrib/lib/libfdk-aac.a
endif
EXTERNAL_JOBS='$(BUILD.jobs)' \
EXTERNAL_VARS='$(-*-command-variables-*-)' \
\
+ $(MACOSX.extra_cflags) \
$(MACOSX.extra_ldflags) \
\
$(2) )
h = IfHost( 'enable use of Intel Quick Sync Video hardware acceleration', '*-*-*', none=optparse.SUPPRESS_HELP ).value
grp.add_option( '--enable-qsv', default=False, action='store_true', help=h )
+ h = IfHost( 'enable OpenCL features', '*-*-*', none=optparse.SUPPRESS_HELP ).value
+ grp.add_option( '--enable-opencl', default=False, action='store_true', help=h )
+ h = IfHost( 'enable HWD features', '*-*-*', none=optparse.SUPPRESS_HELP ).value
+ grp.add_option( '--enable-hwd', default=False, action='store_true', help=h )
+
h = IfHost( 'enable use of fdk-aac encoder', '*-*-*', none=optparse.SUPPRESS_HELP ).value
grp.add_option( '--enable-fdk-aac', dest="enable_fdk_aac", default=not host.match( '*-*-darwin*' ), action='store_true', help=h )
grp.add_option( '--disable-fdk-aac', dest="enable_fdk_aac", action='store_false' )
doc.add( 'FEATURE.libmkv', int( options.enable_libmkv ))
doc.add( 'FEATURE.avformat', int( options.enable_avformat ))
doc.add( 'FEATURE.qsv', int( options.enable_qsv ))
+ doc.add( 'FEATURE.opencl', int( options.enable_opencl ))
+ doc.add( 'FEATURE.hwd', int( options.enable_hwd ))
doc.add( 'FEATURE.xcode', int( not (Tools.xcodebuild.fail or options.disable_xcode or options.cross) ))
if not Tools.xcodebuild.fail and not options.disable_xcode:
TEST.GCC.I += $(LIBHB.GCC.I)
+ifeq (1,$(FEATURE.opencl))
+ifeq ($(BUILD.system),darwin)
+ TEST.GCC.f += OpenCL
+else
+ TEST.GCC.l += OpenCL
+endif
+ TEST.GCC.D += USE_OPENCL
+endif
+
ifeq ($(BUILD.system),darwin)
TEST.GCC.f += IOKit CoreServices AudioToolbox
TEST.GCC.l += iconv
else ifeq (1-mingw,$(BUILD.cross)-$(BUILD.system))
ifeq ($(HAS.dlfcn),1)
TEST.GCC.l += dl
+endif
+ifeq (1,$(FEATURE.hwd))
+ TEST.GCC.D += USE_HWD
endif
TEST.GCC.l += pthreadGC2 iconv ws2_32
TEST.GCC.D += PTW32_STATIC_LIB
- TEST.GCC.args.extra.exe++ += -static
-endif
+ TEST.GCC.args.extra.exe++ += -static
+endif # (1-mingw,$(BUILD.cross)-$(BUILD.system))
#include "hb.h"
#include "lang.h"
#include "parsecsv.h"
+#include "openclwrapper.h"
#ifdef USE_QSV
#include "qsv_common.h"
static int64_t stop_at_pts = 0;
static int stop_at_frame = 0;
static uint64_t min_title_duration = 10;
+static int use_opencl = 0;
+static int use_hwd = 0;
#ifdef USE_QSV
static int qsv_decode = 1;
static int qsv_async_depth = -1;
/* Init libhb */
h = hb_init( debug, update );
hb_dvd_set_dvdnav( dvdnav );
-
+#ifdef USE_OPENCL
+ if( use_opencl )
+ hb_get_opencl_env();
+#endif
/* Show version */
fprintf( stderr, "%s - %s - %s\n",
HB_PROJECT_TITLE, HB_PROJECT_BUILD_TITLE, HB_PROJECT_URL_WEBSITE );
titleindex = 0;
}
+
hb_system_sleep_prevent(h);
- hb_scan(h, input, titleindex, preview_count, store_previews,
- min_title_duration * 90000LL);
+ hb_gui_use_hwd_flag = use_hwd;
+ hb_scan( h, input, titleindex, preview_count, store_previews, min_title_duration * 90000LL );
/* Wait... */
while( !die )
(float) title->rate / title->rate_base );
fprintf( stderr, " + autocrop: %d/%d/%d/%d\n", title->crop[0],
title->crop[1], title->crop[2], title->crop[3] );
+ if ( title->opencl_support )
+ fprintf( stderr, " + support opencl: yes\n");
+ else
+ fprintf( stderr, " + support opencl: no\n");
+ if ( title->hwd_support )
+ fprintf( stderr, " + support hwd: yes\n");
+ else
+ fprintf( stderr, " + support hwd: no\n");
fprintf( stderr, " + chapters:\n" );
for( i = 0; i < hb_list_count( title->list_chapter ); i++ )
{
die = 1;
break;
}
- if( main_feature )
- {
+ if( main_feature )
+ {
int i;
int main_feature_idx=0;
int main_feature_pos=-1;
}
}
- if ( chapter_markers )
- {
- job->chapter_markers = chapter_markers;
+ if ( chapter_markers )
+ {
+ job->chapter_markers = chapter_markers;
if( marker_file != NULL )
{
hb_close_csv_file( file );
}
}
- }
+ }
if( crop[0] >= 0 && crop[1] >= 0 &&
crop[2] >= 0 && crop[3] >= 0 )
hb_filter_object_t * filter;
+ job->use_detelecine = detelecine;
+ job->use_decomb = decomb;
+
/* Add selected filters */
if( detelecine )
{
job->maxWidth = maxWidth;
if (maxHeight)
job->maxHeight = maxHeight;
+ if (use_hwd)
+ {
+ job->use_hwd = use_hwd;
+ }
switch( anamorphic_mode )
{
filter_str = hb_strdup_printf("%d:%d:%d:%d:%d:%d",
job->width, job->height,
job->crop[0], job->crop[1], job->crop[2], job->crop[3] );
+
filter = hb_filter_init( HB_FILTER_CROP_SCALE );
hb_add_filter( job, filter, filter_str );
free( filter_str );
job->frame_to_start = start_at_frame;
subtitle_scan = 0;
}
-
+#ifdef USE_OPENCL
+ job->use_opencl = use_opencl;
+#else
+ job->use_opencl = 0;
+#endif
if( subtitle_scan )
{
/*
" double quotation marks\n"
" -z, --preset-list See a list of available built-in presets\n"
" --no-dvdnav Do not use dvdnav for reading DVDs\n"
+ " --no-opencl Disable use of OpenCL\n"
"\n"
"### Source Options-----------------------------------------------------------\n\n"
" of data. Note: breaks pre-iOS iPod compatibility.\n"
" -O, --optimize Optimize mp4 files for HTTP streaming (\"fast start\")\n"
" -I, --ipod-atom Mark mp4 files so 5.5G iPods will accept them\n"
+ " -P, --use-opencl Use OpenCL where applicable\n"
+ " -U, --use-hwd Use DXVA2 hardware decoding\n"
"\n"
static char * hb_strndup( char * str, int len )
{
- char * res;
- int str_len = strlen( str );
+ char * res;
+ int str_len = strlen( str );
- res = malloc( len > str_len ? str_len + 1 : len + 1 );
- strncpy( res, str, len );
- res[len] = '\0';
- return res;
+ res = malloc( len > str_len ? str_len + 1 : len + 1 );
+ strncpy( res, str, len );
+ res[len] = '\0';
+ return res;
}
static char** str_split( char *str, char delem )
#define X264_TUNE 284
#define H264_PROFILE 285
#define H264_LEVEL 286
- #define NORMALIZE_MIX 287
- #define AUDIO_DITHER 288
+ #define NO_OPENCL 287
+ #define NORMALIZE_MIX 288
+ #define AUDIO_DITHER 289
#define QSV_BASELINE 289
#define QSV_ASYNC_DEPTH 290
{ "update", no_argument, NULL, 'u' },
{ "verbose", optional_argument, NULL, 'v' },
{ "no-dvdnav", no_argument, NULL, DVDNAV },
+ { "no-opencl", no_argument, NULL, NO_OPENCL },
+
#ifdef USE_QSV
{ "qsv-baseline", no_argument, NULL, QSV_BASELINE },
{ "qsv-async-depth", required_argument, NULL, QSV_ASYNC_DEPTH },
{ "large-file", no_argument, NULL, '4' },
{ "optimize", no_argument, NULL, 'O' },
{ "ipod-atom", no_argument, NULL, 'I' },
+ { "use-opencl", no_argument, NULL, 'P' },
+ { "use-hwd", no_argument, NULL, 'U' },
{ "title", required_argument, NULL, 't' },
{ "min-duration",required_argument, NULL, MIN_DURATION },
cur_optind = optind;
c = getopt_long( argc, argv,
- "hv::uC:f:4i:Io:t:c:m::M:a:A:6:s:UF::N:e:E:Q:C:"
+ "hv::uC:f:4i:Io:PUt:c:m::M:a:A:6:s:F::N:e:E:Q:C:"
"2dD:7895gOw:l:n:b:q:S:B:r:R:x:TY:X:Z:z",
long_options, &option_index );
if( c < 0 )
case 'I':
ipod_atom = 1;
break;
+ case 'P':
+ use_opencl = 1;
+ break;
+ case 'U':
+ use_hwd = 1;
+ break;
case 't':
titleindex = atoi( optarg );
}
break;
}
+ case NO_OPENCL:
+ use_opencl = 0;
+ break;
case ANGLE:
angle = atoi( optarg );
break;