From: Pierre Ossman Date: Mon, 9 Mar 2009 13:34:17 +0000 (+0000) Subject: Add SSE2 SIMD implementation of computationally intensive routines. X-Git-Tag: 0.0.90~132 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=eea72155259c61fe79d7b811330a9be91affca68;p=libjpeg-turbo Add SSE2 SIMD implementation of computationally intensive routines. git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@22 632fc199-4ca6-4c93-a231-07263d6284db --- eea72155259c61fe79d7b811330a9be91affca68 diff --cc Makefile.am index 655e207,0000000..7f67542 mode 100644,000000..100644 --- a/Makefile.am +++ b/Makefile.am @@@ -1,39 -1,0 +1,44 @@@ +noinst_LTLIBRARIES = libjpeg.la + +HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \ + jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h + +libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \ + jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \ + jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \ + jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \ + jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \ + jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c \ + jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \ + jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c \ + jsimd.c + +if WITH_SIMD + +BUILT_SOURCES = simd/jsimdcfg.inc + +EXTRA_DIST = nasm_lt.sh + +libjpeg_la_SOURCES += simd/jsimd.h simd/jsimdcfg.inc.h \ + simd/jsimdext.inc simd/jcolsamp.inc simd/jdct.inc \ + simd/jsimdcpu.asm \ + simd/jccolmmx.asm simd/jdcolmmx.asm \ + simd/jcsammmx.asm simd/jdsammmx.asm simd/jdmermmx.asm \ + simd/jcqntmmx.asm simd/jfmmxfst.asm simd/jfmmxint.asm \ + simd/jimmxred.asm simd/jimmxint.asm simd/jimmxfst.asm \ + simd/jcqnt3dn.asm simd/jf3dnflt.asm simd/ji3dnflt.asm \ - simd/jcqntsse.asm simd/jfsseflt.asm simd/jisseflt.asm ++ simd/jcqntsse.asm simd/jfsseflt.asm simd/jisseflt.asm \ ++ simd/jccolss2.asm simd/jdcolss2.asm \ ++ simd/jcsamss2.asm simd/jdsamss2.asm simd/jdmerss2.asm \ ++ simd/jcqnts2i.asm simd/jfss2fst.asm simd/jfss2int.asm \ ++ simd/jiss2red.asm simd/jiss2int.asm simd/jiss2fst.asm \ ++ simd/jcqnts2f.asm simd/jiss2flt.asm + +endif + +.asm.lo: + $(LIBTOOL) --mode=compile --tag NASM ./nasm_lt.sh $(NASM) $(NAFLAGS) $< -o $@ + +simd/jsimdcfg.inc: simd/jsimdcfg.inc.h jpeglib.h jconfig.h jmorecfg.h + $(CPP) $< | grep ^[\;%] | sed 's%_cpp_protection_%%' > $@ + diff --cc jsimd.c index 6c60b5b,0000000..3248911 mode 100644,000000..100644 --- a/jsimd.c +++ b/jsimd.c @@@ -1,741 -1,0 +1,850 @@@ +/* + * jsimd.c + * + * Copyright 2009 Pierre Ossman for Cendio AB + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + * This file contains the interface between the "normal" portions + * of the library and the SIMD implementations. + */ + +#define JPEG_INTERNALS +#include "jinclude.h" +#include "jpeglib.h" +#include "jsimd.h" +#include "jdct.h" +#include "jsimddct.h" +#include "simd/jsimd.h" + +/* + * In the PIC cases, we have no guarantee that constants will keep + * their alignment. This macro allows us to verify it at runtime. + */ +#ifdef WITH_SIMD +#define IS_ALIGNED(ptr, order) (((unsigned)ptr & ((1 << order) - 1)) == 0) +#else +#define IS_ALIGNED(ptr, order) (0) +#endif + +#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */ + +static unsigned int simd_support = ~0; + +/* + * Check what SIMD accelerations are supported. 
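+ *
+ * simd_support is initialized to ~0, which init_simd() treats as "not
+ * yet probed": the first call replaces it with the feature bits
+ * reported by jpeg_simd_cpu_support() (or JSIMD_NONE when the SIMD
+ * extensions are compiled out), and every later call returns
+ * immediately with the cached value.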
+ * + * FIXME: This code is racy under a multi-threaded environment. + */ +LOCAL(void) +init_simd (void) +{ + if (simd_support != ~0) + return; + +#ifdef WITH_SIMD + simd_support = jpeg_simd_cpu_support(); +#else + simd_support = JSIMD_NONE; +#endif +} + +GLOBAL(int) +jsimd_can_rgb_ycc (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_ycc_rgb (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4)) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_rgb_ycc_convert (j_compress_ptr cinfo, + JSAMPARRAY input_buf, JSAMPIMAGE output_buf, + JDIMENSION output_row, int num_rows) +{ +#ifdef WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_rgb_ycc_convert_sse2)) ++ jsimd_rgb_ycc_convert_sse2(cinfo->image_width, input_buf, ++ output_buf, output_row, num_rows); ++ else if (simd_support & JSIMD_MMX) + jsimd_rgb_ycc_convert_mmx(cinfo->image_width, input_buf, + output_buf, output_row, num_rows); +#endif +} + +GLOBAL(void) +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, JDIMENSION input_row, + JSAMPARRAY output_buf, int num_rows) +{ +#ifdef WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_ycc_rgb_convert_sse2)) ++ jsimd_ycc_rgb_convert_sse2(cinfo->output_width, input_buf, ++ input_row, output_buf, num_rows); ++ else if (simd_support & JSIMD_MMX) + jsimd_ycc_rgb_convert_mmx(cinfo->output_width, input_buf, + input_row, output_buf, num_rows); +#endif +} + +GLOBAL(int) +jsimd_can_h2v2_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + ++ if (simd_support & JSIMD_SSE2) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_downsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + ++ if (simd_support & JSIMD_SSE2) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +#ifdef WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if (simd_support & JSIMD_SSE2) ++ jsimd_h2v2_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor, ++ compptr->v_samp_factor, compptr->width_in_blocks, ++ input_data, output_data); ++ else if (simd_support & JSIMD_MMX) + jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor, + compptr->v_samp_factor, compptr->width_in_blocks, + input_data, output_data); +#endif +} + +GLOBAL(void) +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr, + JSAMPARRAY input_data, JSAMPARRAY output_data) +{ +#ifdef WITH_SIMD - if 
(simd_support & JSIMD_MMX)
++ if (simd_support & JSIMD_SSE2)
++ jsimd_h2v1_downsample_sse2(cinfo->image_width, cinfo->max_v_samp_factor,
++ compptr->v_samp_factor, compptr->width_in_blocks,
++ input_data, output_data);
++ else if (simd_support & JSIMD_MMX)
+ jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
+ compptr->v_samp_factor, compptr->width_in_blocks,
+ input_data, output_data);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_upsample (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
++ if (simd_support & JSIMD_SSE2)
++ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_upsample (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
++ if (simd_support & JSIMD_SSE2)
++ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+#ifdef WITH_SIMD
- if (simd_support & JSIMD_MMX)
++ if (simd_support & JSIMD_SSE2)
++ jsimd_h2v2_upsample_sse2(cinfo->max_v_samp_factor,
++ cinfo->output_width, input_data, output_data_ptr);
++ else if (simd_support & JSIMD_MMX)
+ jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
+ cinfo->output_width, input_data, output_data_ptr);
+#endif
+}
+
+GLOBAL(void)
+jsimd_h2v1_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+#ifdef WITH_SIMD
- if (simd_support & JSIMD_MMX)
++ if (simd_support & JSIMD_SSE2)
++ jsimd_h2v1_upsample_sse2(cinfo->max_v_samp_factor,
++ cinfo->output_width, input_data, output_data_ptr);
++ else if (simd_support & JSIMD_MMX)
+ jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
+ cinfo->output_width, input_data, output_data_ptr);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_h2v2_fancy_upsample (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
++ if ((simd_support & JSIMD_SSE2) &&
++ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
++ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h2v1_fancy_upsample (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (BITS_IN_JSAMPLE != 8)
+ return 0;
+ if (sizeof(JDIMENSION) != 4)
+ return 0;
+
++ if ((simd_support & JSIMD_SSE2) &&
++ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
++ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+#ifdef WITH_SIMD
- if (simd_support & JSIMD_MMX)
++ if ((simd_support & JSIMD_SSE2) &&
++ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2))
++ jsimd_h2v2_fancy_upsample_sse2(cinfo->max_v_samp_factor,
++ compptr->downsampled_width, input_data, output_data_ptr);
++ else if (simd_support & JSIMD_MMX)
+ jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
+ compptr->downsampled_width, input_data, output_data_ptr);
+#endif
+}
+
+GLOBAL(void)
+jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
+ jpeg_component_info * compptr,
+ JSAMPARRAY input_data,
+ JSAMPARRAY * output_data_ptr)
+{
+#ifdef WITH_SIMD
- if 
(simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_fancy_upsample_sse2)) ++ jsimd_h2v1_fancy_upsample_sse2(cinfo->max_v_samp_factor, ++ compptr->downsampled_width, input_data, output_data_ptr); ++ else if (simd_support & JSIMD_MMX) + jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor, + compptr->downsampled_width, input_data, output_data_ptr); +#endif +} + +GLOBAL(int) +jsimd_can_h2v2_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_h2v1_merged_upsample (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +#ifdef WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) ++ jsimd_h2v2_merged_upsample_sse2(cinfo->output_width, input_buf, ++ in_row_group_ctr, output_buf); ++ else if (simd_support & JSIMD_MMX) + jsimd_h2v2_merged_upsample_mmx(cinfo->output_width, input_buf, + in_row_group_ctr, output_buf); +#endif +} + +GLOBAL(void) +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo, + JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, + JSAMPARRAY output_buf) +{ +#ifdef WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && ++ IS_ALIGNED_SSE(jconst_merged_upsample_sse2)) ++ jsimd_h2v1_merged_upsample_sse2(cinfo->output_width, input_buf, ++ in_row_group_ctr, output_buf); ++ else if (simd_support & JSIMD_MMX) + jsimd_h2v1_merged_upsample_mmx(cinfo->output_width, input_buf, + in_row_group_ctr, output_buf); +#endif +} + +GLOBAL(int) +jsimd_can_convsamp (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(DCTELEM) != 2) + return 0; + ++ if (simd_support & JSIMD_SSE2) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_convsamp_float (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + ++ if (simd_support & JSIMD_SSE2) ++ return 1; + if (simd_support & JSIMD_SSE) + return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col, + DCTELEM * workspace) +{ +#ifdef WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if (simd_support & JSIMD_SSE2) ++ jsimd_convsamp_sse2(sample_data, start_col, workspace); ++ else if (simd_support & JSIMD_MMX) + jsimd_convsamp_mmx(sample_data, start_col, workspace); +#endif +} + +GLOBAL(void) +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col, + FAST_FLOAT * workspace) +{ +#ifdef WITH_SIMD - if (simd_support & JSIMD_SSE) ++ if 
(simd_support & JSIMD_SSE2)
++ jsimd_convsamp_float_sse2(sample_data, start_col, workspace);
++ else if (simd_support & JSIMD_SSE)
+ jsimd_convsamp_float_sse(sample_data, start_col, workspace);
+ else if (simd_support & JSIMD_3DNOW)
+ jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_fdct_islow (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
++ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_ifast (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
++ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_fdct_float (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_fdct_islow (DCTELEM * data)
+{
+#ifdef WITH_SIMD
- if (simd_support & JSIMD_MMX)
++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_islow_sse2))
++ jsimd_fdct_islow_sse2(data);
++ else if (simd_support & JSIMD_MMX)
+ jsimd_fdct_islow_mmx(data);
+#endif
+}
+
+GLOBAL(void)
+jsimd_fdct_ifast (DCTELEM * data)
+{
+#ifdef WITH_SIMD
- if (simd_support & JSIMD_MMX)
++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_fdct_ifast_sse2))
++ jsimd_fdct_ifast_sse2(data);
++ else if (simd_support & JSIMD_MMX)
+ jsimd_fdct_ifast_mmx(data);
+#endif
+}
+
+GLOBAL(void)
+jsimd_fdct_float (FAST_FLOAT * data)
+{
+#ifdef WITH_SIMD
+ if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
+ jsimd_fdct_float_sse(data);
+ else if (simd_support & JSIMD_3DNOW)
+ jsimd_fdct_float_3dnow(data);
+#endif
+}
+
+GLOBAL(int)
+jsimd_can_quantize (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(DCTELEM) != 2)
+ return 0;
+
++ if (simd_support & JSIMD_SSE2)
++ return 1;
+ if (simd_support & JSIMD_MMX)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(int)
+jsimd_can_quantize_float (void)
+{
+ init_simd();
+
+ /* The code is optimised for these values only */
+ if (DCTSIZE != 8)
+ return 0;
+ if (sizeof(JCOEF) != 2)
+ return 0;
+ if (sizeof(FAST_FLOAT) != 4)
+ return 0;
+
++ if (simd_support & JSIMD_SSE2)
++ return 1;
+ if (simd_support & JSIMD_SSE)
+ return 1;
+ if (simd_support & JSIMD_3DNOW)
+ return 1;
+
+ return 0;
+}
+
+GLOBAL(void)
+jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
+ DCTELEM * workspace)
+{
+#ifdef WITH_SIMD
- if (simd_support & JSIMD_MMX)
++ if (simd_support & JSIMD_SSE2)
++ jsimd_quantize_sse2(coef_block, divisors, workspace);
++ else if (simd_support & JSIMD_MMX)
+ jsimd_quantize_mmx(coef_block, divisors, workspace);
+#endif
+}
+
+GLOBAL(void)
+jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
+ FAST_FLOAT * workspace)
+{
+#ifdef WITH_SIMD
- if (simd_support & JSIMD_SSE)
++ if (simd_support & JSIMD_SSE2)
++ jsimd_quantize_float_sse2(coef_block, divisors, workspace);
++ else if (simd_support & JSIMD_SSE)
+ 
jsimd_quantize_float_sse(coef_block, divisors, workspace); + else if (simd_support & JSIMD_3DNOW) + jsimd_quantize_float_3dnow(coef_block, divisors, workspace); +#endif +} + +GLOBAL(int) +jsimd_can_idct_2x2 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_4x4 (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +#if WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) ++ jsimd_idct_2x2_sse2(compptr->dct_table, coef_block, output_buf, output_col); ++ else if (simd_support & JSIMD_MMX) + jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col); +#endif +} + +GLOBAL(void) +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +#if WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_red_sse2)) ++ jsimd_idct_4x4_sse2(compptr->dct_table, coef_block, output_buf, output_col); ++ else if (simd_support & JSIMD_MMX) + jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col); +#endif +} + +GLOBAL(int) +jsimd_can_idct_islow (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(ISLOW_MULT_TYPE) != 2) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_ifast (void) +{ + init_simd(); + + /* The code is optimised for these values only */ + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(IFAST_MULT_TYPE) != 2) + return 0; + if (IFAST_SCALE_BITS != 2) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) ++ return 1; + if (simd_support & JSIMD_MMX) + return 1; + + return 0; +} + +GLOBAL(int) +jsimd_can_idct_float (void) +{ + init_simd(); + + if (DCTSIZE != 8) + return 0; + if (sizeof(JCOEF) != 2) + return 0; + if (BITS_IN_JSAMPLE != 8) + return 0; + if (sizeof(JDIMENSION) != 4) + return 0; + if (sizeof(FAST_FLOAT) != 4) + return 0; + if (sizeof(FLOAT_MULT_TYPE) != 4) + return 0; + ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) ++ return 1; + if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + 
return 1; + if (simd_support & JSIMD_3DNOW) + return 1; + + return 0; +} + +GLOBAL(void) +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +#if WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_islow_sse2)) ++ jsimd_idct_islow_sse2(compptr->dct_table, coef_block, output_buf, output_col); ++ else if (simd_support & JSIMD_MMX) + jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col); +#endif +} + +GLOBAL(void) +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +#if WITH_SIMD - if (simd_support & JSIMD_MMX) ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_ifast_sse2)) ++ jsimd_idct_ifast_sse2(compptr->dct_table, coef_block, output_buf, output_col); ++ else if (simd_support & JSIMD_MMX) + jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col); +#endif +} + +GLOBAL(void) +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr, + JCOEFPTR coef_block, JSAMPARRAY output_buf, + JDIMENSION output_col) +{ +#if WITH_SIMD - if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) ++ if ((simd_support & JSIMD_SSE2) && IS_ALIGNED_SSE(jconst_idct_float_sse2)) ++ jsimd_idct_float_sse2(compptr->dct_table, coef_block, ++ output_buf, output_col); ++ else if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse)) + jsimd_idct_float_sse(compptr->dct_table, coef_block, + output_buf, output_col); + else if (simd_support & JSIMD_3DNOW) + jsimd_idct_float_3dnow(compptr->dct_table, coef_block, + output_buf, output_col); +#endif +} + diff --cc simd/jccolss2.asm index 0000000,1aabd89..99473b6 mode 000000,100644..100644 --- a/simd/jccolss2.asm +++ b/simd/jccolss2.asm @@@ -1,0 -1,541 +1,533 @@@ + ; + ; jccolss2.asm - colorspace conversion (SSE2) + ; + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). 
+ ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jcolsamp.inc" - -%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 -%ifdef JCCOLOR_RGBYCC_SSE2_SUPPORTED ++%include "simd/jsimdext.inc" ++%include "simd/jcolsamp.inc" + + ; -------------------------------------------------------------------------- + + %define SCALEBITS 16 + + F_0_081 equ 5329 ; FIX(0.08131) + F_0_114 equ 7471 ; FIX(0.11400) + F_0_168 equ 11059 ; FIX(0.16874) + F_0_250 equ 16384 ; FIX(0.25000) + F_0_299 equ 19595 ; FIX(0.29900) + F_0_331 equ 21709 ; FIX(0.33126) + F_0_418 equ 27439 ; FIX(0.41869) + F_0_587 equ 38470 ; FIX(0.58700) + F_0_337 equ (F_0_587 - F_0_250) ; FIX(0.58700) - FIX(0.25000) + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_rgb_ycc_convert_sse2) + + EXTN(jconst_rgb_ycc_convert_sse2): + + PW_F0299_F0337 times 4 dw F_0_299, F_0_337 + PW_F0114_F0250 times 4 dw F_0_114, F_0_250 + PW_MF016_MF033 times 4 dw -F_0_168,-F_0_331 + PW_MF008_MF041 times 4 dw -F_0_081,-F_0_418 + PD_ONEHALFM1_CJ times 4 dd (1 << (SCALEBITS-1)) - 1 + (CENTERJSAMPLE << SCALEBITS) + PD_ONEHALF times 4 dd (1 << (SCALEBITS-1)) + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Convert some rows of samples to the output colorspace. + ; + ; GLOBAL(void) -; jpeg_rgb_ycc_convert_sse2 (j_compress_ptr cinfo, -; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, -; JDIMENSION output_row, int num_rows); ++; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width, ++; JSAMPARRAY input_buf, JSAMPIMAGE output_buf, ++; JDIMENSION output_row, int num_rows); + ; + -%define cinfo(b) (b)+8 ; j_compress_ptr cinfo ++%define img_width(b) (b)+8 ; JDIMENSION img_width + %define input_buf(b) (b)+12 ; JSAMPARRAY input_buf + %define output_buf(b) (b)+16 ; JSAMPIMAGE output_buf + %define output_row(b) (b)+20 ; JDIMENSION output_row + %define num_rows(b) (b)+24 ; int num_rows + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 8 + %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 - global EXTN(jpeg_rgb_ycc_convert_sse2) ++ global EXTN(jsimd_rgb_ycc_convert_sse2) + -EXTN(jpeg_rgb_ycc_convert_sse2): ++EXTN(jsimd_rgb_ycc_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + - mov ecx, POINTER [cinfo(eax)] - mov ecx, JDIMENSION [jcstruct_image_width(ecx)] ; num_cols ++ mov ecx, JDIMENSION [img_width(eax)] + test ecx,ecx + jz near .return + + push ecx + + mov esi, JSAMPIMAGE [output_buf(eax)] + mov ecx, JDIMENSION [output_row(eax)] + mov edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY] + lea edi, [edi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov esi, JSAMPARRAY [input_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 
16,7 + .rowloop: + pushpic eax + push edx + push ebx + push edi + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr0 + mov ebx, JSAMPROW [ebx] ; outptr1 + mov edx, JSAMPROW [edx] ; outptr2 + movpic eax, POINTER [gotptr] ; load GOT address (eax) + + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + alignx 16,7 + + %if RGB_PIXELSIZE == 3 ; --------------- + + .column_ld1: + push eax + push edx + lea ecx,[ecx+ecx*2] ; imul ecx,RGB_PIXELSIZE + test cl, SIZEOF_BYTE + jz short .column_ld2 + sub ecx, byte SIZEOF_BYTE + movzx eax, BYTE [esi+ecx] + .column_ld2: + test cl, SIZEOF_WORD + jz short .column_ld4 + sub ecx, byte SIZEOF_WORD + movzx edx, WORD [esi+ecx] + shl eax, WORD_BIT + or eax,edx + .column_ld4: + movd xmmA,eax + pop edx + pop eax + test cl, SIZEOF_DWORD + jz short .column_ld8 + sub ecx, byte SIZEOF_DWORD - movd xmmF, _DWORD [esi+ecx] ++ movd xmmF, XMM_DWORD [esi+ecx] + pslldq xmmA, SIZEOF_DWORD + por xmmA,xmmF + .column_ld8: + test cl, SIZEOF_MMWORD + jz short .column_ld16 + sub ecx, byte SIZEOF_MMWORD - movq xmmB, _MMWORD [esi+ecx] ++ movq xmmB, XMM_MMWORD [esi+ecx] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmB + .column_ld16: + test cl, SIZEOF_XMMWORD + jz short .column_ld32 + movdqa xmmF,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + mov ecx, SIZEOF_XMMWORD + jmp short .rgb_ycc_cnv + .column_ld32: + test cl, 2*SIZEOF_XMMWORD + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmB,xmmA + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + + .columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD] + + .rgb_ycc_cnv: + ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + movdqa xmmG,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12) + psrldq xmmG,8 ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmF ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A) + pslldq xmmF,8 ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27) + + punpcklbw xmmG,xmmB ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D) + punpckhbw xmmF,xmmB ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F) + + movdqa xmmD,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09) + psrldq xmmD,8 ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmG ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D) + pslldq xmmG,8 ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B) + + punpcklbw xmmD,xmmF ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E) + punpckhbw xmmG,xmmF ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F) + + movdqa xmmE,xmmA + pslldq xmmA,8 ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C) + psrldq xmmE,8 ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --) + + punpckhbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + pslldq xmmD,8 ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D) + + punpcklbw xmmE,xmmG ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmG ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F) + + pxor xmmH,xmmH + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmH ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmH ; xmmC=(10 12 14 16 18 1A 1C 1E) 
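+ ; The pslldq/punpck ladder above has fully de-interleaved the 16 packed
+ ; RGB pixels; in the two-digit lane codes the first digit is the color
+ ; component and the second is the pixel index, so xmmA/xmmC now hold
+ ; components 0 and 1 of the even-numbered pixels as 16-bit words.  The
+ ; remaining unpacks against the zeroed xmmH widen the other registers
+ ; the same way, ready for the pmaddwd arithmetic further down.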
+ + movdqa xmmB,xmmE + punpcklbw xmmE,xmmH ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmB,xmmH ; xmmB=(01 03 05 07 09 0B 0D 0F) + + movdqa xmmF,xmmD + punpcklbw xmmD,xmmH ; xmmD=(11 13 15 17 19 1B 1D 1F) + punpckhbw xmmF,xmmH ; xmmF=(21 23 25 27 29 2B 2D 2F) + + %else ; RGB_PIXELSIZE == 4 ; ----------- + + .column_ld1: + test cl, SIZEOF_XMMWORD/16 + jz short .column_ld2 + sub ecx, byte SIZEOF_XMMWORD/16 + movd xmmA, _DWORD [esi+ecx*RGB_PIXELSIZE] + .column_ld2: + test cl, SIZEOF_XMMWORD/8 + jz short .column_ld4 + sub ecx, byte SIZEOF_XMMWORD/8 + movq xmmE, _MMWORD [esi+ecx*RGB_PIXELSIZE] + pslldq xmmA, SIZEOF_MMWORD + por xmmA,xmmE + .column_ld4: + test cl, SIZEOF_XMMWORD/4 + jz short .column_ld8 + sub ecx, byte SIZEOF_XMMWORD/4 + movdqa xmmE,xmmA + movdqu xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE] + .column_ld8: + test cl, SIZEOF_XMMWORD/2 + mov ecx, SIZEOF_XMMWORD + jz short .rgb_ycc_cnv + movdqa xmmF,xmmA + movdqa xmmH,xmmE + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + jmp short .rgb_ycc_cnv + alignx 16,7 + + .columnloop: + movdqu xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqu xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD] + movdqu xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD] + movdqu xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD] + + .rgb_ycc_cnv: + ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpcklbw xmmA,xmmE ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35) + punpckhbw xmmD,xmmE ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37) + + movdqa xmmC,xmmF + punpcklbw xmmF,xmmH ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D) + punpckhbw xmmC,xmmH ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F) + + movdqa xmmB,xmmA + punpcklwd xmmA,xmmF ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C) + punpckhwd xmmB,xmmF ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D) + + movdqa xmmG,xmmD + punpcklwd xmmD,xmmC ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E) + punpckhwd xmmG,xmmC ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F) + + movdqa xmmE,xmmA + punpcklbw xmmA,xmmD ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E) + punpckhbw xmmE,xmmD ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E) + + movdqa xmmH,xmmB + punpcklbw xmmB,xmmG ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F) + punpckhbw xmmH,xmmG ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F) + + pxor xmmF,xmmF + + movdqa xmmC,xmmA + punpcklbw xmmA,xmmF ; xmmA=(00 02 04 06 08 0A 0C 0E) + punpckhbw xmmC,xmmF ; xmmC=(10 12 14 16 18 1A 1C 1E) + + movdqa xmmD,xmmB + punpcklbw xmmB,xmmF ; xmmB=(01 03 05 07 09 0B 0D 0F) + punpckhbw xmmD,xmmF ; xmmD=(11 13 15 17 19 1B 1D 1F) + + movdqa xmmG,xmmE + punpcklbw xmmE,xmmF ; xmmE=(20 22 24 26 28 2A 2C 2E) + punpckhbw xmmG,xmmF ; xmmG=(30 32 34 36 38 3A 3C 3E) + + punpcklbw xmmF,xmmH + punpckhbw xmmH,xmmH + psrlw xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F) + psrlw xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F) + + %endif ; RGB_PIXELSIZE ; --------------- + + ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE + ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO + + ; (Original) + ; Y = 0.29900 * R + 0.58700 * G + 0.11400 * B + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + ; + ; 
(This implementation) + ; Y = 0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G + ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE + ; Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=RE + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=RO + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=BE + movdqa XMMWORD [wk(3)], xmm5 ; wk(3)=BO + + movdqa xmm6,xmm1 + punpcklwd xmm1,xmm3 + punpckhwd xmm6,xmm3 + movdqa xmm7,xmm1 + movdqa xmm4,xmm6 + pmaddwd xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337) + pmaddwd xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331) + + movdqa XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337) + movdqa XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337) + + pxor xmm1,xmm1 + pxor xmm6,xmm6 + punpcklwd xmm1,xmm5 ; xmm1=BOL + punpckhwd xmm6,xmm5 ; xmm6=BOH + psrld xmm1,1 ; xmm1=BOL*FIX(0.500) + psrld xmm6,1 ; xmm6=BOH*FIX(0.500) + + movdqa xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm1 + paddd xmm4,xmm6 + paddd xmm7,xmm5 + paddd xmm4,xmm5 + psrld xmm7,SCALEBITS ; xmm7=CbOL + psrld xmm4,SCALEBITS ; xmm4=CbOH + packssdw xmm7,xmm4 ; xmm7=CbO + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=BE + + movdqa xmm6,xmm0 + punpcklwd xmm0,xmm2 + punpckhwd xmm6,xmm2 + movdqa xmm5,xmm0 + movdqa xmm4,xmm6 + pmaddwd xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337) + pmaddwd xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337) + pmaddwd xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331) + pmaddwd xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331) + + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337) + movdqa XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337) + + pxor xmm0,xmm0 + pxor xmm6,xmm6 + punpcklwd xmm0,xmm1 ; xmm0=BEL + punpckhwd xmm6,xmm1 ; xmm6=BEH + psrld xmm0,1 ; xmm0=BEL*FIX(0.500) + psrld xmm6,1 ; xmm6=BEH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm5,xmm0 + paddd xmm4,xmm6 + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrld xmm5,SCALEBITS ; xmm5=CbEL + psrld xmm4,SCALEBITS ; xmm4=CbEH + packssdw xmm5,xmm4 ; xmm5=CbE + + psllw xmm7,BYTE_BIT + por xmm5,xmm7 ; xmm5=Cb + movdqa XMMWORD [ebx], xmm5 ; Save Cb + + movdqa xmm0, XMMWORD [wk(3)] ; xmm0=BO + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=BE + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=RO + + movdqa xmm4,xmm0 + punpcklwd xmm0,xmm3 + punpckhwd xmm4,xmm3 + movdqa xmm7,xmm0 + movdqa xmm5,xmm4 + pmaddwd xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250) + pmaddwd xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418) + + movdqa xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF] + + paddd xmm0, XMMWORD [wk(4)] + paddd xmm4, XMMWORD [wk(5)] + paddd xmm0,xmm3 + paddd xmm4,xmm3 + psrld xmm0,SCALEBITS ; xmm0=YOL + psrld xmm4,SCALEBITS ; xmm4=YOH + packssdw xmm0,xmm4 ; xmm0=YO + + pxor xmm3,xmm3 + pxor xmm4,xmm4 + punpcklwd xmm3,xmm1 ; xmm3=ROL + punpckhwd xmm4,xmm1 ; xmm4=ROH + psrld xmm3,1 ; xmm3=ROL*FIX(0.500) + psrld xmm4,1 ; xmm4=ROH*FIX(0.500) + + movdqa xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; 
xmm1=[PD_ONEHALFM1_CJ] + + paddd xmm7,xmm3 + paddd xmm5,xmm4 + paddd xmm7,xmm1 + paddd xmm5,xmm1 + psrld xmm7,SCALEBITS ; xmm7=CrOL + psrld xmm5,SCALEBITS ; xmm5=CrOH + packssdw xmm7,xmm5 ; xmm7=CrO + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=RE + + movdqa xmm4,xmm6 + punpcklwd xmm6,xmm2 + punpckhwd xmm4,xmm2 + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pmaddwd xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250) + pmaddwd xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250) + pmaddwd xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418) + pmaddwd xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418) + + movdqa xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF] + + paddd xmm6, XMMWORD [wk(6)] + paddd xmm4, XMMWORD [wk(7)] + paddd xmm6,xmm2 + paddd xmm4,xmm2 + psrld xmm6,SCALEBITS ; xmm6=YEL + psrld xmm4,SCALEBITS ; xmm4=YEH + packssdw xmm6,xmm4 ; xmm6=YE + + psllw xmm0,BYTE_BIT + por xmm6,xmm0 ; xmm6=Y + movdqa XMMWORD [edi], xmm6 ; Save Y + + pxor xmm2,xmm2 + pxor xmm4,xmm4 + punpcklwd xmm2,xmm3 ; xmm2=REL + punpckhwd xmm4,xmm3 ; xmm4=REH + psrld xmm2,1 ; xmm2=REL*FIX(0.500) + psrld xmm4,1 ; xmm4=REH*FIX(0.500) + + movdqa xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ] + + paddd xmm1,xmm2 + paddd xmm5,xmm4 + paddd xmm1,xmm0 + paddd xmm5,xmm0 + psrld xmm1,SCALEBITS ; xmm1=CrEL + psrld xmm5,SCALEBITS ; xmm5=CrEH + packssdw xmm1,xmm5 ; xmm1=CrE + + psllw xmm7,BYTE_BIT + por xmm1,xmm7 ; xmm1=Cr + movdqa XMMWORD [edx], xmm1 ; Save Cr + + sub ecx, byte SIZEOF_XMMWORD + add esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr + add edi, byte SIZEOF_XMMWORD ; outptr0 + add ebx, byte SIZEOF_XMMWORD ; outptr1 + add edx, byte SIZEOF_XMMWORD ; outptr2 + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .column_ld1 + + pop ecx ; col + pop esi + pop edi + pop ebx + pop edx + poppic eax + + add esi, byte SIZEOF_JSAMPROW ; input_buf + add edi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + dec eax ; num_rows + jg near .rowloop + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%endif ; JCCOLOR_RGBYCC_SSE2_SUPPORTED -%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 diff --cc simd/jcolsamp.inc index 56b6bfd,0000000..79751b7 mode 100644,000000..100644 --- a/simd/jcolsamp.inc +++ b/simd/jcolsamp.inc @@@ -1,73 -1,0 +1,105 @@@ +; +; jcolsamp.inc - private declarations for color conversion & up/downsampling +; +; Copyright 2009 Pierre Ossman for Cendio AB +; +; Based on +; x86 SIMD extension for IJG JPEG library +; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; [TAB8]
+
+; --------------------------------------------------------------------------
+
+; pseudo-registers to make ordering of RGB configurable
+;
+%if RGB_RED == 0
+%define mmA mm0
+%define mmB mm1
++%define xmmA xmm0
++%define xmmB xmm1
+%elif RGB_GREEN == 0
+%define mmA mm2
+%define mmB mm3
++%define xmmA xmm2
++%define xmmB xmm3
+%elif RGB_BLUE == 0
+%define mmA mm4
+%define mmB mm5
++%define xmmA xmm4
++%define xmmB xmm5
+%else
+%define mmA mm6
+%define mmB mm7
++%define xmmA xmm6
++%define xmmB xmm7
+%endif
+
+%if RGB_RED == 1
+%define mmC mm0
+%define mmD mm1
++%define xmmC xmm0
++%define xmmD xmm1
+%elif RGB_GREEN == 1
+%define mmC mm2
+%define mmD mm3
++%define xmmC xmm2
++%define xmmD xmm3
+%elif RGB_BLUE == 1
+%define mmC mm4
+%define mmD mm5
++%define xmmC xmm4
++%define xmmD xmm5
+%else
+%define mmC mm6
+%define mmD mm7
++%define xmmC xmm6
++%define xmmD xmm7
+%endif
+
+%if RGB_RED == 2
+%define mmE mm0
+%define mmF mm1
++%define xmmE xmm0
++%define xmmF xmm1
+%elif RGB_GREEN == 2
+%define mmE mm2
+%define mmF mm3
++%define xmmE xmm2
++%define xmmF xmm3
+%elif RGB_BLUE == 2
+%define mmE mm4
+%define mmF mm5
++%define xmmE xmm4
++%define xmmF xmm5
+%else
+%define mmE mm6
+%define mmF mm7
++%define xmmE xmm6
++%define xmmF xmm7
+%endif
+
+%if RGB_RED == 3
+%define mmG mm0
+%define mmH mm1
++%define xmmG xmm0
++%define xmmH xmm1
+%elif RGB_GREEN == 3
+%define mmG mm2
+%define mmH mm3
++%define xmmG xmm2
++%define xmmH xmm3
+%elif RGB_BLUE == 3
+%define mmG mm4
+%define mmH mm5
++%define xmmG xmm4
++%define xmmH xmm5
+%else
+%define mmG mm6
+%define mmH mm7
++%define xmmG xmm6
++%define xmmH xmm7
+%endif
+
+; --------------------------------------------------------------------------
diff --cc simd/jcqnts2f.asm
index 0000000,faf663e..eef547a
mode 000000,100644..100644
--- a/simd/jcqnts2f.asm
+++ b/simd/jcqnts2f.asm
@@@ -1,0 -1,178 +1,168 @@@
+ ;
+ ; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
+ ;
++; Copyright 2009 Pierre Ossman for Cendio AB
++;
++; Based on
+ ; x86 SIMD extension for IJG JPEG library
+ ; Copyright (C) 1999-2006, MIYASAKA Masaru.
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc
+ ;
+ ; This file should be assembled with NASM (Netwide Assembler),
+ ; can *not* be assembled with Microsoft's MASM or any compatible
+ ; assembler (including Borland's Turbo Assembler).
+ ; NASM is available from http://nasm.sourceforge.net/ or
+ ; http://sourceforge.net/project/showfiles.php?group_id=6208
+ ;
-; Last Modified : January 18, 2005
-;
+ ; [TAB8]
+
-%include "jsimdext.inc"
-%include "jdct.inc"
-
-%ifdef DCT_FLOAT_SUPPORTED
-%ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
-
-; This module is specialized to the case DCTSIZE = 8.
-;
-%if DCTSIZE != 8
-%error "Sorry, this code only copes with 8x8 DCTs." 
-%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Load data into workspace, applying unsigned->signed conversion + ; + ; GLOBAL(void) -; jpeg_convsamp_flt_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, -; FAST_FLOAT * workspace); ++; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, ++; FAST_FLOAT * workspace); + ; + + %define sample_data ebp+8 ; JSAMPARRAY sample_data + %define start_col ebp+12 ; JDIMENSION start_col + %define workspace ebp+16 ; FAST_FLOAT * workspace + + align 16 - global EXTN(jpeg_convsamp_flt_sse2) ++ global EXTN(jsimd_convsamp_float_sse2) + -EXTN(jpeg_convsamp_flt_sse2): ++EXTN(jsimd_convsamp_float_sse2): + push ebp + mov ebp,esp + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + pcmpeqw xmm7,xmm7 + psllw xmm7,7 + packsswb xmm7,xmm7 ; xmm7 = PB_CENTERJSAMPLE (0x808080..) + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/2 + alignx 16,7 + .convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + - movq xmm0, _MMWORD [ebx+eax*SIZEOF_JSAMPLE] - movq xmm1, _MMWORD [edx+eax*SIZEOF_JSAMPLE] ++ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ++ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] + + psubb xmm0,xmm7 ; xmm0=(01234567) + psubb xmm1,xmm7 ; xmm1=(89ABCDEF) + + punpcklbw xmm0,xmm0 ; xmm0=(*0*1*2*3*4*5*6*7) + punpcklbw xmm1,xmm1 ; xmm1=(*8*9*A*B*C*D*E*F) + + punpcklwd xmm2,xmm0 ; xmm2=(***0***1***2***3) + punpckhwd xmm0,xmm0 ; xmm0=(***4***5***6***7) + punpcklwd xmm3,xmm1 ; xmm3=(***8***9***A***B) + punpckhwd xmm1,xmm1 ; xmm1=(***C***D***E***F) + + psrad xmm2,(DWORD_BIT-BYTE_BIT) ; xmm2=(0123) + psrad xmm0,(DWORD_BIT-BYTE_BIT) ; xmm0=(4567) + cvtdq2ps xmm2,xmm2 ; xmm2=(0123) + cvtdq2ps xmm0,xmm0 ; xmm0=(4567) + psrad xmm3,(DWORD_BIT-BYTE_BIT) ; xmm3=(89AB) + psrad xmm1,(DWORD_BIT-BYTE_BIT) ; xmm1=(CDEF) + cvtdq2ps xmm3,xmm3 ; xmm3=(89AB) + cvtdq2ps xmm1,xmm1 ; xmm1=(CDEF) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + + add esi, byte 2*SIZEOF_JSAMPROW + add edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT + dec ecx + jnz short .convloop + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + + + ; -------------------------------------------------------------------------- + ; + ; Quantize/descale the coefficients, and store into coef_block + ; + ; GLOBAL(void) -; jpeg_quantize_flt_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors, ++; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors, + ; FAST_FLOAT * workspace); + ; + + %define coef_block ebp+8 ; JCOEFPTR coef_block + %define divisors ebp+12 ; FAST_FLOAT * divisors + %define workspace ebp+16 ; FAST_FLOAT * workspace + + align 16 - global EXTN(jpeg_quantize_flt_sse2) ++ global EXTN(jsimd_quantize_float_sse2) + -EXTN(jpeg_quantize_flt_sse2): ++EXTN(jsimd_quantize_float_sse2): + push ebp + mov ebp,esp + ; push ebx ; unused + ; push ecx ; unused + ; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + 
mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/16 + alignx 16,7 + .quantloop: + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)] + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)] + mulps xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)] + + cvtps2dq xmm0,xmm0 + cvtps2dq xmm1,xmm1 + cvtps2dq xmm2,xmm2 + cvtps2dq xmm3,xmm3 + + packssdw xmm0,xmm1 + packssdw xmm2,xmm3 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2 + + add esi, byte 16*SIZEOF_FAST_FLOAT + add edx, byte 16*SIZEOF_FAST_FLOAT + add edi, byte 16*SIZEOF_JCOEF + dec eax + jnz short .quantloop + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; unused + ; pop ebx ; unused + pop ebp + ret + -%endif ; JFDCT_FLT_SSE_SSE2_SUPPORTED -%endif ; DCT_FLOAT_SUPPORTED diff --cc simd/jcqnts2i.asm index 0000000,71bae2c..7414e41 mode 000000,100644..100644 --- a/simd/jcqnts2i.asm +++ b/simd/jcqnts2i.asm @@@ -1,0 -1,216 +1,197 @@@ + ; + ; jcqnts2i.asm - sample data conversion and quantization (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; -; Last Modified : January 27, 2005 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jdct.inc" - -%ifdef JFDCT_INT_SSE2_SUPPORTED - -; This module is specialized to the case DCTSIZE = 8. -; -%if DCTSIZE != 8 -%error "Sorry, this code only copes with 8x8 DCTs." 
-%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Load data into workspace, applying unsigned->signed conversion + ; + ; GLOBAL(void) -; jpeg_convsamp_int_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, -; DCTELEM * workspace); ++; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col, ++; DCTELEM * workspace); + ; + + %define sample_data ebp+8 ; JSAMPARRAY sample_data + %define start_col ebp+12 ; JDIMENSION start_col + %define workspace ebp+16 ; DCTELEM * workspace + + align 16 - global EXTN(jpeg_convsamp_int_sse2) ++ global EXTN(jsimd_convsamp_sse2) + -EXTN(jpeg_convsamp_int_sse2): ++EXTN(jsimd_convsamp_sse2): + push ebp + mov ebp,esp + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + pxor xmm6,xmm6 ; xmm6=(all 0's) + pcmpeqw xmm7,xmm7 + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + + mov esi, JSAMPARRAY [sample_data] ; (JSAMPROW *) + mov eax, JDIMENSION [start_col] + mov edi, POINTER [workspace] ; (DCTELEM *) + mov ecx, DCTSIZE/4 + alignx 16,7 + .convloop: + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *) + - movq xmm0, _MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) - movq xmm1, _MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) ++ movq xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm0=(01234567) ++ movq xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF) + + mov ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *) + mov edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *) + - movq xmm2, _MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) - movq xmm3, _MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) ++ movq xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN) ++ movq xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV) + + punpcklbw xmm0,xmm6 ; xmm0=(01234567) + punpcklbw xmm1,xmm6 ; xmm1=(89ABCDEF) + paddw xmm0,xmm7 + paddw xmm1,xmm7 + punpcklbw xmm2,xmm6 ; xmm2=(GHIJKLMN) + punpcklbw xmm3,xmm6 ; xmm3=(OPQRSTUV) + paddw xmm2,xmm7 + paddw xmm3,xmm7 + + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 4*SIZEOF_JSAMPROW + add edi, byte 4*DCTSIZE*SIZEOF_DCTELEM + dec ecx + jnz short .convloop + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + -%ifndef JFDCT_INT_QUANTIZE_WITH_DIVISION - + ; -------------------------------------------------------------------------- + ; + ; Quantize/descale the coefficients, and store into coef_block + ; + ; This implementation is based on an algorithm described in + ; "How to optimize for the Pentium family of microprocessors" + ; (http://www.agner.org/assem/). 
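+ ;
+ ; In effect, each 16-bit coefficient x is quantized without any
+ ; division, roughly as follows (a C-style sketch; RECIPROCAL,
+ ; CORRECTION and SCALE are the three precomputed tables in the
+ ; divisors block, defined below):
+ ;
+ ;   sign = x >> 15;                    /* 0 or -1 (all bits set) */
+ ;   x = (x ^ sign) - sign;             /* |x| */
+ ;   x = ((x + CORRECTION) * RECIPROCAL) >> 16;
+ ;   x = (x * SCALE) >> 16;
+ ;   x = (x ^ sign) - sign;             /* restore the sign */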
+ ; + ; GLOBAL(void) -; jpeg_quantize_int_sse2 (JCOEFPTR coef_block, DCTELEM * divisors, -; DCTELEM * workspace); ++; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors, ++; DCTELEM * workspace); + ; + + %define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM) + %define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM) + %define SCALE(m,n,b) XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM) + + %define coef_block ebp+8 ; JCOEFPTR coef_block + %define divisors ebp+12 ; DCTELEM * divisors + %define workspace ebp+16 ; DCTELEM * workspace + + align 16 - global EXTN(jpeg_quantize_int_sse2) ++ global EXTN(jsimd_quantize_sse2) + -EXTN(jpeg_quantize_int_sse2): ++EXTN(jsimd_quantize_sse2): + push ebp + mov ebp,esp + ; push ebx ; unused + ; push ecx ; unused + ; push edx ; need not be preserved + push esi + push edi + + mov esi, POINTER [workspace] + mov edx, POINTER [divisors] + mov edi, JCOEFPTR [coef_block] + mov eax, DCTSIZE2/32 + alignx 16,7 + .quantloop: + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)] + movdqa xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)] + movdqa xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)] + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + psraw xmm4,(WORD_BIT-1) + psraw xmm5,(WORD_BIT-1) + psraw xmm6,(WORD_BIT-1) + psraw xmm7,(WORD_BIT-1) + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 ; if (xmm0 < 0) xmm0 = -xmm0; + psubw xmm1,xmm5 ; if (xmm1 < 0) xmm1 = -xmm1; + psubw xmm2,xmm6 ; if (xmm2 < 0) xmm2 = -xmm2; + psubw xmm3,xmm7 ; if (xmm3 < 0) xmm3 = -xmm3; + + paddw xmm0, XMMWORD [CORRECTION(0,0,edx)] ; correction + roundfactor + paddw xmm1, XMMWORD [CORRECTION(1,0,edx)] + paddw xmm2, XMMWORD [CORRECTION(2,0,edx)] + paddw xmm3, XMMWORD [CORRECTION(3,0,edx)] - psllw xmm0,1 - psllw xmm1,1 - psllw xmm2,1 - psllw xmm3,1 + pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)] ; reciprocal + pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)] + pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)] + pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)] - psllw xmm0,1 - psllw xmm1,1 - psllw xmm2,1 - psllw xmm3,1 + pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)] ; scale + pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)] + pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)] + pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)] + + pxor xmm0,xmm4 + pxor xmm1,xmm5 + pxor xmm2,xmm6 + pxor xmm3,xmm7 + psubw xmm0,xmm4 + psubw xmm1,xmm5 + psubw xmm2,xmm6 + psubw xmm3,xmm7 + movdqa XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0 + movdqa XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2 + movdqa XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3 + + add esi, byte 32*SIZEOF_DCTELEM + add edx, byte 32*SIZEOF_DCTELEM + add edi, byte 32*SIZEOF_JCOEF + dec eax + jnz near .quantloop + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; unused + ; pop ebx ; unused + pop ebp + ret + -%endif ; !JFDCT_INT_QUANTIZE_WITH_DIVISION -%endif ; JFDCT_INT_SSE2_SUPPORTED diff --cc simd/jcsamss2.asm index 0000000,e187d63..ec2df9a mode 000000,100644..100644 --- a/simd/jcsamss2.asm +++ b/simd/jcsamss2.asm @@@ -1,0 -1,355 +1,348 @@@ + ; + ; jcsamss2.asm - downsampling (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. 
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; -; Last Modified : January 23, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jcolsamp.inc" - -%ifdef JCSAMPLE_SSE2_SUPPORTED ++%include "simd/jsimdext.inc" + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Downsample pixel values of a single component. + ; This version handles the common case of 2:1 horizontal and 1:1 vertical, + ; without smoothing. + ; + ; GLOBAL(void) -; jpeg_h2v1_downsample_sse2 (j_compress_ptr cinfo, -; jpeg_component_info * compptr, -; JSAMPARRAY input_data, JSAMPARRAY output_data); ++; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, ++; JDIMENSION v_samp_factor, JDIMENSION width_blocks, ++; JSAMPARRAY input_data, JSAMPARRAY output_data); + ; + -%define cinfo(b) (b)+8 ; j_compress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data(b) (b)+20 ; JSAMPARRAY output_data ++%define img_width(b) (b)+8 ; JDIMENSION image_width ++%define max_v_samp(b) (b)+12 ; int max_v_samp_factor ++%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor ++%define width_blks(b) (b)+20 ; JDIMENSION width_blocks ++%define input_data(b) (b)+24 ; JSAMPARRAY input_data ++%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 - global EXTN(jpeg_h2v1_downsample_sse2) ++ global EXTN(jsimd_h2v1_downsample_sse2) + -EXTN(jpeg_h2v1_downsample_sse2): ++EXTN(jsimd_h2v1_downsample_sse2): + push ebp + mov ebp,esp + ; push ebx ; unused + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + - mov ecx, POINTER [compptr(ebp)] - mov ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)] ++ mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + - mov edx, POINTER [cinfo(ebp)] - mov edx, JDIMENSION [jcstruct_image_width(edx)] ++ mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + - mov eax, POINTER [cinfo(ebp)] - mov eax, INT [jcstruct_max_v_samp_factor(eax)] ++ mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 + .expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + + .expand_end: + pop ecx ; output_cols + + ; -- h2v1_downsample + - mov eax, POINTER [compptr(ebp)] - mov eax, JDIMENSION [jcompinfo_v_samp_factor(eax)] ; rowctr ++ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00010000 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 + .rowloop: + push ecx + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, 
JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16,7 + + .columnloop_r8: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm1,xmm1 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 + + .columnloop: + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD] + + .downsample: + movdqa xmm2,xmm0 + movdqa xmm3,xmm1 + + pand xmm0,xmm6 + psrlw xmm2,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm3,BYTE_BIT + + paddw xmm0,xmm2 + paddw xmm1,xmm3 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + psrlw xmm0,1 + psrlw xmm1,1 + + packuswb xmm0,xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + test ecx,ecx + jnz short .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + ; pop ebx ; unused + pop ebp + ret + + ; -------------------------------------------------------------------------- + ; + ; Downsample pixel values of a single component. + ; This version handles the standard case of 2:1 horizontal and 2:1 vertical, + ; without smoothing. + ; + ; GLOBAL(void) -; jpeg_h2v2_downsample_sse2 (j_compress_ptr cinfo, -; jpeg_component_info * compptr, -; JSAMPARRAY input_data, JSAMPARRAY output_data); ++; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor, ++; JDIMENSION v_samp_factor, JDIMENSION width_blocks, ++; JSAMPARRAY input_data, JSAMPARRAY output_data); + ; + -%define cinfo(b) (b)+8 ; j_compress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data(b) (b)+20 ; JSAMPARRAY output_data ++%define img_width(b) (b)+8 ; JDIMENSION image_width ++%define max_v_samp(b) (b)+12 ; int max_v_samp_factor ++%define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor ++%define width_blks(b) (b)+20 ; JDIMENSION width_blocks ++%define input_data(b) (b)+24 ; JSAMPARRAY input_data ++%define output_data(b) (b)+28 ; JSAMPARRAY output_data + + align 16 - global EXTN(jpeg_h2v2_downsample_sse2) ++ global EXTN(jsimd_h2v2_downsample_sse2) + -EXTN(jpeg_h2v2_downsample_sse2): ++EXTN(jsimd_h2v2_downsample_sse2): + push ebp + mov ebp,esp + ; push ebx ; unused + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + - mov ecx, POINTER [compptr(ebp)] - mov ecx, JDIMENSION [jcompinfo_width_in_blocks(ecx)] ++ mov ecx, JDIMENSION [width_blks(ebp)] + shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols) + jz near .return + - mov edx, POINTER [cinfo(ebp)] - mov edx, JDIMENSION [jcstruct_image_width(edx)] ++ mov edx, JDIMENSION [img_width(ebp)] + + ; -- expand_right_edge + + push ecx + shl ecx,1 ; output_cols * 2 + sub ecx,edx + jle short .expand_end + - mov eax, POINTER [cinfo(ebp)] - mov eax, INT [jcstruct_max_v_samp_factor(eax)] ++ mov eax, INT [max_v_samp(ebp)] + test eax,eax + jle short .expand_end + + cld + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + alignx 16,7 + .expandloop: + push eax + push ecx + + mov edi, JSAMPROW [esi] + add edi,edx + mov al, JSAMPLE [edi-1] + + rep stosb + + pop ecx + pop eax + + add esi, byte SIZEOF_JSAMPROW + dec eax + jg short .expandloop + + .expand_end: + pop ecx ; output_cols + + ; 
-- h2v2_downsample + - mov eax, POINTER [compptr(ebp)] - mov eax, JDIMENSION [jcompinfo_v_samp_factor(eax)] ; rowctr ++ mov eax, JDIMENSION [v_samp(ebp)] ; rowctr + test eax,eax + jle near .return + + mov edx, 0x00020001 ; bias pattern + movd xmm7,edx + pcmpeqw xmm6,xmm6 + pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2} + psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, JSAMPARRAY [output_data(ebp)] ; output_data + alignx 16,7 + .rowloop: + push ecx + push edi + push esi + + mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1 + mov edi, JSAMPROW [edi] ; outptr + + cmp ecx, byte SIZEOF_XMMWORD + jae short .columnloop + alignx 16,7 + + .columnloop_r8: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov ecx, SIZEOF_XMMWORD + jmp short .downsample + alignx 16,7 + + .columnloop: + movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD] + + .downsample: + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + pand xmm0,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm1,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm0,xmm4 + paddw xmm1,xmm5 + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + pand xmm2,xmm6 + psrlw xmm4,BYTE_BIT + pand xmm3,xmm6 + psrlw xmm5,BYTE_BIT + paddw xmm2,xmm4 + paddw xmm3,xmm5 + + paddw xmm0,xmm1 + paddw xmm2,xmm3 + paddw xmm0,xmm7 + paddw xmm2,xmm7 + psrlw xmm0,2 + psrlw xmm2,2 + + packuswb xmm0,xmm2 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + + sub ecx, byte SIZEOF_XMMWORD ; outcol + add edx, byte 2*SIZEOF_XMMWORD ; inptr0 + add esi, byte 2*SIZEOF_XMMWORD ; inptr1 + add edi, byte 1*SIZEOF_XMMWORD ; outptr + cmp ecx, byte SIZEOF_XMMWORD + jae near .columnloop + test ecx,ecx + jnz near .columnloop_r8 + + pop esi + pop edi + pop ecx + + add esi, byte 2*SIZEOF_JSAMPROW ; input_data + add edi, byte 1*SIZEOF_JSAMPROW ; output_data + dec eax ; rowctr + jg near .rowloop + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + ; pop ebx ; unused + pop ebp + ret + -%endif ; JCSAMPLE_SSE2_SUPPORTED diff --cc simd/jdcolss2.asm index 0000000,fd6f04d..3fd591b mode 000000,100644..100644 --- a/simd/jdcolss2.asm +++ b/simd/jdcolss2.asm @@@ -1,0 -1,536 +1,531 @@@ + ; + ; jdcolss2.asm - colorspace conversion (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). 
+ ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jcolsamp.inc" - -%if RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 -%ifdef JDCOLOR_YCCRGB_SSE2_SUPPORTED ++%include "simd/jsimdext.inc" ++%include "simd/jcolsamp.inc" + + ; -------------------------------------------------------------------------- + + %define SCALEBITS 16 + + F_0_344 equ 22554 ; FIX(0.34414) + F_0_714 equ 46802 ; FIX(0.71414) + F_1_402 equ 91881 ; FIX(1.40200) + F_1_772 equ 116130 ; FIX(1.77200) + F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) + F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) + F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_ycc_rgb_convert_sse2) + + EXTN(jconst_ycc_rgb_convert_sse2): + + PW_F0402 times 8 dw F_0_402 + PW_MF0228 times 8 dw -F_0_228 + PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 + PW_ONE times 8 dw 1 + PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Convert some rows of samples to the output colorspace. + ; + ; GLOBAL(void) -; jpeg_ycc_rgb_convert_sse2 (j_decompress_ptr cinfo, -; JSAMPIMAGE input_buf, JDIMENSION input_row, -; JSAMPARRAY output_buf, int num_rows) ++; jsimd_ycc_rgb_convert_sse2 (JDIMENSION out_width, ++; JSAMPIMAGE input_buf, JDIMENSION input_row, ++; JSAMPARRAY output_buf, int num_rows) + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo ++%define out_width(b) (b)+8 ; JDIMENSION out_width + %define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf + %define input_row(b) (b)+16 ; JDIMENSION input_row + %define output_buf(b) (b)+20 ; JSAMPARRAY output_buf + %define num_rows(b) (b)+24 ; int num_rows + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 2 + %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 - global EXTN(jpeg_ycc_rgb_convert_sse2) ++ global EXTN(jsimd_ycc_rgb_convert_sse2) + -EXTN(jpeg_ycc_rgb_convert_sse2): ++EXTN(jsimd_ycc_rgb_convert_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + - mov ecx, POINTER [cinfo(eax)] - mov ecx, JDIMENSION [jdstruct_output_width(ecx)] ; num_cols ++ mov ecx, JDIMENSION [out_width(eax)] ; num_cols + test ecx,ecx + jz near .return + + push ecx + + mov edi, JSAMPIMAGE [input_buf(eax)] + mov ecx, JDIMENSION [input_row(eax)] + mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] + mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] + mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] + lea esi, [esi+ecx*SIZEOF_JSAMPROW] + lea ebx, [ebx+ecx*SIZEOF_JSAMPROW] + lea edx, [edx+ecx*SIZEOF_JSAMPROW] + + pop ecx + + mov edi, JSAMPARRAY [output_buf(eax)] + mov eax, INT [num_rows(eax)] + test eax,eax + jle near .return + alignx 16,7 + .rowloop: + push eax + push edi + push edx + push ebx + push esi + push ecx ; col + + mov esi, JSAMPROW [esi] ; inptr0 + mov ebx, JSAMPROW [ebx] ; inptr1 + 
mov edx, JSAMPROW [edx] ; inptr2 + mov edi, JSAMPROW [edi] ; outptr + movpic eax, POINTER [gotptr] ; load GOT address (eax) + alignx 16,7 + .columnloop: + + movdqa xmm5, XMMWORD [ebx] ; xmm5=Cb(0123456789ABCDEF) + movdqa xmm1, XMMWORD [edx] ; xmm1=Cr(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + pcmpeqw xmm7,xmm7 + psrlw xmm4,BYTE_BIT + psllw xmm7,7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..} + movdqa xmm0,xmm4 ; xmm0=xmm4={0xFF 0x00 0xFF 0x00 ..} + + pand xmm4,xmm5 ; xmm4=Cb(02468ACE)=CbE + psrlw xmm5,BYTE_BIT ; xmm5=Cb(13579BDF)=CbO + pand xmm0,xmm1 ; xmm0=Cr(02468ACE)=CrE + psrlw xmm1,BYTE_BIT ; xmm1=Cr(13579BDF)=CrO + + paddw xmm4,xmm7 + paddw xmm5,xmm7 + paddw xmm0,xmm7 + paddw xmm1,xmm7 + + ; (Original) + ; R = Y + 1.40200 * Cr + ; G = Y - 0.34414 * Cb - 0.71414 * Cr + ; B = Y + 1.77200 * Cb + ; + ; (This implementation) + ; R = Y + 0.40200 * Cr + Cr + ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr + ; B = Y - 0.22800 * Cb + Cb + Cb + + movdqa xmm2,xmm4 ; xmm2=CbE + movdqa xmm3,xmm5 ; xmm3=CbO + paddw xmm4,xmm4 ; xmm4=2*CbE + paddw xmm5,xmm5 ; xmm5=2*CbO + movdqa xmm6,xmm0 ; xmm6=CrE + movdqa xmm7,xmm1 ; xmm7=CrO + paddw xmm0,xmm0 ; xmm0=2*CrE + paddw xmm1,xmm1 ; xmm1=2*CrO + + pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbE * -FIX(0.22800)) + pmulhw xmm5,[GOTOFF(eax,PW_MF0228)] ; xmm5=(2*CbO * -FIX(0.22800)) + pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrE * FIX(0.40200)) + pmulhw xmm1,[GOTOFF(eax,PW_F0402)] ; xmm1=(2*CrO * FIX(0.40200)) + + paddw xmm4,[GOTOFF(eax,PW_ONE)] + paddw xmm5,[GOTOFF(eax,PW_ONE)] + psraw xmm4,1 ; xmm4=(CbE * -FIX(0.22800)) + psraw xmm5,1 ; xmm5=(CbO * -FIX(0.22800)) + paddw xmm0,[GOTOFF(eax,PW_ONE)] + paddw xmm1,[GOTOFF(eax,PW_ONE)] + psraw xmm0,1 ; xmm0=(CrE * FIX(0.40200)) + psraw xmm1,1 ; xmm1=(CrO * FIX(0.40200)) + + paddw xmm4,xmm2 + paddw xmm5,xmm3 + paddw xmm4,xmm2 ; xmm4=(CbE * FIX(1.77200))=(B-Y)E + paddw xmm5,xmm3 ; xmm5=(CbO * FIX(1.77200))=(B-Y)O + paddw xmm0,xmm6 ; xmm0=(CrE * FIX(1.40200))=(R-Y)E + paddw xmm1,xmm7 ; xmm1=(CrO * FIX(1.40200))=(R-Y)O + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=(B-Y)E + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(B-Y)O + + movdqa xmm4,xmm2 + movdqa xmm5,xmm3 + punpcklwd xmm2,xmm6 + punpckhwd xmm4,xmm6 + pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm4,[GOTOFF(eax,PW_MF0344_F0285)] + punpcklwd xmm3,xmm7 + punpckhwd xmm5,xmm7 + pmaddwd xmm3,[GOTOFF(eax,PW_MF0344_F0285)] + pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] + + paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm4,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm2,SCALEBITS + psrad xmm4,SCALEBITS + paddd xmm3,[GOTOFF(eax,PD_ONEHALF)] + paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] + psrad xmm3,SCALEBITS + psrad xmm5,SCALEBITS + + packssdw xmm2,xmm4 ; xmm2=CbE*-FIX(0.344)+CrE*FIX(0.285) + packssdw xmm3,xmm5 ; xmm3=CbO*-FIX(0.344)+CrO*FIX(0.285) + psubw xmm2,xmm6 ; xmm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E + psubw xmm3,xmm7 ; xmm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O + + movdqa xmm5, XMMWORD [esi] ; xmm5=Y(0123456789ABCDEF) + + pcmpeqw xmm4,xmm4 + psrlw xmm4,BYTE_BIT ; xmm4={0xFF 0x00 0xFF 0x00 ..} + pand xmm4,xmm5 ; xmm4=Y(02468ACE)=YE + psrlw xmm5,BYTE_BIT ; xmm5=Y(13579BDF)=YO + + paddw xmm0,xmm4 ; xmm0=((R-Y)E+YE)=RE=R(02468ACE) + paddw xmm1,xmm5 ; xmm1=((R-Y)O+YO)=RO=R(13579BDF) + packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) + packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) + + paddw xmm2,xmm4 ; xmm2=((G-Y)E+YE)=GE=G(02468ACE) + paddw xmm3,xmm5 ; xmm3=((G-Y)O+YO)=GO=G(13579BDF) + packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) + packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) + + paddw 
xmm4, XMMWORD [wk(0)] ; xmm4=(YE+(B-Y)E)=BE=B(02468ACE) + paddw xmm5, XMMWORD [wk(1)] ; xmm5=(YO+(B-Y)O)=BO=B(13579BDF) + packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) + packuswb xmm5,xmm5 ; xmm5=B(13579BDF********) + + %if RGB_PIXELSIZE == 3 ; --------------- + + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) + punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) + + movdqa xmmG,xmmA + movdqa xmmH,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) + punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) + + psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) + psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) + + movdqa xmmC,xmmD + movdqa xmmB,xmmD + punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) + punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) + + psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) + + movdqa xmmF,xmmE + punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) + punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) + + pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) + movdqa xmmB,xmmE + punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) + punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) + punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) + + pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) + movdqa xmmB,xmmF + punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) + punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) + punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) + + punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) + punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) + punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + jmp short .out0 + .out1: ; --(unaligned)----------------- + pcmpeqb xmmH,xmmH ; xmmH=(all 1's) + maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD + add edi, byte SIZEOF_XMMWORD ; outptr + maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF + add edi, byte SIZEOF_XMMWORD ; outptr + .out0: + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + + .column_st32: + pcmpeqb xmmH,xmmH ; xmmH=(all 1's) + lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE + cmp 
ecx, byte 2*SIZEOF_XMMWORD + jb short .column_st16 + maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmF + sub ecx, byte 2*SIZEOF_XMMWORD + jmp short .column_st15 + .column_st16: + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st15 + maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD + .column_st15: + mov eax,ecx + xor ecx, byte 0x0F + shl ecx, 2 + movd xmmB,ecx + psrlq xmmH,4 + pcmpeqb xmmE,xmmE + psrlq xmmH,xmmB + psrlq xmmE,xmmB + punpcklbw xmmE,xmmH + ; ---------------- + mov ecx,edi + and ecx, byte SIZEOF_XMMWORD-1 + jz short .adj0 + add eax,ecx + cmp eax, byte SIZEOF_XMMWORD + ja short .adj0 + and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary + shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx + movdqa xmmG,xmmA + movdqa xmmC,xmmE + pslldq xmmA, SIZEOF_XMMWORD/2 + pslldq xmmE, SIZEOF_XMMWORD/2 + movd xmmD,ecx + sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT + jb short .adj1 + movd xmmF,ecx + psllq xmmA,xmmF + psllq xmmE,xmmF + jmp short .adj0 + .adj1: neg ecx + movd xmmF,ecx + psrlq xmmA,xmmF + psrlq xmmE,xmmF + psllq xmmG,xmmD + psllq xmmC,xmmD + por xmmA,xmmG + por xmmE,xmmC + .adj0: ; ---------------- + maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + + %else ; RGB_PIXELSIZE == 4 ; ----------- + + %ifdef RGBX_FILLER_0XFF + pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + %else + pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) + pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) + %endif + ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) + ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) + ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) + ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) + + punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) + punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) + punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) + punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) + + movdqa xmmC,xmmA + punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) + punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) + movdqa xmmG,xmmB + punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) + punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) + + movdqa xmmD,xmmA + punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) + punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) + movdqa xmmH,xmmC + punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) + punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) + + cmp ecx, byte SIZEOF_XMMWORD + jb short .column_st32 + + test edi, SIZEOF_XMMWORD-1 + jnz short .out1 + ; --(aligned)------------------- + movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA + movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD + movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC + movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH + add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr + jmp short .out0 + .out1: ; --(unaligned)----------------- + pcmpeqb xmmE,xmmE ; xmmE=(all 1's) + maskmovdqu xmmA,xmmE ; movntdqu 
XMMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD + add edi, byte SIZEOF_XMMWORD ; outptr + maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC + add edi, byte SIZEOF_XMMWORD ; outptr + maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH + add edi, byte SIZEOF_XMMWORD ; outptr + .out0: + sub ecx, byte SIZEOF_XMMWORD + jz near .nextrow + + add esi, byte SIZEOF_XMMWORD ; inptr0 + add ebx, byte SIZEOF_XMMWORD ; inptr1 + add edx, byte SIZEOF_XMMWORD ; inptr2 + jmp near .columnloop + alignx 16,7 + + .column_st32: + pcmpeqb xmmE,xmmE ; xmmE=(all 1's) + cmp ecx, byte SIZEOF_XMMWORD/2 + jb short .column_st16 + maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmC + movdqa xmmD,xmmH + sub ecx, byte SIZEOF_XMMWORD/2 + .column_st16: + cmp ecx, byte SIZEOF_XMMWORD/4 + jb short .column_st15 + maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + add edi, byte SIZEOF_XMMWORD ; outptr + movdqa xmmA,xmmD + sub ecx, byte SIZEOF_XMMWORD/4 + .column_st15: + cmp ecx, byte SIZEOF_XMMWORD/16 + jb short .nextrow + mov eax,ecx + xor ecx, byte 0x03 + inc ecx + shl ecx, 4 + movd xmmF,ecx + psrlq xmmE,xmmF + punpcklbw xmmE,xmmE + ; ---------------- + mov ecx,edi + and ecx, byte SIZEOF_XMMWORD-1 + jz short .adj0 + lea eax, [ecx+eax*4] ; RGB_PIXELSIZE + cmp eax, byte SIZEOF_XMMWORD + ja short .adj0 + and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary + shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx + movdqa xmmB,xmmA + movdqa xmmG,xmmE + pslldq xmmA, SIZEOF_XMMWORD/2 + pslldq xmmE, SIZEOF_XMMWORD/2 + movd xmmC,ecx + sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT + jb short .adj1 + movd xmmH,ecx + psllq xmmA,xmmH + psllq xmmE,xmmH + jmp short .adj0 + .adj1: neg ecx + movd xmmH,ecx + psrlq xmmA,xmmH + psrlq xmmE,xmmH + psllq xmmB,xmmC + psllq xmmG,xmmC + por xmmA,xmmB + por xmmE,xmmG + .adj0: ; ---------------- + maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA + + %endif ; RGB_PIXELSIZE ; --------------- + + alignx 16,7 + + .nextrow: + pop ecx + pop esi + pop ebx + pop edx + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW + add ebx, byte SIZEOF_JSAMPROW + add edx, byte SIZEOF_JSAMPROW + add edi, byte SIZEOF_JSAMPROW ; output_buf + dec eax ; num_rows + jg near .rowloop + + sfence ; flush the write buffer + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%endif ; JDCOLOR_YCCRGB_SSE2_SUPPORTED -%endif ; RGB_PIXELSIZE == 3 || RGB_PIXELSIZE == 4 diff --cc simd/jdmerss2.asm index 0000000,0000000..c0804ec new file mode 100644 --- /dev/null +++ b/simd/jdmerss2.asm @@@ -1,0 -1,0 +1,590 @@@ ++; ++; jdmerss2.asm - merged upsampling/color conversion (SSE2) ++; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on ++; x86 SIMD extension for IJG JPEG library ++; Copyright (C) 1999-2006, MIYASAKA Masaru. ++; For conditions of distribution and use, see copyright notice in jsimdext.inc ++; ++; This file should be assembled with NASM (Netwide Assembler), ++; can *not* be assembled with Microsoft's MASM or any compatible ++; assembler (including Borland's Turbo Assembler). 
++; NASM is available from http://nasm.sourceforge.net/ or ++; http://sourceforge.net/project/showfiles.php?group_id=6208 ++; ++; [TAB8] ++ ++%include "simd/jsimdext.inc" ++%include "simd/jcolsamp.inc" ++ ++; -------------------------------------------------------------------------- ++ ++%define SCALEBITS 16 ++ ++F_0_344 equ 22554 ; FIX(0.34414) ++F_0_714 equ 46802 ; FIX(0.71414) ++F_1_402 equ 91881 ; FIX(1.40200) ++F_1_772 equ 116130 ; FIX(1.77200) ++F_0_402 equ (F_1_402 - 65536) ; FIX(1.40200) - FIX(1) ++F_0_285 equ ( 65536 - F_0_714) ; FIX(1) - FIX(0.71414) ++F_0_228 equ (131072 - F_1_772) ; FIX(2) - FIX(1.77200) ++ ++; -------------------------------------------------------------------------- ++ SECTION SEG_CONST ++ ++ alignz 16 ++ global EXTN(jconst_merged_upsample_sse2) ++ ++EXTN(jconst_merged_upsample_sse2): ++ ++PW_F0402 times 8 dw F_0_402 ++PW_MF0228 times 8 dw -F_0_228 ++PW_MF0344_F0285 times 4 dw -F_0_344, F_0_285 ++PW_ONE times 8 dw 1 ++PD_ONEHALF times 4 dd 1 << (SCALEBITS-1) ++ ++ alignz 16 ++ ++; -------------------------------------------------------------------------- ++ SECTION SEG_TEXT ++ BITS 32 ++; ++; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical. ++; ++; GLOBAL(void) ++; jsimd_h2v1_merged_upsample_sse2 (JDIMENSION output_width, ++; JSAMPIMAGE input_buf, ++; JDIMENSION in_row_group_ctr, ++; JSAMPARRAY output_buf); ++; ++ ++%define output_width(b) (b)+8 ; JDIMENSION output_width ++%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf ++%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr ++%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf ++ ++%define original_ebp ebp+0 ++%define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] ++%define WK_NUM 3 ++%define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr ++ ++ align 16 ++ global EXTN(jsimd_h2v1_merged_upsample_sse2) ++ ++EXTN(jsimd_h2v1_merged_upsample_sse2): ++ push ebp ++ mov eax,esp ; eax = original ebp ++ sub esp, byte 4 ++ and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits ++ mov [esp],eax ++ mov ebp,esp ; ebp = aligned ebp ++ lea esp, [wk(0)] ++ pushpic eax ; make a room for GOT address ++ push ebx ++; push ecx ; need not be preserved ++; push edx ; need not be preserved ++ push esi ++ push edi ++ ++ get_GOT ebx ; get GOT address ++ movpic POINTER [gotptr], ebx ; save GOT address ++ ++ mov ecx, JDIMENSION [output_width(eax)] ; col ++ test ecx,ecx ++ jz near .return ++ ++ push ecx ++ ++ mov edi, JSAMPIMAGE [input_buf(eax)] ++ mov ecx, JDIMENSION [in_row_group_ctr(eax)] ++ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ++ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ++ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ++ mov edi, JSAMPARRAY [output_buf(eax)] ++ mov esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW] ; inptr0 ++ mov ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW] ; inptr1 ++ mov edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW] ; inptr2 ++ mov edi, JSAMPROW [edi] ; outptr ++ ++ pop ecx ; col ++ ++ alignx 16,7 ++.columnloop: ++ movpic eax, POINTER [gotptr] ; load GOT address (eax) ++ ++ movdqa xmm6, XMMWORD [ebx] ; xmm6=Cb(0123456789ABCDEF) ++ movdqa xmm7, XMMWORD [edx] ; xmm7=Cr(0123456789ABCDEF) ++ ++ pxor xmm1,xmm1 ; xmm1=(all 0's) ++ pcmpeqw xmm3,xmm3 ++ psllw xmm3,7 ; xmm3={0xFF80 0xFF80 0xFF80 0xFF80 ..} ++ ++ movdqa xmm4,xmm6 ++ punpckhbw xmm6,xmm1 ; xmm6=Cb(89ABCDEF)=CbH ++ punpcklbw xmm4,xmm1 ; xmm4=Cb(01234567)=CbL ++ movdqa xmm0,xmm7 ++ punpckhbw xmm7,xmm1 ; xmm7=Cr(89ABCDEF)=CrH ++ punpcklbw xmm0,xmm1 ; xmm0=Cr(01234567)=CrL ++ ++ paddw xmm6,xmm3 ++ paddw xmm4,xmm3 ++ paddw 
xmm7,xmm3 ++ paddw xmm0,xmm3 ++ ++ ; (Original) ++ ; R = Y + 1.40200 * Cr ++ ; G = Y - 0.34414 * Cb - 0.71414 * Cr ++ ; B = Y + 1.77200 * Cb ++ ; ++ ; (This implementation) ++ ; R = Y + 0.40200 * Cr + Cr ++ ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr ++ ; B = Y - 0.22800 * Cb + Cb + Cb ++ ++ movdqa xmm5,xmm6 ; xmm5=CbH ++ movdqa xmm2,xmm4 ; xmm2=CbL ++ paddw xmm6,xmm6 ; xmm6=2*CbH ++ paddw xmm4,xmm4 ; xmm4=2*CbL ++ movdqa xmm1,xmm7 ; xmm1=CrH ++ movdqa xmm3,xmm0 ; xmm3=CrL ++ paddw xmm7,xmm7 ; xmm7=2*CrH ++ paddw xmm0,xmm0 ; xmm0=2*CrL ++ ++ pmulhw xmm6,[GOTOFF(eax,PW_MF0228)] ; xmm6=(2*CbH * -FIX(0.22800)) ++ pmulhw xmm4,[GOTOFF(eax,PW_MF0228)] ; xmm4=(2*CbL * -FIX(0.22800)) ++ pmulhw xmm7,[GOTOFF(eax,PW_F0402)] ; xmm7=(2*CrH * FIX(0.40200)) ++ pmulhw xmm0,[GOTOFF(eax,PW_F0402)] ; xmm0=(2*CrL * FIX(0.40200)) ++ ++ paddw xmm6,[GOTOFF(eax,PW_ONE)] ++ paddw xmm4,[GOTOFF(eax,PW_ONE)] ++ psraw xmm6,1 ; xmm6=(CbH * -FIX(0.22800)) ++ psraw xmm4,1 ; xmm4=(CbL * -FIX(0.22800)) ++ paddw xmm7,[GOTOFF(eax,PW_ONE)] ++ paddw xmm0,[GOTOFF(eax,PW_ONE)] ++ psraw xmm7,1 ; xmm7=(CrH * FIX(0.40200)) ++ psraw xmm0,1 ; xmm0=(CrL * FIX(0.40200)) ++ ++ paddw xmm6,xmm5 ++ paddw xmm4,xmm2 ++ paddw xmm6,xmm5 ; xmm6=(CbH * FIX(1.77200))=(B-Y)H ++ paddw xmm4,xmm2 ; xmm4=(CbL * FIX(1.77200))=(B-Y)L ++ paddw xmm7,xmm1 ; xmm7=(CrH * FIX(1.40200))=(R-Y)H ++ paddw xmm0,xmm3 ; xmm0=(CrL * FIX(1.40200))=(R-Y)L ++ ++ movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=(B-Y)H ++ movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(R-Y)H ++ ++ movdqa xmm6,xmm5 ++ movdqa xmm7,xmm2 ++ punpcklwd xmm5,xmm1 ++ punpckhwd xmm6,xmm1 ++ pmaddwd xmm5,[GOTOFF(eax,PW_MF0344_F0285)] ++ pmaddwd xmm6,[GOTOFF(eax,PW_MF0344_F0285)] ++ punpcklwd xmm2,xmm3 ++ punpckhwd xmm7,xmm3 ++ pmaddwd xmm2,[GOTOFF(eax,PW_MF0344_F0285)] ++ pmaddwd xmm7,[GOTOFF(eax,PW_MF0344_F0285)] ++ ++ paddd xmm5,[GOTOFF(eax,PD_ONEHALF)] ++ paddd xmm6,[GOTOFF(eax,PD_ONEHALF)] ++ psrad xmm5,SCALEBITS ++ psrad xmm6,SCALEBITS ++ paddd xmm2,[GOTOFF(eax,PD_ONEHALF)] ++ paddd xmm7,[GOTOFF(eax,PD_ONEHALF)] ++ psrad xmm2,SCALEBITS ++ psrad xmm7,SCALEBITS ++ ++ packssdw xmm5,xmm6 ; xmm5=CbH*-FIX(0.344)+CrH*FIX(0.285) ++ packssdw xmm2,xmm7 ; xmm2=CbL*-FIX(0.344)+CrL*FIX(0.285) ++ psubw xmm5,xmm1 ; xmm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H ++ psubw xmm2,xmm3 ; xmm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L ++ ++ movdqa XMMWORD [wk(2)], xmm5 ; wk(2)=(G-Y)H ++ ++ mov al,2 ; Yctr ++ jmp short .Yloop_1st ++ alignx 16,7 ++ ++.Yloop_2nd: ++ movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(R-Y)H ++ movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(G-Y)H ++ movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(B-Y)H ++ alignx 16,7 ++ ++.Yloop_1st: ++ movdqa xmm7, XMMWORD [esi] ; xmm7=Y(0123456789ABCDEF) ++ ++ pcmpeqw xmm6,xmm6 ++ psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..} ++ pand xmm6,xmm7 ; xmm6=Y(02468ACE)=YE ++ psrlw xmm7,BYTE_BIT ; xmm7=Y(13579BDF)=YO ++ ++ movdqa xmm1,xmm0 ; xmm1=xmm0=(R-Y)(L/H) ++ movdqa xmm3,xmm2 ; xmm3=xmm2=(G-Y)(L/H) ++ movdqa xmm5,xmm4 ; xmm5=xmm4=(B-Y)(L/H) ++ ++ paddw xmm0,xmm6 ; xmm0=((R-Y)+YE)=RE=R(02468ACE) ++ paddw xmm1,xmm7 ; xmm1=((R-Y)+YO)=RO=R(13579BDF) ++ packuswb xmm0,xmm0 ; xmm0=R(02468ACE********) ++ packuswb xmm1,xmm1 ; xmm1=R(13579BDF********) ++ ++ paddw xmm2,xmm6 ; xmm2=((G-Y)+YE)=GE=G(02468ACE) ++ paddw xmm3,xmm7 ; xmm3=((G-Y)+YO)=GO=G(13579BDF) ++ packuswb xmm2,xmm2 ; xmm2=G(02468ACE********) ++ packuswb xmm3,xmm3 ; xmm3=G(13579BDF********) ++ ++ paddw xmm4,xmm6 ; xmm4=((B-Y)+YE)=BE=B(02468ACE) ++ paddw xmm5,xmm7 ; xmm5=((B-Y)+YO)=BO=B(13579BDF) ++ packuswb xmm4,xmm4 ; xmm4=B(02468ACE********) ++ packuswb 
xmm5,xmm5 ; xmm5=B(13579BDF********) ++ ++%if RGB_PIXELSIZE == 3 ; --------------- ++ ++ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ++ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ++ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ++ ; xmmG=(** ** ** ** ** ** ** ** **), xmmH=(** ** ** ** ** ** ** ** **) ++ ++ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) ++ punpcklbw xmmE,xmmB ; xmmE=(20 01 22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F) ++ punpcklbw xmmD,xmmF ; xmmD=(11 21 13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F) ++ ++ movdqa xmmG,xmmA ++ movdqa xmmH,xmmA ++ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 01 02 12 22 03 04 14 24 05 06 16 26 07) ++ punpckhwd xmmG,xmmE ; xmmG=(08 18 28 09 0A 1A 2A 0B 0C 1C 2C 0D 0E 1E 2E 0F) ++ ++ psrldq xmmH,2 ; xmmH=(02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E -- --) ++ psrldq xmmE,2 ; xmmE=(22 03 24 05 26 07 28 09 2A 0B 2C 0D 2E 0F -- --) ++ ++ movdqa xmmC,xmmD ++ movdqa xmmB,xmmD ++ punpcklwd xmmD,xmmH ; xmmD=(11 21 02 12 13 23 04 14 15 25 06 16 17 27 08 18) ++ punpckhwd xmmC,xmmH ; xmmC=(19 29 0A 1A 1B 2B 0C 1C 1D 2D 0E 1E 1F 2F -- --) ++ ++ psrldq xmmB,2 ; xmmB=(13 23 15 25 17 27 19 29 1B 2B 1D 2D 1F 2F -- --) ++ ++ movdqa xmmF,xmmE ++ punpcklwd xmmE,xmmB ; xmmE=(22 03 13 23 24 05 15 25 26 07 17 27 28 09 19 29) ++ punpckhwd xmmF,xmmB ; xmmF=(2A 0B 1B 2B 2C 0D 1D 2D 2E 0F 1F 2F -- -- -- --) ++ ++ pshufd xmmH,xmmA,0x4E; xmmH=(04 14 24 05 06 16 26 07 00 10 20 01 02 12 22 03) ++ movdqa xmmB,xmmE ++ punpckldq xmmA,xmmD ; xmmA=(00 10 20 01 11 21 02 12 02 12 22 03 13 23 04 14) ++ punpckldq xmmE,xmmH ; xmmE=(22 03 13 23 04 14 24 05 24 05 15 25 06 16 26 07) ++ punpckhdq xmmD,xmmB ; xmmD=(15 25 06 16 26 07 17 27 17 27 08 18 28 09 19 29) ++ ++ pshufd xmmH,xmmG,0x4E; xmmH=(0C 1C 2C 0D 0E 1E 2E 0F 08 18 28 09 0A 1A 2A 0B) ++ movdqa xmmB,xmmF ++ punpckldq xmmG,xmmC ; xmmG=(08 18 28 09 19 29 0A 1A 0A 1A 2A 0B 1B 2B 0C 1C) ++ punpckldq xmmF,xmmH ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 2C 0D 1D 2D 0E 1E 2E 0F) ++ punpckhdq xmmC,xmmB ; xmmC=(1D 2D 0E 1E 2E 0F 1F 2F 1F 2F -- -- -- -- -- --) ++ ++ punpcklqdq xmmA,xmmE ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05) ++ punpcklqdq xmmD,xmmG ; xmmD=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A) ++ punpcklqdq xmmF,xmmC ; xmmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F) ++ ++ cmp ecx, byte SIZEOF_XMMWORD ++ jb short .column_st32 ++ ++ test edi, SIZEOF_XMMWORD-1 ++ jnz short .out1 ++ ; --(aligned)------------------- ++ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA ++ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD ++ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmF ++ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr ++ jmp short .out0 ++.out1: ; --(unaligned)----------------- ++ pcmpeqb xmmH,xmmH ; xmmH=(all 1's) ++ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ maskmovdqu xmmF,xmmH ; movntdqu XMMWORD [edi], xmmF ++ add edi, byte SIZEOF_XMMWORD ; outptr ++.out0: ++ sub ecx, byte SIZEOF_XMMWORD ++ jz near .endcolumn ++ ++ add esi, byte SIZEOF_XMMWORD ; inptr0 ++ dec al ; Yctr ++ jnz near .Yloop_2nd ++ ++ add ebx, byte SIZEOF_XMMWORD ; inptr1 ++ add edx, byte SIZEOF_XMMWORD ; inptr2 ++ jmp near .columnloop ++ alignx 16,7 ++ ++.column_st32: ++ pcmpeqb xmmH,xmmH ; xmmH=(all 1's) ++ lea ecx, [ecx+ecx*2] ; imul ecx, RGB_PIXELSIZE ++ cmp ecx, byte 2*SIZEOF_XMMWORD ++ jb short .column_st16 ++ 
maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ maskmovdqu xmmD,xmmH ; movntdqu XMMWORD [edi], xmmD ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ movdqa xmmA,xmmF ++ sub ecx, byte 2*SIZEOF_XMMWORD ++ jmp short .column_st15 ++.column_st16: ++ cmp ecx, byte SIZEOF_XMMWORD ++ jb short .column_st15 ++ maskmovdqu xmmA,xmmH ; movntdqu XMMWORD [edi], xmmA ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ movdqa xmmA,xmmD ++ sub ecx, byte SIZEOF_XMMWORD ++.column_st15: ++ mov eax,ecx ++ xor ecx, byte 0x0F ++ shl ecx, 2 ++ movd xmmB,ecx ++ psrlq xmmH,4 ++ pcmpeqb xmmE,xmmE ++ psrlq xmmH,xmmB ++ psrlq xmmE,xmmB ++ punpcklbw xmmE,xmmH ++ ; ---------------- ++ mov ecx,edi ++ and ecx, byte SIZEOF_XMMWORD-1 ++ jz short .adj0 ++ add eax,ecx ++ cmp eax, byte SIZEOF_XMMWORD ++ ja short .adj0 ++ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary ++ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx ++ movdqa xmmG,xmmA ++ movdqa xmmC,xmmE ++ pslldq xmmA, SIZEOF_XMMWORD/2 ++ pslldq xmmE, SIZEOF_XMMWORD/2 ++ movd xmmD,ecx ++ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT ++ jb short .adj1 ++ movd xmmF,ecx ++ psllq xmmA,xmmF ++ psllq xmmE,xmmF ++ jmp short .adj0 ++.adj1: neg ecx ++ movd xmmF,ecx ++ psrlq xmmA,xmmF ++ psrlq xmmE,xmmF ++ psllq xmmG,xmmD ++ psllq xmmC,xmmD ++ por xmmA,xmmG ++ por xmmE,xmmC ++.adj0: ; ---------------- ++ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA ++ ++%else ; RGB_PIXELSIZE == 4 ; ----------- ++ ++%ifdef RGBX_FILLER_0XFF ++ pcmpeqb xmm6,xmm6 ; xmm6=XE=X(02468ACE********) ++ pcmpeqb xmm7,xmm7 ; xmm7=XO=X(13579BDF********) ++%else ++ pxor xmm6,xmm6 ; xmm6=XE=X(02468ACE********) ++ pxor xmm7,xmm7 ; xmm7=XO=X(13579BDF********) ++%endif ++ ; xmmA=(00 02 04 06 08 0A 0C 0E **), xmmB=(01 03 05 07 09 0B 0D 0F **) ++ ; xmmC=(10 12 14 16 18 1A 1C 1E **), xmmD=(11 13 15 17 19 1B 1D 1F **) ++ ; xmmE=(20 22 24 26 28 2A 2C 2E **), xmmF=(21 23 25 27 29 2B 2D 2F **) ++ ; xmmG=(30 32 34 36 38 3A 3C 3E **), xmmH=(31 33 35 37 39 3B 3D 3F **) ++ ++ punpcklbw xmmA,xmmC ; xmmA=(00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E) ++ punpcklbw xmmE,xmmG ; xmmE=(20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E) ++ punpcklbw xmmB,xmmD ; xmmB=(01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F) ++ punpcklbw xmmF,xmmH ; xmmF=(21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F) ++ ++ movdqa xmmC,xmmA ++ punpcklwd xmmA,xmmE ; xmmA=(00 10 20 30 02 12 22 32 04 14 24 34 06 16 26 36) ++ punpckhwd xmmC,xmmE ; xmmC=(08 18 28 38 0A 1A 2A 3A 0C 1C 2C 3C 0E 1E 2E 3E) ++ movdqa xmmG,xmmB ++ punpcklwd xmmB,xmmF ; xmmB=(01 11 21 31 03 13 23 33 05 15 25 35 07 17 27 37) ++ punpckhwd xmmG,xmmF ; xmmG=(09 19 29 39 0B 1B 2B 3B 0D 1D 2D 3D 0F 1F 2F 3F) ++ ++ movdqa xmmD,xmmA ++ punpckldq xmmA,xmmB ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33) ++ punpckhdq xmmD,xmmB ; xmmD=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37) ++ movdqa xmmH,xmmC ++ punpckldq xmmC,xmmG ; xmmC=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B) ++ punpckhdq xmmH,xmmG ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F) ++ ++ cmp ecx, byte SIZEOF_XMMWORD ++ jb short .column_st32 ++ ++ test edi, SIZEOF_XMMWORD-1 ++ jnz short .out1 ++ ; --(aligned)------------------- ++ movntdq XMMWORD [edi+0*SIZEOF_XMMWORD], xmmA ++ movntdq XMMWORD [edi+1*SIZEOF_XMMWORD], xmmD ++ movntdq XMMWORD [edi+2*SIZEOF_XMMWORD], xmmC ++ movntdq XMMWORD [edi+3*SIZEOF_XMMWORD], xmmH ++ add edi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; outptr ++ jmp short .out0 ++.out1: ; --(unaligned)----------------- ++ pcmpeqb xmmE,xmmE ; xmmE=(all 1's) 
++ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ maskmovdqu xmmC,xmmE ; movntdqu XMMWORD [edi], xmmC ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ maskmovdqu xmmH,xmmE ; movntdqu XMMWORD [edi], xmmH ++ add edi, byte SIZEOF_XMMWORD ; outptr ++.out0: ++ sub ecx, byte SIZEOF_XMMWORD ++ jz near .endcolumn ++ ++ add esi, byte SIZEOF_XMMWORD ; inptr0 ++ dec al ; Yctr ++ jnz near .Yloop_2nd ++ ++ add ebx, byte SIZEOF_XMMWORD ; inptr1 ++ add edx, byte SIZEOF_XMMWORD ; inptr2 ++ jmp near .columnloop ++ alignx 16,7 ++ ++.column_st32: ++ pcmpeqb xmmE,xmmE ; xmmE=(all 1's) ++ cmp ecx, byte SIZEOF_XMMWORD/2 ++ jb short .column_st16 ++ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ maskmovdqu xmmD,xmmE ; movntdqu XMMWORD [edi], xmmD ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ movdqa xmmA,xmmC ++ movdqa xmmD,xmmH ++ sub ecx, byte SIZEOF_XMMWORD/2 ++.column_st16: ++ cmp ecx, byte SIZEOF_XMMWORD/4 ++ jb short .column_st15 ++ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA ++ add edi, byte SIZEOF_XMMWORD ; outptr ++ movdqa xmmA,xmmD ++ sub ecx, byte SIZEOF_XMMWORD/4 ++.column_st15: ++ cmp ecx, byte SIZEOF_XMMWORD/16 ++ jb short .endcolumn ++ mov eax,ecx ++ xor ecx, byte 0x03 ++ inc ecx ++ shl ecx, 4 ++ movd xmmF,ecx ++ psrlq xmmE,xmmF ++ punpcklbw xmmE,xmmE ++ ; ---------------- ++ mov ecx,edi ++ and ecx, byte SIZEOF_XMMWORD-1 ++ jz short .adj0 ++ lea eax, [ecx+eax*4] ; RGB_PIXELSIZE ++ cmp eax, byte SIZEOF_XMMWORD ++ ja short .adj0 ++ and edi, byte (-SIZEOF_XMMWORD) ; align to 16-byte boundary ++ shl ecx, 3 ; pslldq xmmA,ecx & pslldq xmmE,ecx ++ movdqa xmmB,xmmA ++ movdqa xmmG,xmmE ++ pslldq xmmA, SIZEOF_XMMWORD/2 ++ pslldq xmmE, SIZEOF_XMMWORD/2 ++ movd xmmC,ecx ++ sub ecx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT ++ jb short .adj1 ++ movd xmmH,ecx ++ psllq xmmA,xmmH ++ psllq xmmE,xmmH ++ jmp short .adj0 ++.adj1: neg ecx ++ movd xmmH,ecx ++ psrlq xmmA,xmmH ++ psrlq xmmE,xmmH ++ psllq xmmB,xmmC ++ psllq xmmG,xmmC ++ por xmmA,xmmB ++ por xmmE,xmmG ++.adj0: ; ---------------- ++ maskmovdqu xmmA,xmmE ; movntdqu XMMWORD [edi], xmmA ++ ++%endif ; RGB_PIXELSIZE ; --------------- ++ ++.endcolumn: ++ sfence ; flush the write buffer ++ ++.return: ++ pop edi ++ pop esi ++; pop edx ; need not be preserved ++; pop ecx ; need not be preserved ++ pop ebx ++ mov esp,ebp ; esp <- aligned ebp ++ pop esp ; esp <- original ebp ++ pop ebp ++ ret ++ ++; -------------------------------------------------------------------------- ++; ++; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical. 
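The h2v2 wrapper that follows does not duplicate any conversion code; it builds a temporary three-pointer input array on the stack and calls the h2v1 routine above twice, once per luma row of the row group, against the same chroma row. A rough C model of that strategy (a sketch only: the typedefs follow jmorecfg.h, and h2v1_merged_upsample is a hypothetical stand-in for jsimd_h2v1_merged_upsample_sse2):

    typedef unsigned char JSAMPLE;
    typedef JSAMPLE *JSAMPROW;        /* one image row */
    typedef JSAMPROW *JSAMPARRAY;     /* array of rows */
    typedef JSAMPARRAY *JSAMPIMAGE;   /* one row array per component */
    typedef unsigned int JDIMENSION;

    /* Stand-in for the SSE2 h2v1 routine above (hypothetical C name). */
    extern void h2v1_merged_upsample(JDIMENSION output_width,
                                     JSAMPIMAGE input_buf,
                                     JDIMENSION in_row_group_ctr,
                                     JSAMPARRAY output_buf);

    void h2v2_merged_upsample(JDIMENSION output_width, JSAMPIMAGE input_buf,
                              JDIMENSION in_row_group_ctr,
                              JSAMPARRAY output_buf)
    {
      JSAMPARRAY fake_input[3];

      /* Shift the Y plane by in_row_group_ctr rows, so that when the h2v1
         routine indexes it with in_row_group_ctr again it reads Y row
         2*in_row_group_ctr. */
      fake_input[0] = input_buf[0] + in_row_group_ctr;
      fake_input[1] = input_buf[1];   /* the Cb rows are shared by both calls */
      fake_input[2] = input_buf[2];   /* ...and so are the Cr rows */

      h2v1_merged_upsample(output_width, fake_input, in_row_group_ctr,
                           output_buf);

      fake_input[0]++;                /* now yields Y row 2*in_row_group_ctr + 1 */
      h2v1_merged_upsample(output_width, fake_input, in_row_group_ctr,
                           output_buf + 1);
    }

In the assembly below, the three pushes ahead of "mov ebx,esp" build fake_input, and the two stores through ebx between the calls advance the Y row pointer and the output row, exactly as the last four lines of the sketch do.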
++; ++; GLOBAL(void) ++; jsimd_h2v2_merged_upsample_sse2 (JDIMENSION output_width, ++; JSAMPIMAGE input_buf, ++; JDIMENSION in_row_group_ctr, ++; JSAMPARRAY output_buf); ++; ++ ++%define output_width(b) (b)+8 ; JDIMENSION output_width ++%define input_buf(b) (b)+12 ; JSAMPIMAGE input_buf ++%define in_row_group_ctr(b) (b)+16 ; JDIMENSION in_row_group_ctr ++%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf ++ ++ align 16 ++ global EXTN(jsimd_h2v2_merged_upsample_sse2) ++ ++EXTN(jsimd_h2v2_merged_upsample_sse2): ++ push ebp ++ mov ebp,esp ++ push ebx ++; push ecx ; need not be preserved ++; push edx ; need not be preserved ++ push esi ++ push edi ++ ++ mov eax, POINTER [output_width(ebp)] ++ ++ mov edi, JSAMPIMAGE [input_buf(ebp)] ++ mov ecx, JDIMENSION [in_row_group_ctr(ebp)] ++ mov esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY] ++ mov ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY] ++ mov edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY] ++ mov edi, JSAMPARRAY [output_buf(ebp)] ++ lea esi, [esi+ecx*SIZEOF_JSAMPROW] ++ ++ push edx ; inptr2 ++ push ebx ; inptr1 ++ push esi ; inptr00 ++ mov ebx,esp ++ ++ push edi ; output_buf (outptr0) ++ push ecx ; in_row_group_ctr ++ push ebx ; input_buf ++ push eax ; output_width ++ ++ call near EXTN(jsimd_h2v1_merged_upsample_sse2) ++ ++ add esi, byte SIZEOF_JSAMPROW ; inptr01 ++ add edi, byte SIZEOF_JSAMPROW ; outptr1 ++ mov POINTER [ebx+0*SIZEOF_POINTER], esi ++ mov POINTER [ebx-1*SIZEOF_POINTER], edi ++ ++ call near EXTN(jsimd_h2v1_merged_upsample_sse2) ++ ++ add esp, byte 7*SIZEOF_DWORD ++ ++ pop edi ++ pop esi ++; pop edx ; need not be preserved ++; pop ecx ; need not be preserved ++ pop ebx ++ pop ebp ++ ret ++ diff --cc simd/jdsamss2.asm index 0000000,46fcf51..bd967db mode 000000,100644..100644 --- a/simd/jdsamss2.asm +++ b/simd/jdsamss2.asm @@@ -1,0 -1,883 +1,726 @@@ + ; + ; jdsamss2.asm - upsampling (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jcolsamp.inc" - -%ifdef JDSAMPLE_FANCY_SSE2_SUPPORTED ++%include "simd/jsimdext.inc" + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fancy_upsample_sse2) + + EXTN(jconst_fancy_upsample_sse2): + + PW_ONE times 8 dw 1 + PW_TWO times 8 dw 2 + PW_THREE times 8 dw 3 + PW_SEVEN times 8 dw 7 + PW_EIGHT times 8 dw 8 + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical. + ; + ; The upsampling algorithm is linear interpolation between pixel centers, + ; also known as a "triangle filter". This is a good compromise between + ; speed and visual quality. The centers of the output pixels are 1/4 and 3/4 + ; of the way between input pixel centers. 
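Concretely, each input pixel produces two output pixels, each blended 3:1 with the nearest neighbor on that side; the +1 and +2 terms round the result, and edge pixels are simply replicated. A scalar sketch of one row (illustrative only, not the shipped C path; the SSE2 code below evaluates the same expression 16 pixels at a time with pmullw/paddw/psrlw):

    /* Scalar model of the h2v1 triangle filter: "in" holds dw samples,
       "out" receives 2*dw samples.  Sketch only. */
    static void h2v1_fancy_upsample_row(const unsigned char *in,
                                        unsigned char *out, long dw)
    {
      long i;
      for (i = 0; i < dw; i++) {
        int centre = in[i] * 3;
        int left   = in[i == 0      ? 0      : i - 1];  /* replicate left edge  */
        int right  = in[i == dw - 1 ? dw - 1 : i + 1];  /* replicate right edge */
        out[2 * i]     = (unsigned char) ((centre + left  + 1) >> 2);
        out[2 * i + 1] = (unsigned char) ((centre + right + 2) >> 2);
      }
    }

Edge replication makes out[0] == in[0] and out[2*dw-1] == in[dw-1], which matches the special-cased first and last columns of the plain C implementation in jdsample.c.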
+ ; + ; GLOBAL(void) -; jpeg_h2v1_fancy_upsample_sse2 (j_decompress_ptr cinfo, -; jpeg_component_info * compptr, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); ++; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor, ++; JDIMENSION downsampled_width, ++; JSAMPARRAY input_data, ++; JSAMPARRAY * output_data_ptr); + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr ++%define max_v_samp(b) (b)+8 ; int max_v_samp_factor ++%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width + %define input_data(b) (b)+16 ; JSAMPARRAY input_data + %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr + + align 16 - global EXTN(jpeg_h2v1_fancy_upsample_sse2) ++ global EXTN(jsimd_h2v1_fancy_upsample_sse2) + -EXTN(jpeg_h2v1_fancy_upsample_sse2): ++EXTN(jsimd_h2v1_fancy_upsample_sse2): + push ebp + mov ebp,esp + pushpic ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + - mov eax, POINTER [compptr(ebp)] - mov eax, JDIMENSION [jcompinfo_downsampled_width(eax)] ; colctr ++ mov eax, JDIMENSION [downsamp_width(ebp)] ; colctr + test eax,eax + jz near .return + - mov ecx, POINTER [cinfo(ebp)] - mov ecx, INT [jdstruct_max_v_samp_factor(ecx)] ; rowctr ++ mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 + .rowloop: + push eax ; colctr + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + .skip: + pxor xmm0,xmm0 ; xmm0=(all 0's) + pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-1) + pand xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD] + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 + + .columnloop_last: + pcmpeqb xmm6,xmm6 + pslldq xmm6,(SIZEOF_XMMWORD-1) + pand xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD] + jmp short .upsample + alignx 16,7 + + .columnloop: + movdqa xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD] + pslldq xmm6,(SIZEOF_XMMWORD-1) + + .upsample: + movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD] + movdqa xmm2,xmm1 + movdqa xmm3,xmm1 ; xmm1=( 0 1 2 ... 13 14 15) + pslldq xmm2,1 ; xmm2=(-- 0 1 ... 12 13 14) + psrldq xmm3,1 ; xmm3=( 1 2 3 ... 14 15 --) + + por xmm2,xmm7 ; xmm2=(-1 0 1 ... 12 13 14) + por xmm3,xmm6 ; xmm3=( 1 2 3 ... 14 15 16) + + movdqa xmm7,xmm1 + psrldq xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... 
-- -- --) + + movdqa xmm4,xmm1 + punpcklbw xmm1,xmm0 ; xmm1=( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm0 ; xmm4=( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm2 + punpcklbw xmm2,xmm0 ; xmm2=(-1 0 1 2 3 4 5 6) + punpckhbw xmm5,xmm0 ; xmm5=( 7 8 9 10 11 12 13 14) + movdqa xmm6,xmm3 + punpcklbw xmm3,xmm0 ; xmm3=( 1 2 3 4 5 6 7 8) + punpckhbw xmm6,xmm0 ; xmm6=( 9 10 11 12 13 14 15 16) + + pmullw xmm1,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm2,[GOTOFF(ebx,PW_ONE)] + paddw xmm5,[GOTOFF(ebx,PW_ONE)] + paddw xmm3,[GOTOFF(ebx,PW_TWO)] + paddw xmm6,[GOTOFF(ebx,PW_TWO)] + + paddw xmm2,xmm1 + paddw xmm5,xmm4 + psrlw xmm2,2 ; xmm2=OutLE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,2 ; xmm5=OutHE=(16 18 20 22 24 26 28 30) + paddw xmm3,xmm1 + paddw xmm6,xmm4 + psrlw xmm3,2 ; xmm3=OutLO=( 1 3 5 7 9 11 13 15) + psrlw xmm6,2 ; xmm6=OutHO=(17 19 21 23 25 27 29 31) + + psllw xmm3,BYTE_BIT + psllw xmm6,BYTE_BIT + por xmm2,xmm3 ; xmm2=OutL=( 0 1 2 ... 13 14 15) + por xmm5,xmm6 ; xmm5=OutH=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5 + + sub eax, byte SIZEOF_XMMWORD + add esi, byte 1*SIZEOF_XMMWORD ; inptr + add edi, byte 2*SIZEOF_XMMWORD ; outptr + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop eax + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg near .rowloop + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + poppic ebx + pop ebp + ret + + ; -------------------------------------------------------------------------- + ; + ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical. + ; Again a triangle filter; see comments for h2v1 case, above. 
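Here the filter is separable: a vertical 3:1 blend into 16-bit intermediates, then the same horizontal 3:1 blend as in the h2v1 case, with the combined divide by 16 folded into one shift (the biases 8 and 7 provide the rounding). A scalar sketch (illustrative only; "nearrow" is the input row an output row is derived from, "farrow" the adjacent input row above or below it):

    /* Sketch of the h2v2 triangle filter; not the shipped C path. */
    static int vsum(const unsigned char *nearrow, const unsigned char *farrow,
                    long i, long dw)
    {
      if (i < 0)      i = 0;          /* replicate edges, as the masks below do */
      if (i > dw - 1) i = dw - 1;
      return nearrow[i] * 3 + farrow[i];        /* vertical 3:1 blend */
    }

    static void h2v2_fancy_upsample_row(const unsigned char *nearrow,
                                        const unsigned char *farrow,
                                        unsigned char *out, long dw)
    {
      long i;
      for (i = 0; i < dw; i++) {
        int centre = vsum(nearrow, farrow, i, dw) * 3;  /* horizontal 3:1 blend */
        out[2 * i]     = (unsigned char)
                         ((centre + vsum(nearrow, farrow, i - 1, dw) + 8) >> 4);
        out[2 * i + 1] = (unsigned char)
                         ((centre + vsum(nearrow, farrow, i + 1, dw) + 7) >> 4);
      }
    }

Each input row yields two output rows: call once with farrow pointing at the row above (for the upper output row) and once with the row below (for the lower one). The assembly below stages the vertical sums in the output rows themselves ("temporarily save the intermediate data") before the horizontal pass overwrites them.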
+ ; + ; GLOBAL(void) -; jpeg_h2v2_fancy_upsample_sse2 (j_decompress_ptr cinfo, -; jpeg_component_info * compptr, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); ++; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor, ++; JDIMENSION downsampled_width, ++; JSAMPARRAY input_data, ++; JSAMPARRAY * output_data_ptr); + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr ++%define max_v_samp(b) (b)+8 ; int max_v_samp_factor ++%define downsamp_width(b) (b)+12 ; JDIMENSION downsampled_width + %define input_data(b) (b)+16 ; JSAMPARRAY input_data + %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 4 + %define gotptr wk(0)-SIZEOF_POINTER ; void * gotptr + + align 16 - global EXTN(jpeg_h2v2_fancy_upsample_sse2) ++ global EXTN(jsimd_h2v2_fancy_upsample_sse2) + -EXTN(jpeg_h2v2_fancy_upsample_sse2): ++EXTN(jsimd_h2v2_fancy_upsample_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic eax ; make a room for GOT address + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + movpic POINTER [gotptr], ebx ; save GOT address + + mov edx,eax ; edx = original ebp - mov eax, POINTER [compptr(edx)] - mov eax, JDIMENSION [jcompinfo_downsampled_width(eax)] ; colctr ++ mov eax, JDIMENSION [downsamp_width(edx)] ; colctr + test eax,eax + jz near .return + - mov ecx, POINTER [cinfo(edx)] - mov ecx, INT [jdstruct_max_v_samp_factor(ecx)] ; rowctr ++ mov ecx, INT [max_v_samp(edx)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(edx)] ; input_data + mov edi, POINTER [output_data_ptr(edx)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 + .rowloop: + push eax ; colctr + push ecx + push edi + push esi + + mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) + mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 + mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + + test eax, SIZEOF_XMMWORD-1 + jz short .skip + push edx + mov dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl + mov dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE] + mov JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl ; insert a dummy sample + pop edx + .skip: + ; -- process the first column block + + movdqa xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD] ; xmm0=row[ 0][0] + movdqa xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD] ; xmm1=row[-1][0] + movdqa xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD] ; xmm2=row[+1][0] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + + 
pcmpeqb xmm7,xmm7 + psrldq xmm7,(SIZEOF_XMMWORD-2) + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6 + + pand xmm1,xmm7 ; xmm1=( 0 -- -- -- -- -- -- --) + pand xmm2,xmm7 ; xmm2=( 0 -- -- -- -- -- -- --) + + movdqa XMMWORD [wk(0)], xmm1 + movdqa XMMWORD [wk(1)], xmm2 + + poppic ebx + + add eax, byte SIZEOF_XMMWORD-1 + and eax, byte -SIZEOF_XMMWORD + cmp eax, byte SIZEOF_XMMWORD + ja short .columnloop + alignx 16,7 + + .columnloop_last: + ; -- process the last column block + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pcmpeqb xmm1,xmm1 + pslldq xmm1,(SIZEOF_XMMWORD-2) + movdqa xmm2,xmm1 + + pand xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD] + pand xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa XMMWORD [wk(2)], xmm1 ; xmm1=(-- -- -- -- -- -- -- 15) + movdqa XMMWORD [wk(3)], xmm2 ; xmm2=(-- -- -- -- -- -- -- 15) + + jmp near .upsample + alignx 16,7 + + .columnloop: + ; -- process the next column block + + movdqa xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD] ; xmm0=row[ 0][1] + movdqa xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD] ; xmm1=row[-1][1] + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] ; xmm2=row[+1][1] + + pushpic ebx + movpic ebx, POINTER [gotptr] ; load GOT address + + pxor xmm3,xmm3 ; xmm3=(all 0's) + movdqa xmm4,xmm0 + punpcklbw xmm0,xmm3 ; xmm0=row[ 0]( 0 1 2 3 4 5 6 7) + punpckhbw xmm4,xmm3 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) + movdqa xmm5,xmm1 + punpcklbw xmm1,xmm3 ; xmm1=row[-1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm5,xmm3 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) + movdqa xmm6,xmm2 + punpcklbw xmm2,xmm3 ; xmm2=row[+1]( 0 1 2 3 4 5 6 7) + punpckhbw xmm6,xmm3 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) + + pmullw xmm0,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + + paddw xmm1,xmm0 ; xmm1=Int0L=( 0 1 2 3 4 5 6 7) + paddw xmm5,xmm4 ; xmm5=Int0H=( 8 9 10 11 12 13 14 15) + paddw xmm2,xmm0 ; xmm2=Int1L=( 0 1 2 3 4 5 6 7) + paddw xmm6,xmm4 ; xmm6=Int1H=( 8 9 10 11 12 13 14 15) + + movdqa XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1 ; temporarily save + movdqa XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5 ; the intermediate data + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6 + + pslldq xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- -- 0) + pslldq xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- -- 0) + + movdqa XMMWORD [wk(2)], xmm1 + movdqa XMMWORD [wk(3)], xmm2 + + .upsample: + ; -- process the upper row + + movdqa xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD] + movdqa xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD] + + movdqa xmm0,xmm7 ; xmm7=Int0L=( 0 1 2 3 4 5 6 7) + movdqa xmm4,xmm3 ; xmm3=Int0H=( 8 9 10 11 12 13 14 15) + psrldq xmm0,2 ; xmm0=( 1 2 3 4 5 6 7 --) + pslldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- -- 8) + movdqa xmm5,xmm7 + movdqa xmm6,xmm3 + psrldq xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --) + pslldq xmm6,2 ; xmm6=(-- 8 9 10 11 12 13 14) + + por xmm0,xmm4 ; xmm0=( 1 2 3 4 5 6 7 8) + por xmm5,xmm6 ; xmm5=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm7 + movdqa xmm2,xmm3 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm2,2 ; xmm2=( 9 10 11 12 13 14 15 --) + movdqa xmm4,xmm3 + psrldq xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --) + + 
por xmm1, XMMWORD [wk(0)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm2, XMMWORD [wk(2)] ; xmm2=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(0)], xmm4 + + pmullw xmm7,[GOTOFF(ebx,PW_THREE)] + pmullw xmm3,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm5,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm2,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm7 + paddw xmm5,xmm3 + psrlw xmm1,4 ; xmm1=Out0LE=( 0 2 4 6 8 10 12 14) + psrlw xmm5,4 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30) + paddw xmm0,xmm7 + paddw xmm2,xmm3 + psrlw xmm0,4 ; xmm0=Out0LO=( 1 3 5 7 9 11 13 15) + psrlw xmm2,4 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31) + + psllw xmm0,BYTE_BIT + psllw xmm2,BYTE_BIT + por xmm1,xmm0 ; xmm1=Out0L=( 0 1 2 ... 13 14 15) + por xmm5,xmm2 ; xmm5=Out0H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5 + + ; -- process the lower row + + movdqa xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD] + movdqa xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD] + + movdqa xmm7,xmm6 ; xmm6=Int1L=( 0 1 2 3 4 5 6 7) + movdqa xmm3,xmm4 ; xmm4=Int1H=( 8 9 10 11 12 13 14 15) + psrldq xmm7,2 ; xmm7=( 1 2 3 4 5 6 7 --) + pslldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- -- 8) + movdqa xmm0,xmm6 + movdqa xmm2,xmm4 + psrldq xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --) + pslldq xmm2,2 ; xmm2=(-- 8 9 10 11 12 13 14) + + por xmm7,xmm3 ; xmm7=( 1 2 3 4 5 6 7 8) + por xmm0,xmm2 ; xmm0=( 7 8 9 10 11 12 13 14) + + movdqa xmm1,xmm6 + movdqa xmm5,xmm4 + pslldq xmm1,2 ; xmm1=(-- 0 1 2 3 4 5 6) + psrldq xmm5,2 ; xmm5=( 9 10 11 12 13 14 15 --) + movdqa xmm3,xmm4 + psrldq xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --) + + por xmm1, XMMWORD [wk(1)] ; xmm1=(-1 0 1 2 3 4 5 6) + por xmm5, XMMWORD [wk(3)] ; xmm5=( 9 10 11 12 13 14 15 16) + + movdqa XMMWORD [wk(1)], xmm3 + + pmullw xmm6,[GOTOFF(ebx,PW_THREE)] + pmullw xmm4,[GOTOFF(ebx,PW_THREE)] + paddw xmm1,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm0,[GOTOFF(ebx,PW_EIGHT)] + paddw xmm7,[GOTOFF(ebx,PW_SEVEN)] + paddw xmm5,[GOTOFF(ebx,PW_SEVEN)] + + paddw xmm1,xmm6 + paddw xmm0,xmm4 + psrlw xmm1,4 ; xmm1=Out1LE=( 0 2 4 6 8 10 12 14) + psrlw xmm0,4 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30) + paddw xmm7,xmm6 + paddw xmm5,xmm4 + psrlw xmm7,4 ; xmm7=Out1LO=( 1 3 5 7 9 11 13 15) + psrlw xmm5,4 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31) + + psllw xmm7,BYTE_BIT + psllw xmm5,BYTE_BIT + por xmm1,xmm7 ; xmm1=Out1L=( 0 1 2 ... 13 14 15) + por xmm0,xmm5 ; xmm0=Out1H=(16 17 18 ... 29 30 31) + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0 + + poppic ebx + + sub eax, byte SIZEOF_XMMWORD + add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) + add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 + add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) + add edx, byte 2*SIZEOF_XMMWORD ; outptr0 + add edi, byte 2*SIZEOF_XMMWORD ; outptr1 + cmp eax, byte SIZEOF_XMMWORD + ja near .columnloop + test eax,eax + jnz near .columnloop_last + + pop esi + pop edi + pop ecx + pop eax + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg near .rowloop + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%ifdef UPSAMPLE_H1V2_SUPPORTED - -; -------------------------------------------------------------------------- -; -; Fancy processing for the common case of 1:1 horizontal and 2:1 vertical. 
-; Again a triangle filter; see comments for h2v1 case, above. -; -; GLOBAL(void) -; jpeg_h1v2_fancy_upsample_sse2 (j_decompress_ptr cinfo, -; jpeg_component_info * compptr, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); -; - -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define input_data(b) (b)+16 ; JSAMPARRAY input_data -%define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr - -%define gotptr ebp-SIZEOF_POINTER ; void * gotptr - - align 16 - global EXTN(jpeg_h1v2_fancy_upsample_sse2) - -EXTN(jpeg_h1v2_fancy_upsample_sse2): - push ebp - mov ebp,esp - pushpic eax ; make a room for GOT address - push ebx -; push ecx ; need not be preserved -; push edx ; need not be preserved - push esi - push edi - - get_GOT ebx ; get GOT address - movpic POINTER [gotptr], ebx ; save GOT address - - mov eax, POINTER [compptr(ebp)] - mov eax, JDIMENSION [jcompinfo_downsampled_width(eax)] ; colctr - add eax, byte SIZEOF_XMMWORD-1 - and eax, byte -SIZEOF_XMMWORD - jz near .return - - mov ecx, POINTER [cinfo(ebp)] - mov ecx, INT [jdstruct_max_v_samp_factor(ecx)] ; rowctr - test ecx,ecx - jz near .return - - mov esi, JSAMPARRAY [input_data(ebp)] ; input_data - mov edi, POINTER [output_data_ptr(ebp)] - mov edi, JSAMPARRAY [edi] ; output_data - alignx 16,7 -.rowloop: - push eax ; colctr - push ecx - push edi - push esi - - mov ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW] ; inptr1(above) - mov ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0 - mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1(below) - mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 - mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 - - pxor xmm0,xmm0 ; xmm0=(all 0's) - alignx 16,7 - -.columnloop: - movdqa xmm1, XMMWORD [ebx] ; xmm1=row[ 0]( 0 1 2 ... 13 14 15) - movdqa xmm2, XMMWORD [ecx] ; xmm2=row[-1]( 0 1 2 ... 13 14 15) - movdqa xmm3, XMMWORD [esi] ; xmm3=row[+1]( 0 1 2 ... 13 14 15) - - pushpic ebx - movpic ebx, POINTER [gotptr] ; load GOT address - - movdqa xmm4,xmm1 - punpcklbw xmm1,xmm0 ; xmm1=row[ 0]( 0 1 2 3 4 5 6 7) - punpckhbw xmm4,xmm0 ; xmm4=row[ 0]( 8 9 10 11 12 13 14 15) - movdqa xmm5,xmm2 - punpcklbw xmm2,xmm0 ; xmm2=row[-1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm5,xmm0 ; xmm5=row[-1]( 8 9 10 11 12 13 14 15) - movdqa xmm6,xmm3 - punpcklbw xmm3,xmm0 ; xmm3=row[+1]( 0 1 2 3 4 5 6 7) - punpckhbw xmm6,xmm0 ; xmm6=row[+1]( 8 9 10 11 12 13 14 15) - - pmullw xmm1,[GOTOFF(ebx,PW_THREE)] - pmullw xmm4,[GOTOFF(ebx,PW_THREE)] - paddw xmm2,[GOTOFF(ebx,PW_ONE)] - paddw xmm5,[GOTOFF(ebx,PW_ONE)] - paddw xmm3,[GOTOFF(ebx,PW_TWO)] - paddw xmm6,[GOTOFF(ebx,PW_TWO)] - - paddw xmm2,xmm1 - paddw xmm5,xmm4 - psrlw xmm2,2 ; xmm2=Out0L=( 0 1 2 3 4 5 6 7) - psrlw xmm5,2 ; xmm5=Out0H=( 8 9 10 11 12 13 14 15) - paddw xmm3,xmm1 - paddw xmm6,xmm4 - psrlw xmm3,2 ; xmm3=Out1L=( 0 1 2 3 4 5 6 7) - psrlw xmm6,2 ; xmm6=Out1H=( 8 9 10 11 12 13 14 15) - - packuswb xmm2,xmm5 ; xmm2=Out0=( 0 1 2 ... 13 14 15) - packuswb xmm3,xmm6 ; xmm3=Out1=( 0 1 2 ... 
13 14 15) - - movdqa XMMWORD [edx], xmm2 - movdqa XMMWORD [edi], xmm3 - - poppic ebx - - add ecx, byte 1*SIZEOF_XMMWORD ; inptr1(above) - add ebx, byte 1*SIZEOF_XMMWORD ; inptr0 - add esi, byte 1*SIZEOF_XMMWORD ; inptr1(below) - add edx, byte 1*SIZEOF_XMMWORD ; outptr0 - add edi, byte 1*SIZEOF_XMMWORD ; outptr1 - sub eax, byte SIZEOF_XMMWORD - jnz near .columnloop - - pop esi - pop edi - pop ecx - pop eax - - add esi, byte 1*SIZEOF_JSAMPROW ; input_data - add edi, byte 2*SIZEOF_JSAMPROW ; output_data - sub ecx, byte 2 ; rowctr - jg near .rowloop - -.return: - pop edi - pop esi -; pop edx ; need not be preserved -; pop ecx ; need not be preserved - pop ebx - poppic eax ; remove gotptr - pop ebp - ret - -%endif ; UPSAMPLE_H1V2_SUPPORTED -%endif ; JDSAMPLE_FANCY_SSE2_SUPPORTED - -%ifdef JDSAMPLE_SIMPLE_SSE2_SUPPORTED - -%ifndef JDSAMPLE_FANCY_SSE2_SUPPORTED + ; -------------------------------------------------------------------------- - SECTION SEG_TEXT - BITS 32 -%endif + ; + ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical. + ; It's still a box filter. + ; + ; GLOBAL(void) -; jpeg_h2v1_upsample_sse2 (j_decompress_ptr cinfo, -; jpeg_component_info * compptr, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); ++; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor, ++; JDIMENSION output_width, ++; JSAMPARRAY input_data, ++; JSAMPARRAY * output_data_ptr); + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr ++%define max_v_samp(b) (b)+8 ; int max_v_samp_factor ++%define output_width(b) (b)+12 ; JDIMENSION output_width + %define input_data(b) (b)+16 ; JSAMPARRAY input_data + %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr + + align 16 - global EXTN(jpeg_h2v1_upsample_sse2) ++ global EXTN(jsimd_h2v1_upsample_sse2) + -EXTN(jpeg_h2v1_upsample_sse2): ++EXTN(jsimd_h2v1_upsample_sse2): + push ebp + mov ebp,esp + ; push ebx ; unused + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + - mov edx, POINTER [cinfo(ebp)] - mov edx, JDIMENSION [jdstruct_output_width(edx)] ++ mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz short .return + - mov ecx, POINTER [cinfo(ebp)] - mov ecx, INT [jdstruct_max_v_samp_factor(ecx)] ; rowctr ++ mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz short .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 + .rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov edi, JSAMPROW [edi] ; outptr + mov eax,edx ; colctr + alignx 16,7 + .columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add edi, byte 4*SIZEOF_XMMWORD ; outptr + jmp short .columnloop + alignx 16,7 + + .nextrow: + pop esi + pop edi + + add esi, byte SIZEOF_JSAMPROW ; input_data + add edi, byte SIZEOF_JSAMPROW ; output_data + dec ecx ; rowctr + jg 
short .rowloop + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + ; pop ebx ; unused + pop ebp + ret + + ; -------------------------------------------------------------------------- + ; + ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical. + ; It's still a box filter. + ; + ; GLOBAL(void) -; jpeg_h2v2_upsample_sse2 (j_decompress_ptr cinfo, -; jpeg_component_info * compptr, -; JSAMPARRAY input_data, -; JSAMPARRAY * output_data_ptr); ++; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor, ++; JDIMENSION output_width, ++; JSAMPARRAY input_data, ++; JSAMPARRAY * output_data_ptr); + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr ++%define max_v_samp(b) (b)+8 ; int max_v_samp_factor ++%define output_width(b) (b)+12 ; JDIMENSION output_width + %define input_data(b) (b)+16 ; JSAMPARRAY input_data + %define output_data_ptr(b) (b)+20 ; JSAMPARRAY * output_data_ptr + + align 16 - global EXTN(jpeg_h2v2_upsample_sse2) ++ global EXTN(jsimd_h2v2_upsample_sse2) + -EXTN(jpeg_h2v2_upsample_sse2): ++EXTN(jsimd_h2v2_upsample_sse2): + push ebp + mov ebp,esp + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + - mov edx, POINTER [cinfo(ebp)] - mov edx, JDIMENSION [jdstruct_output_width(edx)] ++ mov edx, JDIMENSION [output_width(ebp)] + add edx, byte (2*SIZEOF_XMMWORD)-1 + and edx, byte -(2*SIZEOF_XMMWORD) + jz near .return + - mov ecx, POINTER [cinfo(ebp)] - mov ecx, INT [jdstruct_max_v_samp_factor(ecx)] ; rowctr ++ mov ecx, INT [max_v_samp(ebp)] ; rowctr + test ecx,ecx + jz near .return + + mov esi, JSAMPARRAY [input_data(ebp)] ; input_data + mov edi, POINTER [output_data_ptr(ebp)] + mov edi, JSAMPARRAY [edi] ; output_data + alignx 16,7 + .rowloop: + push edi + push esi + + mov esi, JSAMPROW [esi] ; inptr + mov ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] ; outptr0 + mov edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] ; outptr1 + mov eax,edx ; colctr + alignx 16,7 + .columnloop: + + movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] + + movdqa xmm1,xmm0 + punpcklbw xmm0,xmm0 + punpckhbw xmm1,xmm1 + + movdqa XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1 + movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 + movdqa XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + movdqa xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD] + + movdqa xmm3,xmm2 + punpcklbw xmm2,xmm2 + punpckhbw xmm3,xmm3 + + movdqa XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3 + movdqa XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2 + movdqa XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3 + + sub eax, byte 2*SIZEOF_XMMWORD + jz short .nextrow + + add esi, byte 2*SIZEOF_XMMWORD ; inptr + add ebx, byte 4*SIZEOF_XMMWORD ; outptr0 + add edi, byte 4*SIZEOF_XMMWORD ; outptr1 + jmp short .columnloop + alignx 16,7 + + .nextrow: + pop esi + pop edi + + add esi, byte 1*SIZEOF_JSAMPROW ; input_data + add edi, byte 2*SIZEOF_JSAMPROW ; output_data + sub ecx, byte 2 ; rowctr + jg short .rowloop + + .return: + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + -%endif ; JDSAMPLE_SIMPLE_SSE2_SUPPORTED diff --cc simd/jfss2fst.asm index 0000000,567bcef..e42d225 mode 000000,100644..100644 --- a/simd/jfss2fst.asm +++ b/simd/jfss2fst.asm @@@ -1,0 -1,411 +1,401 @@@ + ; + ; jfss2fst.asm - fast integer FDCT (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio
AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; + ; This file contains a fast, not so accurate integer implementation of + ; the forward DCT (Discrete Cosine Transform). The following code is + ; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c + ; for more details. + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jdct.inc" - -%ifdef DCT_IFAST_SUPPORTED -%ifdef JFDCT_INT_SSE2_SUPPORTED - -; This module is specialized to the case DCTSIZE = 8. -; -%if DCTSIZE != 8 -%error "Sorry, this code only copes with 8x8 DCTs." -%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + + %define CONST_BITS 8 ; 14 is also OK. + + %if CONST_BITS == 8 + F_0_382 equ 98 ; FIX(0.382683433) + F_0_541 equ 139 ; FIX(0.541196100) + F_0_707 equ 181 ; FIX(0.707106781) + F_1_306 equ 334 ; FIX(1.306562965) + %else + ; NASM cannot do compile-time arithmetic on floating-point constants. + %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) + F_0_382 equ DESCALE( 410903207,30-CONST_BITS) ; FIX(0.382683433) + F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) + F_0_707 equ DESCALE( 759250124,30-CONST_BITS) ; FIX(0.707106781) + F_1_306 equ DESCALE(1402911301,30-CONST_BITS) ; FIX(1.306562965) + %endif + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) + ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + + %define PRE_MULTIPLY_SCALE_BITS 2 + %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_fdct_ifast_sse2) + + EXTN(jconst_fdct_ifast_sse2): + + PW_F0707 times 8 dw F_0_707 << CONST_SHIFT + PW_F0382 times 8 dw F_0_382 << CONST_SHIFT + PW_F0541 times 8 dw F_0_541 << CONST_SHIFT + PW_F1306 times 8 dw F_1_306 << CONST_SHIFT + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Perform the forward DCT on one block of samples. + ; + ; GLOBAL(void) -; jpeg_fdct_ifast_sse2 (DCTELEM * data) ++; jsimd_fdct_ifast_sse2 (DCTELEM * data) + ; + + %define data(b) (b)+8 ; DCTELEM * data + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 2 + + align 16 - global EXTN(jpeg_fdct_ifast_sse2) ++ global EXTN(jsimd_fdct_ifast_sse2) + -EXTN(jpeg_fdct_ifast_sse2): ++EXTN(jsimd_fdct_ifast_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx + ; push ecx ; unused + ; push edx ; need not be preserved + ; push esi ; unused + ; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
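++ ; All eight rows are transformed at once: the punpcklwd/punpckhwd/
++ ; punpckldq/punpckhdq sequences below transpose the 8x8 block so that
++ ; each xmm register holds one sample position from every row, letting
++ ; each paddw/psubw butterfly advance eight 1-D DCTs in parallel.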
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(0)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 
64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + psubw xmm3,xmm1 ; xmm3=tmp13 + psubw xmm6,xmm7 ; xmm6=tmp12 + paddw xmm4,xmm1 ; xmm4=tmp10 + paddw xmm0,xmm7 ; xmm0=tmp11 + + paddw xmm6,xmm3 + psllw xmm6,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm6,[GOTOFF(ebx,PW_F0707)] ; xmm6=z1 + + movdqa xmm1,xmm4 + movdqa xmm7,xmm3 + psubw xmm4,xmm0 ; xmm4=data4 + psubw xmm3,xmm6 ; xmm3=data6 + paddw xmm1,xmm0 ; xmm1=data0 + paddw xmm7,xmm6 ; xmm7=data2 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=tmp6 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp7 + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=data4 + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=data6 + + ; -- Odd part + + paddw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm5,xmm0 ; xmm5=tmp11 + paddw xmm0,xmm6 ; xmm0=tmp12, xmm6=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z3 + + movdqa xmm4,xmm2 ; xmm4=tmp10 + psubw xmm2,xmm0 + pmulhw xmm2,[GOTOFF(ebx,PW_F0382)] ; xmm2=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm0,[GOTOFF(ebx,PW_F1306)] ; xmm0=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm2 ; xmm4=z2 + paddw xmm0,xmm2 ; xmm0=z4 + + movdqa xmm3,xmm6 + psubw xmm6,xmm5 ; xmm6=z13 + paddw xmm3,xmm5 ; xmm3=z11 + + movdqa xmm2,xmm6 + movdqa xmm5,xmm3 + psubw xmm6,xmm4 ; xmm6=data3 + psubw xmm3,xmm0 ; xmm3=data7 + paddw xmm2,xmm4 ; xmm2=data5 + paddw xmm5,xmm0 ; xmm5=data1 + + ; ---- Pass 2: process columns. + + ; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + ; xmm1=(00 10 20 30 40 50 60 70), xmm7=(02 12 22 32 42 52 62 72) + ; xmm5=(01 11 21 31 41 51 61 71), xmm6=(03 13 23 33 43 53 63 73) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 10 11 20 21 30 31) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 50 51 60 61 70 71) + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm6 ; xmm7=(02 03 12 13 22 23 32 33) + punpckhwd xmm0,xmm6 ; xmm0=(42 43 52 53 62 63 72 73) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=col4 + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=col6 + + ; xmm5=(04 14 24 34 44 54 64 74), xmm6=(06 16 26 36 46 56 66 76) + ; xmm2=(05 15 25 35 45 55 65 75), xmm3=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm7,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm2 ; xmm5=(04 05 14 15 24 25 34 35) + punpckhwd xmm7,xmm2 ; xmm7=(44 45 54 55 64 65 74 75) + movdqa xmm0,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm3 ; xmm6=(06 07 16 17 26 27 36 37) + punpckhwd xmm0,xmm3 ; xmm0=(46 47 56 57 66 67 76 77) + + movdqa xmm2,xmm5 ; transpose coefficients(phase 2) + punpckldq xmm5,xmm6 ; xmm5=(04 05 06 07 14 15 16 17) + punpckhdq xmm2,xmm6 ; xmm2=(24 25 26 27 34 35 36 37) + movdqa xmm3,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm0 ; xmm7=(44 45 46 47 54 55 56 57) + punpckhdq xmm3,xmm0 ; xmm3=(64 65 66 67 74 75 76 77) + + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=(02 03 12 13 22 23 32 33) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=(44 45 46 47 54 
55 56 57) + + movdqa xmm2,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 10 11 12 13) + punpckhdq xmm2,xmm6 ; xmm2=(20 21 22 23 30 31 32 33) + movdqa xmm7,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(40 41 42 43 50 51 52 53) + punpckhdq xmm7,xmm0 ; xmm7=(60 61 62 63 70 71 72 73) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm5 ; xmm1=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm6,xmm5 ; xmm6=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm0,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm3 ; xmm7=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm0,xmm3 ; xmm0=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm5,xmm6 + movdqa xmm3,xmm1 + psubw xmm6,xmm7 ; xmm6=data1-data6=tmp6 + psubw xmm1,xmm0 ; xmm1=data0-data7=tmp7 + paddw xmm5,xmm7 ; xmm5=data1+data6=tmp1 + paddw xmm3,xmm0 ; xmm3=data0+data7=tmp0 + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(24 25 26 27 34 35 36 37) + movdqa xmm0, XMMWORD [wk(1)] ; xmm0=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm6 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=tmp7 + + movdqa xmm6,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm7 ; xmm2=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm6,xmm7 ; xmm6=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm1,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm1,xmm0 ; xmm1=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm7,xmm6 + movdqa xmm0,xmm2 + paddw xmm6,xmm4 ; xmm6=data3+data4=tmp3 + paddw xmm2,xmm1 ; xmm2=data2+data5=tmp2 + psubw xmm7,xmm4 ; xmm7=data3-data4=tmp4 + psubw xmm0,xmm1 ; xmm0=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm1,xmm5 + psubw xmm3,xmm6 ; xmm3=tmp13 + psubw xmm5,xmm2 ; xmm5=tmp12 + paddw xmm4,xmm6 ; xmm4=tmp10 + paddw xmm1,xmm2 ; xmm1=tmp11 + + paddw xmm5,xmm3 + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F0707)] ; xmm5=z1 + + movdqa xmm6,xmm4 + movdqa xmm2,xmm3 + psubw xmm4,xmm1 ; xmm4=data4 + psubw xmm3,xmm5 ; xmm3=data6 + paddw xmm6,xmm1 ; xmm6=data0 + paddw xmm2,xmm5 ; xmm2=data2 + + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; -- Odd part + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + paddw xmm7,xmm0 ; xmm7=tmp10 + paddw xmm0,xmm1 ; xmm0=tmp11 + paddw xmm1,xmm5 ; xmm1=tmp12, xmm5=tmp7 + + psllw xmm7,PRE_MULTIPLY_SCALE_BITS + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm0,[GOTOFF(ebx,PW_F0707)] ; xmm0=z3 + + movdqa xmm4,xmm7 ; xmm4=tmp10 + psubw xmm7,xmm1 + pmulhw xmm7,[GOTOFF(ebx,PW_F0382)] ; xmm7=z5 + pmulhw xmm4,[GOTOFF(ebx,PW_F0541)] ; xmm4=MULTIPLY(tmp10,FIX_0_541196) + pmulhw xmm1,[GOTOFF(ebx,PW_F1306)] ; xmm1=MULTIPLY(tmp12,FIX_1_306562) + paddw xmm4,xmm7 ; xmm4=z2 + paddw xmm1,xmm7 ; xmm1=z4 + + movdqa xmm3,xmm5 + psubw xmm5,xmm0 ; xmm5=z13 + paddw xmm3,xmm0 ; xmm3=z11 + + movdqa xmm6,xmm5 + movdqa xmm2,xmm3 + psubw xmm5,xmm4 ; xmm5=data3 + psubw xmm3,xmm1 ; xmm3=data7 + paddw xmm6,xmm4 ; xmm6=data5 + paddw xmm2,xmm1 ; xmm2=data1 + + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm5 + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm3 + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm6 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm2 + + ; pop edi ; unused + ; pop 
esi ; unused + ; pop edx ; need not be preserved + ; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%endif ; JFDCT_INT_SSE2_SUPPORTED -%endif ; DCT_IFAST_SUPPORTED diff --cc simd/jfss2int.asm index 0000000,106b42c..6e37497 mode 000000,100644..100644 --- a/simd/jfss2int.asm +++ b/simd/jfss2int.asm @@@ -1,0 -1,641 +1,631 @@@ + ; + ; jfss2int.asm - accurate integer FDCT (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; + ; This file contains a slow-but-accurate integer implementation of the + ; forward DCT (Discrete Cosine Transform). The following code is based + ; directly on the IJG's original jfdctint.c; see the jfdctint.c for + ; more details. + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jdct.inc" - -%ifdef DCT_ISLOW_SUPPORTED -%ifdef JFDCT_INT_SSE2_SUPPORTED - -; This module is specialized to the case DCTSIZE = 8. -; -%if DCTSIZE != 8 -%error "Sorry, this code only copes with 8x8 DCTs." -%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + + %define CONST_BITS 13 + %define PASS1_BITS 2 + + %define DESCALE_P1 (CONST_BITS-PASS1_BITS) + %define DESCALE_P2 (CONST_BITS+PASS1_BITS) + + %if CONST_BITS == 13 + F_0_298 equ 2446 ; FIX(0.298631336) + F_0_390 equ 3196 ; FIX(0.390180644) + F_0_541 equ 4433 ; FIX(0.541196100) + F_0_765 equ 6270 ; FIX(0.765366865) + F_0_899 equ 7373 ; FIX(0.899976223) + F_1_175 equ 9633 ; FIX(1.175875602) + F_1_501 equ 12299 ; FIX(1.501321110) + F_1_847 equ 15137 ; FIX(1.847759065) + F_1_961 equ 16069 ; FIX(1.961570560) + F_2_053 equ 16819 ; FIX(2.053119869) + F_2_562 equ 20995 ; FIX(2.562915447) + F_3_072 equ 25172 ; FIX(3.072711026) + %else + ; NASM cannot do compile-time arithmetic on floating-point constants. 
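++ ; The literals below are the same constants prescaled by 2^30; DESCALE
++ ; rounds them to CONST_BITS fractional bits, matching the table above
++ ; (e.g. DESCALE(320652955,30-13) = 2446 = FIX(0.298631336)).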
+ %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) + F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) + F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) + F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) + F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) + F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) + F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) + F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) + F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) + F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) + F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) + F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) + F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) + %endif + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_fdct_islow_sse2) + + EXTN(jconst_fdct_islow_sse2): + + PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 + PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) + PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 + PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) + PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 + PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) + PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 + PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) + PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) + PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) + PW_DESCALE_P2X times 8 dw 1 << (PASS1_BITS-1) + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Perform the forward DCT on one block of samples. + ; + ; GLOBAL(void) -; jpeg_fdct_islow_sse2 (DCTELEM * data) ++; jsimd_fdct_islow_sse2 (DCTELEM * data) + ; + + %define data(b) (b)+8 ; DCTELEM * data + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 6 + + align 16 - global EXTN(jpeg_fdct_islow_sse2) ++ global EXTN(jsimd_fdct_islow_sse2) + -EXTN(jpeg_fdct_islow_sse2): ++EXTN(jsimd_fdct_islow_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx + ; push ecx ; unused + ; push edx ; need not be preserved + ; push esi ; unused + ; push edi ; unused + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process rows. 
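++ ; As in jfdctint.c, pass 1 leaves its outputs scaled up by PASS1_BITS
++ ; (note the psllw by PASS1_BITS below); pass 2 then descales by
++ ; CONST_BITS+PASS1_BITS in a single step, preserving intermediate
++ ; accuracy.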
+ + mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)] + movdqa xmm2, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)] + + ; xmm0=(00 01 02 03 04 05 06 07), xmm2=(20 21 22 23 24 25 26 27) + ; xmm1=(10 11 12 13 14 15 16 17), xmm3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm1 ; xmm0=(00 10 01 11 02 12 03 13) + punpckhwd xmm4,xmm1 ; xmm4=(04 14 05 15 06 16 07 17) + movdqa xmm5,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm3 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm5,xmm3 ; xmm5=(24 34 25 35 26 36 27 37) + + movdqa xmm6, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)] + movdqa xmm7, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)] + movdqa xmm1, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)] + + ; xmm6=( 4 12 20 28 36 44 52 60), xmm1=( 6 14 22 30 38 46 54 62) + ; xmm7=( 5 13 21 29 37 45 53 61), xmm3=( 7 15 23 31 39 47 55 63) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=(20 30 21 31 22 32 23 33) + movdqa XMMWORD [wk(1)], xmm5 ; wk(1)=(24 34 25 35 26 36 27 37) + + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm7 ; xmm2=(44 54 45 55 46 56 47 57) + movdqa xmm5,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm3 ; xmm1=(60 70 61 71 62 72 63 73) + punpckhwd xmm5,xmm3 ; xmm5=(64 74 65 75 66 76 67 77) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm1 ; xmm6=(40 50 60 70 41 51 61 71) + punpckhdq xmm7,xmm1 ; xmm7=(42 52 62 72 43 53 63 73) + movdqa xmm3,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm3,xmm5 ; xmm3=(46 56 66 76 47 57 67 77) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(20 30 21 31 22 32 23 33) + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=(24 34 25 35 26 36 27 37) + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=(42 52 62 72 43 53 63 73) + movdqa XMMWORD [wk(3)], xmm2 ; wk(3)=(44 54 64 74 45 55 65 75) + + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm1 ; xmm0=(00 10 20 30 01 11 21 31) + punpckhdq xmm7,xmm1 ; xmm7=(02 12 22 32 03 13 23 33) + movdqa xmm2,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm5 ; xmm4=(04 14 24 34 05 15 25 35) + punpckhdq xmm2,xmm5 ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa xmm1,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=(00 10 20 30 40 50 60 70)=data0 + punpckhqdq xmm1,xmm6 ; xmm1=(01 11 21 31 41 51 61 71)=data1 + movdqa xmm5,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm3 ; xmm2=(06 16 26 36 46 56 66 76)=data6 + punpckhqdq xmm5,xmm3 ; xmm5=(07 17 27 37 47 57 67 77)=data7 + + movdqa xmm6,xmm1 + movdqa xmm3,xmm0 + psubw xmm1,xmm2 ; xmm1=data1-data6=tmp6 + psubw xmm0,xmm5 ; xmm0=data0-data7=tmp7 + paddw xmm6,xmm2 ; xmm6=data1+data6=tmp1 + paddw xmm3,xmm5 ; xmm3=data0+data7=tmp0 + + movdqa xmm2, XMMWORD [wk(2)] ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5, XMMWORD [wk(3)] ; xmm5=(44 54 64 74 45 55 65 75) + movdqa XMMWORD [wk(0)], xmm1 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp7 + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm2 ; xmm7=(02 12 22 32 42 52 62 72)=data2 + punpckhqdq xmm1,xmm2 ; xmm1=(03 13 23 33 43 53 63 73)=data3 + movdqa xmm0,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm5 ; xmm4=(04 14 24 34 44 54 
64 74)=data4 + punpckhqdq xmm0,xmm5 ; xmm0=(05 15 25 35 45 55 65 75)=data5 + + movdqa xmm2,xmm1 + movdqa xmm5,xmm7 + paddw xmm1,xmm4 ; xmm1=data3+data4=tmp3 + paddw xmm7,xmm0 ; xmm7=data2+data5=tmp2 + psubw xmm2,xmm4 ; xmm2=data3-data4=tmp4 + psubw xmm5,xmm0 ; xmm5=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm4,xmm3 + movdqa xmm0,xmm6 + paddw xmm3,xmm1 ; xmm3=tmp10 + paddw xmm6,xmm7 ; xmm6=tmp11 + psubw xmm4,xmm1 ; xmm4=tmp13 + psubw xmm0,xmm7 ; xmm0=tmp12 + + movdqa xmm1,xmm3 + paddw xmm3,xmm6 ; xmm3=tmp10+tmp11 + psubw xmm1,xmm6 ; xmm1=tmp10-tmp11 + + psllw xmm3,PASS1_BITS ; xmm3=data0 + psllw xmm1,PASS1_BITS ; xmm1=data4 + + movdqa XMMWORD [wk(2)], xmm3 ; wk(2)=data0 + movdqa XMMWORD [wk(3)], xmm1 ; wk(3)=data4 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm7,xmm4 ; xmm4=tmp13 + movdqa xmm6,xmm4 + punpcklwd xmm7,xmm0 ; xmm0=tmp12 + punpckhwd xmm6,xmm0 + movdqa xmm4,xmm7 + movdqa xmm0,xmm6 + pmaddwd xmm7,[GOTOFF(ebx,PW_F130_F054)] ; xmm7=data2L + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=data2H + pmaddwd xmm4,[GOTOFF(ebx,PW_F054_MF130)] ; xmm4=data6L + pmaddwd xmm0,[GOTOFF(ebx,PW_F054_MF130)] ; xmm0=data6H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm6,DESCALE_P1 + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm7,xmm6 ; xmm7=data2 + packssdw xmm4,xmm0 ; xmm4=data6 + + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=data2 + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=data6 + + ; -- Odd part + + movdqa xmm3, XMMWORD [wk(0)] ; xmm3=tmp6 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp7 + + movdqa xmm6,xmm2 ; xmm2=tmp4 + movdqa xmm0,xmm5 ; xmm5=tmp5 + paddw xmm6,xmm3 ; xmm6=z3 + paddw xmm0,xmm1 ; xmm0=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm7,xmm6 + movdqa xmm4,xmm6 + punpcklwd xmm7,xmm0 + punpckhwd xmm4,xmm0 + movdqa xmm6,xmm7 + movdqa xmm0,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3H + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4L + pmaddwd xmm0,[GOTOFF(ebx,PW_F117_F078)] ; xmm0=z4H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm7,xmm2 + movdqa xmm4,xmm2 + punpcklwd xmm7,xmm1 + 
punpckhwd xmm4,xmm1 + movdqa xmm2,xmm7 + movdqa xmm1,xmm4 + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp4L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF089_F060)] ; xmm2=tmp7L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp7H + + paddd xmm7, XMMWORD [wk(0)] ; xmm7=data7L + paddd xmm4, XMMWORD [wk(1)] ; xmm4=data7H + paddd xmm2,xmm6 ; xmm2=data1L + paddd xmm1,xmm0 ; xmm1=data1H + + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm7,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm2,DESCALE_P1 + psrad xmm1,DESCALE_P1 + + packssdw xmm7,xmm4 ; xmm7=data7 + packssdw xmm2,xmm1 ; xmm2=data1 + + movdqa xmm4,xmm5 + movdqa xmm1,xmm5 + punpcklwd xmm4,xmm3 + punpckhwd xmm1,xmm3 + movdqa xmm5,xmm4 + movdqa xmm3,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm4=tmp5L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5H + pmaddwd xmm5,[GOTOFF(ebx,PW_MF256_F050)] ; xmm5=tmp6L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6H + + paddd xmm4,xmm6 ; xmm4=data5L + paddd xmm1,xmm0 ; xmm1=data5H + paddd xmm5, XMMWORD [wk(0)] ; xmm5=data3L + paddd xmm3, XMMWORD [wk(1)] ; xmm3=data3H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm4,DESCALE_P1 + psrad xmm1,DESCALE_P1 + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P1)] + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] + psrad xmm5,DESCALE_P1 + psrad xmm3,DESCALE_P1 + + packssdw xmm4,xmm1 ; xmm4=data5 + packssdw xmm5,xmm3 ; xmm5=data3 + + ; ---- Pass 2: process columns. + + ; mov edx, POINTER [data(eax)] ; (DCTELEM *) + + movdqa xmm6, XMMWORD [wk(2)] ; xmm6=col0 + movdqa xmm0, XMMWORD [wk(4)] ; xmm0=col2 + + ; xmm6=(00 10 20 30 40 50 60 70), xmm0=(02 12 22 32 42 52 62 72) + ; xmm2=(01 11 21 31 41 51 61 71), xmm5=(03 13 23 33 43 53 63 73) + + movdqa xmm1,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm2 ; xmm6=(00 01 10 11 20 21 30 31) + punpckhwd xmm1,xmm2 ; xmm1=(40 41 50 51 60 61 70 71) + movdqa xmm3,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(02 03 12 13 22 23 32 33) + punpckhwd xmm3,xmm5 ; xmm3=(42 43 52 53 62 63 72 73) + + movdqa xmm2, XMMWORD [wk(3)] ; xmm2=col4 + movdqa xmm5, XMMWORD [wk(5)] ; xmm5=col6 + + ; xmm2=(04 14 24 34 44 54 64 74), xmm5=(06 16 26 36 46 56 66 76) + ; xmm4=(05 15 25 35 45 55 65 75), xmm7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=(02 03 12 13 22 23 32 33) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(42 43 52 53 62 63 72 73) + + movdqa xmm0,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm4 ; xmm2=(04 05 14 15 24 25 34 35) + punpckhwd xmm0,xmm4 ; xmm0=(44 45 54 55 64 65 74 75) + movdqa xmm3,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm7 ; xmm5=(06 07 16 17 26 27 36 37) + punpckhwd xmm3,xmm7 ; xmm3=(46 47 56 57 66 67 76 77) + + movdqa xmm4,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm5 ; xmm2=(04 05 06 07 14 15 16 17) + punpckhdq xmm4,xmm5 ; xmm4=(24 25 26 27 34 35 36 37) + movdqa xmm7,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(44 45 46 47 54 55 56 57) + punpckhdq xmm7,xmm3 ; xmm7=(64 65 66 67 74 75 76 77) + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=(02 03 12 13 22 23 32 33) + movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(42 43 52 53 62 63 72 73) + movdqa XMMWORD [wk(2)], xmm4 ; wk(2)=(24 25 26 27 34 35 36 37) + movdqa XMMWORD [wk(3)], xmm0 ; wk(3)=(44 45 46 47 54 55 56 57) + + movdqa xmm4,xmm6 ; transpose 
coefficients(phase 2) + punpckldq xmm6,xmm5 ; xmm6=(00 01 02 03 10 11 12 13) + punpckhdq xmm4,xmm5 ; xmm4=(20 21 22 23 30 31 32 33) + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm3 ; xmm1=(40 41 42 43 50 51 52 53) + punpckhdq xmm0,xmm3 ; xmm0=(60 61 62 63 70 71 72 73) + + movdqa xmm5,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm2 ; xmm6=(00 01 02 03 04 05 06 07)=data0 + punpckhqdq xmm5,xmm2 ; xmm5=(10 11 12 13 14 15 16 17)=data1 + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm7 ; xmm0=(60 61 62 63 64 65 66 67)=data6 + punpckhqdq xmm3,xmm7 ; xmm3=(70 71 72 73 74 75 76 77)=data7 + + movdqa xmm2,xmm5 + movdqa xmm7,xmm6 + psubw xmm5,xmm0 ; xmm5=data1-data6=tmp6 + psubw xmm6,xmm3 ; xmm6=data0-data7=tmp7 + paddw xmm2,xmm0 ; xmm2=data1+data6=tmp1 + paddw xmm7,xmm3 ; xmm7=data0+data7=tmp0 + + movdqa xmm0, XMMWORD [wk(2)] ; xmm0=(24 25 26 27 34 35 36 37) + movdqa xmm3, XMMWORD [wk(3)] ; xmm3=(44 45 46 47 54 55 56 57) + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=tmp6 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp7 + + movdqa xmm5,xmm4 ; transpose coefficients(phase 3) + punpcklqdq xmm4,xmm0 ; xmm4=(20 21 22 23 24 25 26 27)=data2 + punpckhqdq xmm5,xmm0 ; xmm5=(30 31 32 33 34 35 36 37)=data3 + movdqa xmm6,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm3 ; xmm1=(40 41 42 43 44 45 46 47)=data4 + punpckhqdq xmm6,xmm3 ; xmm6=(50 51 52 53 54 55 56 57)=data5 + + movdqa xmm0,xmm5 + movdqa xmm3,xmm4 + paddw xmm5,xmm1 ; xmm5=data3+data4=tmp3 + paddw xmm4,xmm6 ; xmm4=data2+data5=tmp2 + psubw xmm0,xmm1 ; xmm0=data3-data4=tmp4 + psubw xmm3,xmm6 ; xmm3=data2-data5=tmp5 + + ; -- Even part + + movdqa xmm1,xmm7 + movdqa xmm6,xmm2 + paddw xmm7,xmm5 ; xmm7=tmp10 + paddw xmm2,xmm4 ; xmm2=tmp11 + psubw xmm1,xmm5 ; xmm1=tmp13 + psubw xmm6,xmm4 ; xmm6=tmp12 + + movdqa xmm5,xmm7 + paddw xmm7,xmm2 ; xmm7=tmp10+tmp11 + psubw xmm5,xmm2 ; xmm5=tmp10-tmp11 + + paddw xmm7,[GOTOFF(ebx,PW_DESCALE_P2X)] + paddw xmm5,[GOTOFF(ebx,PW_DESCALE_P2X)] + psraw xmm7,PASS1_BITS ; xmm7=data0 + psraw xmm5,PASS1_BITS ; xmm5=data4 + + movdqa XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_DCTELEM)], xmm7 + movdqa XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_DCTELEM)], xmm5 + + ; (Original) + ; z1 = (tmp12 + tmp13) * 0.541196100; + ; data2 = z1 + tmp13 * 0.765366865; + ; data6 = z1 + tmp12 * -1.847759065; + ; + ; (This implementation) + ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100; + ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065); + + movdqa xmm4,xmm1 ; xmm1=tmp13 + movdqa xmm2,xmm1 + punpcklwd xmm4,xmm6 ; xmm6=tmp12 + punpckhwd xmm2,xmm6 + movdqa xmm1,xmm4 + movdqa xmm6,xmm2 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=data2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F130_F054)] ; xmm2=data2H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=data6L + pmaddwd xmm6,[GOTOFF(ebx,PW_F054_MF130)] ; xmm6=data6H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm6,DESCALE_P2 + + packssdw xmm4,xmm2 ; xmm4=data2 + packssdw xmm1,xmm6 ; xmm1=data6 + + movdqa XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_DCTELEM)], xmm1 + + ; -- Odd part + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp6 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp7 + + movdqa xmm2,xmm0 ; xmm0=tmp4 + movdqa xmm6,xmm3 ; xmm3=tmp5 + paddw xmm2,xmm7 ; xmm2=z3 + paddw xmm6,xmm5 ; xmm6=z4 + 
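++ ; The punpcklwd/punpckhwd below interleave z3 and z4 word-by-word so
++ ; that each pmaddwd against a packed constant pair computes
++ ; z3*c0 + z4*c1 for four dword lanes in a single instruction.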
+ ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm4,xmm2 + movdqa xmm1,xmm2 + punpcklwd xmm4,xmm6 + punpckhwd xmm1,xmm6 + movdqa xmm2,xmm4 + movdqa xmm6,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF078_F117)] ; xmm4=z3L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF078_F117)] ; xmm1=z3H + pmaddwd xmm2,[GOTOFF(ebx,PW_F117_F078)] ; xmm2=z4L + pmaddwd xmm6,[GOTOFF(ebx,PW_F117_F078)] ; xmm6=z4H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=z3L + movdqa XMMWORD [wk(1)], xmm1 ; wk(1)=z3H + + ; (Original) + ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6; + ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869; + ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4; + ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4; + ; + ; (This implementation) + ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223; + ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447; + ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447); + ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223); + ; data7 = tmp4 + z3; data5 = tmp5 + z4; + ; data3 = tmp6 + z3; data1 = tmp7 + z4; + + movdqa xmm4,xmm0 + movdqa xmm1,xmm0 + punpcklwd xmm4,xmm5 + punpckhwd xmm1,xmm5 + movdqa xmm0,xmm4 + movdqa xmm5,xmm1 + pmaddwd xmm4,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm4=tmp4L + pmaddwd xmm1,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm1=tmp4H + pmaddwd xmm0,[GOTOFF(ebx,PW_MF089_F060)] ; xmm0=tmp7L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF089_F060)] ; xmm5=tmp7H + + paddd xmm4, XMMWORD [wk(0)] ; xmm4=data7L + paddd xmm1, XMMWORD [wk(1)] ; xmm1=data7H + paddd xmm0,xmm2 ; xmm0=data1L + paddd xmm5,xmm6 ; xmm5=data1H + + paddd xmm4,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm4,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm0,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm0,DESCALE_P2 + psrad xmm5,DESCALE_P2 + + packssdw xmm4,xmm1 ; xmm4=data7 + packssdw xmm0,xmm5 ; xmm0=data1 + + movdqa XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_DCTELEM)], xmm4 + movdqa XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_DCTELEM)], xmm0 + + movdqa xmm1,xmm3 + movdqa xmm5,xmm3 + punpcklwd xmm1,xmm7 + punpckhwd xmm5,xmm7 + movdqa xmm3,xmm1 + movdqa xmm7,xmm5 + pmaddwd xmm1,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm1=tmp5L + pmaddwd xmm5,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm5=tmp5H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF256_F050)] ; xmm3=tmp6L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF256_F050)] ; xmm7=tmp6H + + paddd xmm1,xmm2 ; xmm1=data5L + paddd xmm5,xmm6 ; xmm5=data5H + paddd xmm3, XMMWORD [wk(0)] ; xmm3=data3L + paddd xmm7, XMMWORD [wk(1)] ; xmm7=data3H + + paddd xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm1,DESCALE_P2 + psrad xmm5,DESCALE_P2 + paddd xmm3,[GOTOFF(ebx,PD_DESCALE_P2)] + paddd xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] + psrad xmm3,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm1,xmm5 ; xmm1=data5 + packssdw xmm3,xmm7 ; xmm3=data3 + + movdqa XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_DCTELEM)], xmm1 + movdqa XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_DCTELEM)], xmm3 + + ; pop edi ; unused + ; pop esi ; unused + ; pop edx ; need not be preserved + ; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%endif ; 
JFDCT_INT_SSE2_SUPPORTED -%endif ; DCT_ISLOW_SUPPORTED diff --cc simd/jiss2flt.asm index 0000000,c0565a3..6bb429a mode 000000,100644..100644 --- a/simd/jiss2flt.asm +++ b/simd/jiss2flt.asm @@@ -1,0 -1,508 +1,495 @@@ + ; + ; jiss2flt.asm - floating-point IDCT (SSE & SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; + ; This file contains a floating-point implementation of the inverse DCT + ; (Discrete Cosine Transform). The following code is based directly on + ; the IJG's original jidctflt.c; see the jidctflt.c for more details. + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jdct.inc" - -%ifdef DCT_FLOAT_SUPPORTED -%ifdef JIDCT_FLT_SSE_SSE2_SUPPORTED - -; This module is specialized to the case DCTSIZE = 8. -; -%if DCTSIZE != 8 -%error "Sorry, this code only copes with 8x8 DCTs." -%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + + %macro unpcklps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5) + shufps %1,%2,0x44 + %endmacro + + %macro unpckhps2 2 ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7) + shufps %1,%2,0xEE + %endmacro + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_float_sse2) + + EXTN(jconst_idct_float_sse2): + + PD_1_414 times 4 dd 1.414213562373095048801689 + PD_1_847 times 4 dd 1.847759065022573512256366 + PD_1_082 times 4 dd 1.082392200292393968799446 + PD_M2_613 times 4 dd -2.613125929752753055713286 + PD_RNDINT_MAGIC times 4 dd 100663296.0 ; (float)(0x00C00000 << 3) + PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Perform dequantization and inverse DCT on one block of coefficients. 
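++ ; Dequantization is folded into the column pass: each coefficient is
++ ; multiplied by the corresponding float entry of dct_table (the mulps
++ ; on SIZEOF_FLOAT_MULT_TYPE blocks below) before the IDCT butterflies.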
+ ; + ; GLOBAL(void) -; jpeg_idct_float_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, -; JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) ++; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block, ++; JSAMPARRAY output_buf, JDIMENSION output_col) + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define coef_block(b) (b)+16 ; JCOEFPTR coef_block -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define output_col(b) (b)+24 ; JDIMENSION output_col ++%define dct_table(b) (b)+8 ; void * dct_table ++%define coef_block(b) (b)+12 ; JCOEFPTR coef_block ++%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf ++%define output_col(b) (b)+20 ; JDIMENSION output_col + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 2 + %define workspace wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT + ; FAST_FLOAT workspace[DCTSIZE2] + + align 16 - global EXTN(jpeg_idct_float_sse2) ++ global EXTN(jsimd_idct_float_sse2) + -EXTN(jpeg_idct_float_sse2): ++EXTN(jsimd_idct_float_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [workspace] + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input, store into work array. + + ; mov eax, [original_ebp] - mov edx, POINTER [compptr(eax)] - mov edx, POINTER [jcompinfo_dct_table(edx)] ; quantptr ++ mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + lea edi, [workspace] ; FAST_FLOAT * wsptr + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 + .columnloop: + %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + - movq xmm1, _MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm2, _MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm3, _MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm4, _MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm5, _MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm6, _MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] - movq xmm7, _MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] ++ movq xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ++ movq xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] ++ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] ++ movq xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] ++ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] ++ movq xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] ++ movq xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm2 + por xmm3,xmm4 + por xmm5,xmm6 + por xmm1,xmm3 + por xmm5,xmm7 + por xmm1,xmm5 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + - movq xmm0, _MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ++ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm1,xmm0 + movaps xmm2,xmm0 + movaps xmm3,xmm0 + + shufps xmm0,xmm0,0x00 ; xmm0=(00 00 00 00) + shufps xmm1,xmm1,0x55 ; xmm1=(01 01 01 01) + shufps xmm2,xmm2,0xAA ; xmm2=(02 02 02 02) + shufps xmm3,xmm3,0xFF ; xmm3=(03 03 03 
03) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + jmp near .nextcolumn + alignx 16,7 + %endif + .columnDCT: + + ; -- Even part + - movq xmm0, _MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] - movq xmm1, _MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] - movq xmm2, _MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] - movq xmm3, _MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] ++ movq xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)] ++ movq xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)] ++ movq xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)] ++ movq xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpcklwd xmm1,xmm1 ; xmm1=(20 20 21 21 22 22 23 23) + psrad xmm0,(DWORD_BIT-WORD_BIT) ; xmm0=in0=(00 01 02 03) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in2=(20 21 22 23) + cvtdq2ps xmm0,xmm0 ; xmm0=in0=(00 01 02 03) + cvtdq2ps xmm1,xmm1 ; xmm1=in2=(20 21 22 23) + + punpcklwd xmm2,xmm2 ; xmm2=(40 40 41 41 42 42 43 43) + punpcklwd xmm3,xmm3 ; xmm3=(60 60 61 61 62 62 63 63) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in4=(40 41 42 43) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in6=(60 61 62 63) + cvtdq2ps xmm2,xmm2 ; xmm2=in4=(40 41 42 43) + cvtdq2ps xmm3,xmm3 ; xmm3=in6=(60 61 62 63) + + mulps xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + - movq xmm2, _MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] - movq xmm3, _MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] - movq xmm5, _MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] - movq xmm1, _MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] ++ movq xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)] ++ movq xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)] ++ movq xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)] ++ movq xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)] + + punpcklwd xmm2,xmm2 ; xmm2=(10 10 11 11 12 12 13 13) + punpcklwd xmm3,xmm3 ; xmm3=(30 30 31 31 32 32 33 33) + psrad xmm2,(DWORD_BIT-WORD_BIT) ; xmm2=in1=(10 11 12 13) + psrad xmm3,(DWORD_BIT-WORD_BIT) ; xmm3=in3=(30 31 32 33) + cvtdq2ps xmm2,xmm2 ; xmm2=in1=(10 11 12 13) + cvtdq2ps xmm3,xmm3 ; xmm3=in3=(30 31 32 33) + + punpcklwd xmm5,xmm5 ; xmm5=(50 50 51 51 52 52 53 53) + punpcklwd xmm1,xmm1 ; xmm1=(70 70 71 71 72 72 73 73) + psrad xmm5,(DWORD_BIT-WORD_BIT) ; xmm5=in5=(50 51 52 53) + psrad xmm1,(DWORD_BIT-WORD_BIT) ; xmm1=in7=(70 71 72 73) + cvtdq2ps xmm5,xmm5 ; xmm5=in5=(50 51 52 53) + cvtdq2ps xmm1,xmm1 ; xmm1=in7=(70 71 72 73) + + mulps xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + 
mulps xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + mulps xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 01 02 03) + addps xmm7,xmm3 ; xmm7=data1=(10 11 12 13) + subps xmm5,xmm1 ; xmm5=data7=(70 71 72 73) + subps xmm0,xmm3 ; xmm0=data6=(60 61 62 63) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,xmm6 ; transpose coefficients(phase 1) + unpcklps xmm6,xmm7 ; xmm6=(00 10 01 11) + unpckhps xmm1,xmm7 ; xmm1=(02 12 03 13) + movaps xmm3,xmm0 ; transpose coefficients(phase 1) + unpcklps xmm0,xmm5 ; xmm0=(60 70 61 71) + unpckhps xmm3,xmm5 ; xmm3=(62 72 63 73) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movaps xmm5, XMMWORD [wk(1)] ; xmm5=tmp3 + + movaps XMMWORD [wk(0)], xmm0 ; wk(0)=(60 70 61 71) + movaps XMMWORD [wk(1)], xmm3 ; wk(1)=(62 72 63 73) + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm0,xmm7 + movaps xmm3,xmm5 + addps xmm7,xmm2 ; xmm7=data2=(20 21 22 23) + addps xmm5,xmm4 ; xmm5=data4=(40 41 42 43) + subps xmm0,xmm2 ; xmm0=data5=(50 51 52 53) + subps xmm3,xmm4 ; xmm3=data3=(30 31 32 33) + + movaps xmm2,xmm7 ; transpose coefficients(phase 1) + unpcklps xmm7,xmm3 ; xmm7=(20 30 21 31) + unpckhps xmm2,xmm3 ; xmm2=(22 32 23 33) + movaps xmm4,xmm5 ; transpose coefficients(phase 1) + unpcklps xmm5,xmm0 ; xmm5=(40 50 41 51) + unpckhps xmm4,xmm0 ; xmm4=(42 52 43 53) + + movaps xmm3,xmm6 ; transpose coefficients(phase 2) + unpcklps2 xmm6,xmm7 ; xmm6=(00 10 20 30) + unpckhps2 xmm3,xmm7 ; xmm3=(01 11 21 31) + movaps xmm0,xmm1 ; transpose coefficients(phase 2) + unpcklps2 xmm1,xmm2 ; xmm1=(02 12 22 32) + unpckhps2 xmm0,xmm2 ; xmm0=(03 13 23 33) + + movaps xmm7, XMMWORD [wk(0)] ; xmm7=(60 70 61 71) + movaps xmm2, XMMWORD [wk(1)] ; xmm2=(62 72 63 73) + + movaps XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3 + movaps XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1 + movaps XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0 + + movaps xmm6,xmm5 ; transpose coefficients(phase 2) + unpcklps2 xmm5,xmm7 ; xmm5=(40 50 60 70) + unpckhps2 xmm6,xmm7 ; xmm6=(41 51 61 71) + movaps xmm3,xmm4 ; transpose coefficients(phase 2) + unpcklps2 xmm4,xmm2 ; xmm4=(42 52 62 72) + unpckhps2 xmm3,xmm2 ; xmm3=(43 53 63 73) + + movaps XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5 + movaps XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6 + movaps XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4 + movaps XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3 + + .nextcolumn: + add esi, byte 4*SIZEOF_JCOEF ; coef_block + add edx, byte 4*SIZEOF_FLOAT_MULT_TYPE ; quantptr + add edi, 4*DCTSIZE*SIZEOF_FAST_FLOAT ; wsptr + dec ecx ; ctr + jnz near .columnloop + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + 
(DCTSIZE2-8)*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + lea esi, [workspace] ; FAST_FLOAT * wsptr + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + mov ecx, DCTSIZE/4 ; ctr + alignx 16,7 + .rowloop: + + ; -- Even part + + movaps xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm0 + movaps xmm5,xmm1 + subps xmm0,xmm2 ; xmm0=tmp11 + subps xmm1,xmm3 + addps xmm4,xmm2 ; xmm4=tmp10 + addps xmm5,xmm3 ; xmm5=tmp13 + + mulps xmm1,[GOTOFF(ebx,PD_1_414)] + subps xmm1,xmm5 ; xmm1=tmp12 + + movaps xmm6,xmm4 + movaps xmm7,xmm0 + subps xmm4,xmm5 ; xmm4=tmp3 + subps xmm0,xmm1 ; xmm0=tmp2 + addps xmm6,xmm5 ; xmm6=tmp0 + addps xmm7,xmm1 ; xmm7=tmp1 + + movaps XMMWORD [wk(1)], xmm4 ; tmp3 + movaps XMMWORD [wk(0)], xmm0 ; tmp2 + + ; -- Odd part + + movaps xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)] + movaps xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)] + + movaps xmm4,xmm2 + movaps xmm0,xmm5 + addps xmm2,xmm1 ; xmm2=z11 + addps xmm5,xmm3 ; xmm5=z13 + subps xmm4,xmm1 ; xmm4=z12 + subps xmm0,xmm3 ; xmm0=z10 + + movaps xmm1,xmm2 + subps xmm2,xmm5 + addps xmm1,xmm5 ; xmm1=tmp7 + + mulps xmm2,[GOTOFF(ebx,PD_1_414)] ; xmm2=tmp11 + + movaps xmm3,xmm0 + addps xmm0,xmm4 + mulps xmm0,[GOTOFF(ebx,PD_1_847)] ; xmm0=z5 + mulps xmm3,[GOTOFF(ebx,PD_M2_613)] ; xmm3=(z10 * -2.613125930) + mulps xmm4,[GOTOFF(ebx,PD_1_082)] ; xmm4=(z12 * 1.082392200) + addps xmm3,xmm0 ; xmm3=tmp12 + subps xmm4,xmm0 ; xmm4=tmp10 + + ; -- Final output stage + + subps xmm3,xmm1 ; xmm3=tmp6 + movaps xmm5,xmm6 + movaps xmm0,xmm7 + addps xmm6,xmm1 ; xmm6=data0=(00 10 20 30) + addps xmm7,xmm3 ; xmm7=data1=(01 11 21 31) + subps xmm5,xmm1 ; xmm5=data7=(07 17 27 37) + subps xmm0,xmm3 ; xmm0=data6=(06 16 26 36) + subps xmm2,xmm3 ; xmm2=tmp5 + + movaps xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm1=[PD_RNDINT_MAGIC] + pcmpeqd xmm3,xmm3 + psrld xmm3,WORD_BIT ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm6,xmm1 ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **) + addps xmm7,xmm1 ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **) + addps xmm0,xmm1 ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **) + addps xmm5,xmm1 ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **) + + pand xmm6,xmm3 ; xmm6=(00 -- 10 -- 20 -- 30 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 01 -- 11 -- 21 -- 31) + pand xmm0,xmm3 ; xmm0=(06 -- 16 -- 26 -- 36 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 07 -- 17 -- 27 -- 37) + por xmm6,xmm7 ; xmm6=(00 01 10 11 20 21 30 31) + por xmm0,xmm5 ; xmm0=(06 07 16 17 26 27 36 37) + + movaps xmm1, XMMWORD [wk(0)] ; xmm1=tmp2 + movaps xmm3, XMMWORD [wk(1)] ; xmm3=tmp3 + + addps xmm4,xmm2 ; xmm4=tmp4 + movaps xmm7,xmm1 + movaps xmm5,xmm3 + addps xmm1,xmm2 ; xmm1=data2=(02 12 22 32) + addps xmm3,xmm4 ; xmm3=data4=(04 14 24 34) + subps xmm7,xmm2 ; xmm7=data5=(05 15 25 35) + subps xmm5,xmm4 ; xmm5=data3=(03 13 23 33) + + movaps xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)] ; xmm2=[PD_RNDINT_MAGIC] + pcmpeqd xmm4,xmm4 + psrld xmm4,WORD_BIT ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..} + + addps xmm3,xmm2 ; 
xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **) + addps xmm7,xmm2 ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **) + addps xmm1,xmm2 ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **) + addps xmm5,xmm2 ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **) + + pand xmm3,xmm4 ; xmm3=(04 -- 14 -- 24 -- 34 --) + pslld xmm7,WORD_BIT ; xmm7=(-- 05 -- 15 -- 25 -- 35) + pand xmm1,xmm4 ; xmm1=(02 -- 12 -- 22 -- 32 --) + pslld xmm5,WORD_BIT ; xmm5=(-- 03 -- 13 -- 23 -- 33) + por xmm3,xmm7 ; xmm3=(04 05 14 15 24 25 34 35) + por xmm1,xmm5 ; xmm1=(02 03 12 13 22 23 32 33) + + movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm6,xmm3 ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35) + packsswb xmm1,xmm0 ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37) + paddb xmm6,xmm2 + paddb xmm1,xmm2 + + movdqa xmm4,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm1 ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm1 ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + + movdqa xmm7,xmm6 ; transpose coefficients(phase 3) + punpckldq xmm6,xmm4 ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm7,xmm4 ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + + pshufd xmm5,xmm6,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm3,xmm7,0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + + pushpic ebx ; save GOT address + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq _MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ++ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7 + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 - movq _MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 ++ movq XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3 + + poppic ebx ; restore GOT address + + add esi, byte 4*SIZEOF_FAST_FLOAT ; wsptr + add edi, byte 4*SIZEOF_JSAMPROW + dec ecx ; ctr + jnz near .rowloop + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%endif ; JIDCT_FLT_SSE_SSE2_SUPPORTED -%endif ; DCT_FLOAT_SUPPORTED diff --cc simd/jiss2fst.asm index 0000000,937a260..aafa810 mode 000000,100644..100644 --- a/simd/jiss2fst.asm +++ b/simd/jiss2fst.asm @@@ -1,0 -1,512 +1,499 @@@ + ; + ; jiss2fst.asm - fast integer IDCT (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; + ; This file contains a fast, not so accurate integer implementation of + ; the inverse DCT (Discrete Cosine Transform). The following code is + ; based directly on the IJG's original jidctfst.c; see the jidctfst.c + ; for more details. 
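+ ; (A quick orientation, paraphrasing jidctfst.c: this is the scaled AA&N algorithm, in which most of the multiplies are folded into the dequantization table, so each 1-D pass below needs only a handful of pmulhw multiplies by the four PW_* constants defined further down in this file.)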
+ ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jdct.inc" - -%ifdef DCT_IFAST_SUPPORTED -%ifdef JIDCT_INT_SSE2_SUPPORTED - -; This module is specialized to the case DCTSIZE = 8. -; -%if DCTSIZE != 8 -%error "Sorry, this code only copes with 8x8 DCTs." -%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + + %define CONST_BITS 8 ; 14 is also OK. + %define PASS1_BITS 2 + + %if IFAST_SCALE_BITS != PASS1_BITS + %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'." + %endif + + %if CONST_BITS == 8 + F_1_082 equ 277 ; FIX(1.082392200) + F_1_414 equ 362 ; FIX(1.414213562) + F_1_847 equ 473 ; FIX(1.847759065) + F_2_613 equ 669 ; FIX(2.613125930) + F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1) + %else + ; NASM cannot do compile-time arithmetic on floating-point constants. + %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) + F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200) + F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562) + F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) + F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930) + F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1) + %endif + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow) + ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw) + + %define PRE_MULTIPLY_SCALE_BITS 2 + %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS) + + alignz 16 + global EXTN(jconst_idct_ifast_sse2) + + EXTN(jconst_idct_ifast_sse2): + + PW_F1414 times 8 dw F_1_414 << CONST_SHIFT + PW_F1847 times 8 dw F_1_847 << CONST_SHIFT + PW_MF1613 times 8 dw -F_1_613 << CONST_SHIFT + PW_F1082 times 8 dw F_1_082 << CONST_SHIFT + PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Perform dequantization and inverse DCT on one block of coefficients. + ; + ; GLOBAL(void) -; jpeg_idct_ifast_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, -; JCOEFPTR coef_block, ++; jsimd_idct_ifast_sse2 (void * dct_table, JCOEFPTR coef_block, + ; JSAMPARRAY output_buf, JDIMENSION output_col) + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define coef_block(b) (b)+16 ; JCOEFPTR coef_block -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define output_col(b) (b)+24 ; JDIMENSION output_col ++%define dct_table(b) (b)+8 ; void * dct_table ++%define coef_block(b) (b)+12 ; JCOEFPTR coef_block ++%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf ++%define output_col(b) (b)+20 ; JDIMENSION output_col + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 2 + + align 16 - global EXTN(jpeg_idct_ifast_sse2) ++ global EXTN(jsimd_idct_ifast_sse2) + -EXTN(jpeg_idct_ifast_sse2): ++EXTN(jsimd_idct_ifast_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx + ; push ecx ; unused + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input.
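+ ; (Overview, as can be read off the code below: each xmmword holds one 8-coefficient row, so this pass performs the 1-D IDCT on all 8 columns at once; the rows are first dequantized with pmullw against quantptr, and when rows 1-7 are entirely zero a DC-only shortcut broadcasts row 0 with pshufd instead of running the full butterfly.)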
+ + ; mov eax, [original_ebp] - mov edx, POINTER [compptr(eax)] - mov edx, POINTER [jcompinfo_dct_table(edx)] ; quantptr ++ mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + + %ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm7,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm7,xmm7 ; xmm7=(04 04 05 05 06 06 07 07) + + pshufd xmm6,xmm0,0x00 ; xmm6=col0=(00 00 00 00 00 00 00 00) + pshufd xmm2,xmm0,0x55 ; xmm2=col1=(01 01 01 01 01 01 01 01) + pshufd xmm5,xmm0,0xAA ; xmm5=col2=(02 02 02 02 02 02 02 02) + pshufd xmm0,xmm0,0xFF ; xmm0=col3=(03 03 03 03 03 03 03 03) + pshufd xmm1,xmm7,0x00 ; xmm1=col4=(04 04 04 04 04 04 04 04) + pshufd xmm4,xmm7,0x55 ; xmm4=col5=(05 05 05 05 05 05 05 05) + pshufd xmm3,xmm7,0xAA ; xmm3=col6=(06 06 06 06 06 06 06 06) + pshufd xmm7,xmm7,0xFF ; xmm7=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=col3 + jmp near .column_end + alignx 16,7 + %endif + .columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm1 + psubw xmm0,xmm2 ; xmm0=tmp11 + psubw xmm1,xmm3 + paddw xmm4,xmm2 ; xmm4=tmp10 + paddw xmm5,xmm3 ; xmm5=tmp13 + + psllw xmm1,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm1,[GOTOFF(ebx,PW_F1414)] + psubw xmm1,xmm5 ; xmm1=tmp12 + + movdqa xmm6,xmm4 + movdqa xmm7,xmm0 + psubw xmm4,xmm5 ; xmm4=tmp3 + psubw xmm0,xmm1 ; xmm0=tmp2 + paddw xmm6,xmm5 ; xmm6=tmp0 + paddw xmm7,xmm1 ; xmm7=tmp1 + + movdqa XMMWORD [wk(1)], xmm4 ; wk(1)=tmp3 + movdqa XMMWORD [wk(0)], xmm0 ; wk(0)=tmp2 + + ; -- Odd part + + movdqa xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)] + movdqa xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)] + + movdqa xmm4,xmm2 + movdqa xmm0,xmm5 + psubw xmm2,xmm1 ; xmm2=z12 + psubw xmm5,xmm3 ; xmm5=z10 + paddw xmm4,xmm1 ; xmm4=z11 + paddw xmm0,xmm3 ; xmm0=z13 + + movdqa xmm1,xmm5 ; 
xmm1=z10(unscaled) + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm3,xmm4 + psubw xmm4,xmm0 + paddw xmm3,xmm0 ; xmm3=tmp7 + + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm4,[GOTOFF(ebx,PW_F1414)] ; xmm4=tmp11 + + ; To avoid overflow... + ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm0,xmm5 + paddw xmm5,xmm2 + pmulhw xmm5,[GOTOFF(ebx,PW_F1847)] ; xmm5=z5 + pmulhw xmm0,[GOTOFF(ebx,PW_MF1613)] + pmulhw xmm2,[GOTOFF(ebx,PW_F1082)] + psubw xmm0,xmm1 + psubw xmm2,xmm5 ; xmm2=tmp10 + paddw xmm0,xmm5 ; xmm0=tmp12 + + ; -- Final output stage + + psubw xmm0,xmm3 ; xmm0=tmp6 + movdqa xmm1,xmm6 + movdqa xmm5,xmm7 + paddw xmm6,xmm3 ; xmm6=data0=(00 01 02 03 04 05 06 07) + paddw xmm7,xmm0 ; xmm7=data1=(10 11 12 13 14 15 16 17) + psubw xmm1,xmm3 ; xmm1=data7=(70 71 72 73 74 75 76 77) + psubw xmm5,xmm0 ; xmm5=data6=(60 61 62 63 64 65 66 67) + psubw xmm4,xmm0 ; xmm4=tmp5 + + movdqa xmm3,xmm6 ; transpose coefficients(phase 1) + punpcklwd xmm6,xmm7 ; xmm6=(00 10 01 11 02 12 03 13) + punpckhwd xmm3,xmm7 ; xmm3=(04 14 05 15 06 16 07 17) + movdqa xmm0,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm1 ; xmm5=(60 70 61 71 62 72 63 73) + punpckhwd xmm0,xmm1 ; xmm0=(64 74 65 75 66 76 67 77) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp2 + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=tmp3 + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(64 74 65 75 66 76 67 77) + + paddw xmm2,xmm4 ; xmm2=tmp4 + movdqa xmm5,xmm7 + movdqa xmm0,xmm1 + paddw xmm7,xmm4 ; xmm7=data2=(20 21 22 23 24 25 26 27) + paddw xmm1,xmm2 ; xmm1=data4=(40 41 42 43 44 45 46 47) + psubw xmm5,xmm4 ; xmm5=data5=(50 51 52 53 54 55 56 57) + psubw xmm0,xmm2 ; xmm0=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm0 ; xmm7=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm0 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm2,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm5 ; xmm1=(40 50 41 51 42 52 43 53) + punpckhwd xmm2,xmm5 ; xmm2=(44 54 45 55 46 56 47 57) + + movdqa xmm0,xmm3 ; transpose coefficients(phase 2) + punpckldq xmm3,xmm4 ; xmm3=(04 14 24 34 05 15 25 35) + punpckhdq xmm0,xmm4 ; xmm0=(06 16 26 36 07 17 27 37) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=(00 10 20 30 01 11 21 31) + punpckhdq xmm5,xmm7 ; xmm5=(02 12 22 32 03 13 23 33) + + movdqa xmm4, XMMWORD [wk(0)] ; xmm4=(60 70 61 71 62 72 63 73) + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(0)], xmm3 ; wk(0)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=(06 16 26 36 07 17 27 37) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm4 ; xmm1=(40 50 60 70 41 51 61 71) + punpckhdq xmm3,xmm4 ; xmm3=(42 52 62 72 43 53 63 73) + movdqa xmm0,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm7 ; xmm2=(44 54 64 74 45 55 65 75) + punpckhdq xmm0,xmm7 ; xmm0=(46 56 66 76 47 57 67 77) + + movdqa xmm4,xmm6 ; transpose coefficients(phase 3) + punpcklqdq xmm6,xmm1 ; xmm6=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm4,xmm1 ; xmm4=col1=(01 11 21 31 41 51 61 71) + movdqa xmm7,xmm5 ; transpose coefficients(phase 3) + punpcklqdq xmm5,xmm3 ; xmm5=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm7,xmm3 ; xmm7=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm1, XMMWORD [wk(0)] ; xmm1=(04 14 24 34 05 15 25 35) + 
movdqa xmm3, XMMWORD [wk(1)] ; xmm3=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=col1 + movdqa XMMWORD [wk(1)], xmm7 ; wk(1)=col3 + + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm4,xmm2 ; xmm4=col5=(05 15 25 35 45 55 65 75) + movdqa xmm7,xmm3 ; transpose coefficients(phase 3) + punpcklqdq xmm3,xmm0 ; xmm3=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm7,xmm0 ; xmm7=col7=(07 17 27 37 47 57 67 77) + .column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. + + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6 + + movdqa xmm2,xmm6 + movdqa xmm0,xmm5 + psubw xmm6,xmm1 ; xmm6=tmp11 + psubw xmm5,xmm3 + paddw xmm2,xmm1 ; xmm2=tmp10 + paddw xmm0,xmm3 ; xmm0=tmp13 + + psllw xmm5,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm5,[GOTOFF(ebx,PW_F1414)] + psubw xmm5,xmm0 ; xmm5=tmp12 + + movdqa xmm1,xmm2 + movdqa xmm3,xmm6 + psubw xmm2,xmm0 ; xmm2=tmp3 + psubw xmm6,xmm5 ; xmm6=tmp2 + paddw xmm1,xmm0 ; xmm1=tmp0 + paddw xmm3,xmm5 ; xmm3=tmp1 + + movdqa xmm0, XMMWORD [wk(0)] ; xmm0=col1 + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=col3 + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp3 + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=tmp2 + + ; -- Odd part + + ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7 + + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + psubw xmm0,xmm7 ; xmm0=z12 + psubw xmm4,xmm5 ; xmm4=z10 + paddw xmm2,xmm7 ; xmm2=z11 + paddw xmm6,xmm5 ; xmm6=z13 + + movdqa xmm7,xmm4 ; xmm7=z10(unscaled) + psllw xmm0,PRE_MULTIPLY_SCALE_BITS + psllw xmm4,PRE_MULTIPLY_SCALE_BITS + + movdqa xmm5,xmm2 + psubw xmm2,xmm6 + paddw xmm5,xmm6 ; xmm5=tmp7 + + psllw xmm2,PRE_MULTIPLY_SCALE_BITS + pmulhw xmm2,[GOTOFF(ebx,PW_F1414)] ; xmm2=tmp11 + + ; To avoid overflow... 
+ ; + ; (Original) + ; tmp12 = -2.613125930 * z10 + z5; + ; + ; (This implementation) + ; tmp12 = (-1.613125930 - 1) * z10 + z5; + ; = -1.613125930 * z10 - z10 + z5; + + movdqa xmm6,xmm4 + paddw xmm4,xmm0 + pmulhw xmm4,[GOTOFF(ebx,PW_F1847)] ; xmm4=z5 + pmulhw xmm6,[GOTOFF(ebx,PW_MF1613)] + pmulhw xmm0,[GOTOFF(ebx,PW_F1082)] + psubw xmm6,xmm7 + psubw xmm0,xmm4 ; xmm0=tmp10 + paddw xmm6,xmm4 ; xmm6=tmp12 + + ; -- Final output stage + + psubw xmm6,xmm5 ; xmm6=tmp6 + movdqa xmm7,xmm1 + movdqa xmm4,xmm3 + paddw xmm1,xmm5 ; xmm1=data0=(00 10 20 30 40 50 60 70) + paddw xmm3,xmm6 ; xmm3=data1=(01 11 21 31 41 51 61 71) + psraw xmm1,(PASS1_BITS+3) ; descale + psraw xmm3,(PASS1_BITS+3) ; descale + psubw xmm7,xmm5 ; xmm7=data7=(07 17 27 37 47 57 67 77) + psubw xmm4,xmm6 ; xmm4=data6=(06 16 26 36 46 56 66 76) + psraw xmm7,(PASS1_BITS+3) ; descale + psraw xmm4,(PASS1_BITS+3) ; descale + psubw xmm2,xmm6 ; xmm2=tmp5 + + packsswb xmm1,xmm4 ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm7 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm5, XMMWORD [wk(1)] ; xmm5=tmp2 + movdqa xmm6, XMMWORD [wk(0)] ; xmm6=tmp3 + + paddw xmm0,xmm2 ; xmm0=tmp4 + movdqa xmm4,xmm5 + movdqa xmm7,xmm6 + paddw xmm5,xmm2 ; xmm5=data2=(02 12 22 32 42 52 62 72) + paddw xmm6,xmm0 ; xmm6=data4=(04 14 24 34 44 54 64 74) + psraw xmm5,(PASS1_BITS+3) ; descale + psraw xmm6,(PASS1_BITS+3) ; descale + psubw xmm4,xmm2 ; xmm4=data5=(05 15 25 35 45 55 65 75) + psubw xmm7,xmm0 ; xmm7=data3=(03 13 23 33 43 53 63 73) + psraw xmm4,(PASS1_BITS+3) ; descale + psraw xmm7,(PASS1_BITS+3) ; descale + + movdqa xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm2=[PB_CENTERJSAMP] + + packsswb xmm5,xmm6 ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm7,xmm4 ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm1,xmm2 + paddb xmm3,xmm2 + paddb xmm5,xmm2 + paddb xmm7,xmm2 + + movdqa xmm0,xmm1 ; transpose coefficients(phase 1) + punpcklbw xmm1,xmm3 ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm3 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklbw xmm5,xmm7 ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm6,xmm7 ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm1 ; transpose coefficients(phase 2) + punpcklwd xmm1,xmm5 ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm5 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm2,xmm6 ; transpose coefficients(phase 2) + punpcklwd xmm6,xmm0 ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm2,xmm0 ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm3,xmm1 ; transpose coefficients(phase 3) + punpckldq xmm1,xmm6 ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm3,xmm6 ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm7,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm2 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm7,xmm2 ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm5,xmm1,0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm3,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm6,xmm4,0x4E ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm2,xmm7,0x4E ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW 
[edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7 + + mov edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%endif ; JIDCT_INT_SSE2_SUPPORTED -%endif ; DCT_IFAST_SUPPORTED diff --cc simd/jiss2int.asm index 0000000,b0e7109..4122c64 mode 000000,100644..100644 --- a/simd/jiss2int.asm +++ b/simd/jiss2int.asm @@@ -1,0 -1,869 +1,856 @@@ + ; + ; jiss2int.asm - accurate integer IDCT (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; + ; This file contains a slow-but-accurate integer implementation of the + ; inverse DCT (Discrete Cosine Transform). The following code is based + ; directly on the IJG's original jidctint.c; see the jidctint.c for + ; more details. + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jdct.inc" - -%ifdef DCT_ISLOW_SUPPORTED -%ifdef JIDCT_INT_SSE2_SUPPORTED - -; This module is specialized to the case DCTSIZE = 8. -; -%if DCTSIZE != 8 -%error "Sorry, this code only copes with 8x8 DCTs." -%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + + %define CONST_BITS 13 + %define PASS1_BITS 2 + + %define DESCALE_P1 (CONST_BITS-PASS1_BITS) + %define DESCALE_P2 (CONST_BITS+PASS1_BITS+3) + + %if CONST_BITS == 13 + F_0_298 equ 2446 ; FIX(0.298631336) + F_0_390 equ 3196 ; FIX(0.390180644) + F_0_541 equ 4433 ; FIX(0.541196100) + F_0_765 equ 6270 ; FIX(0.765366865) + F_0_899 equ 7373 ; FIX(0.899976223) + F_1_175 equ 9633 ; FIX(1.175875602) + F_1_501 equ 12299 ; FIX(1.501321110) + F_1_847 equ 15137 ; FIX(1.847759065) + F_1_961 equ 16069 ; FIX(1.961570560) + F_2_053 equ 16819 ; FIX(2.053119869) + F_2_562 equ 20995 ; FIX(2.562915447) + F_3_072 equ 25172 ; FIX(3.072711026) + %else + ; NASM cannot do compile-time arithmetic on floating-point constants. 
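+ ; (As a spot check of the fixed-point encoding used here: with CONST_BITS == 13, + ; DESCALE(1984016188,30-CONST_BITS) = (1984016188 + (1 << 16)) >> 17 = 15137, + ; which reproduces the precomputed F_1_847 above, since 1984016188 is + ; FIX(1.847759065) expressed with 30 fractional bits.)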
+ %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) + F_0_298 equ DESCALE( 320652955,30-CONST_BITS) ; FIX(0.298631336) + F_0_390 equ DESCALE( 418953276,30-CONST_BITS) ; FIX(0.390180644) + F_0_541 equ DESCALE( 581104887,30-CONST_BITS) ; FIX(0.541196100) + F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) + F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) + F_1_175 equ DESCALE(1262586813,30-CONST_BITS) ; FIX(1.175875602) + F_1_501 equ DESCALE(1612031267,30-CONST_BITS) ; FIX(1.501321110) + F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) + F_1_961 equ DESCALE(2106220350,30-CONST_BITS) ; FIX(1.961570560) + F_2_053 equ DESCALE(2204520673,30-CONST_BITS) ; FIX(2.053119869) + F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) + F_3_072 equ DESCALE(3299298341,30-CONST_BITS) ; FIX(3.072711026) + %endif + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_islow_sse2) + + EXTN(jconst_idct_islow_sse2): + + PW_F130_F054 times 4 dw (F_0_541+F_0_765), F_0_541 + PW_F054_MF130 times 4 dw F_0_541, (F_0_541-F_1_847) + PW_MF078_F117 times 4 dw (F_1_175-F_1_961), F_1_175 + PW_F117_F078 times 4 dw F_1_175, (F_1_175-F_0_390) + PW_MF060_MF089 times 4 dw (F_0_298-F_0_899),-F_0_899 + PW_MF089_F060 times 4 dw -F_0_899, (F_1_501-F_0_899) + PW_MF050_MF256 times 4 dw (F_2_053-F_2_562),-F_2_562 + PW_MF256_F050 times 4 dw -F_2_562, (F_3_072-F_2_562) + PD_DESCALE_P1 times 4 dd 1 << (DESCALE_P1-1) + PD_DESCALE_P2 times 4 dd 1 << (DESCALE_P2-1) + PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Perform dequantization and inverse DCT on one block of coefficients. + ; + ; GLOBAL(void) -; jpeg_idct_islow_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, -; JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) ++; jsimd_idct_islow_sse2 (void * dct_table, JCOEFPTR coef_block, ++; JSAMPARRAY output_buf, JDIMENSION output_col) + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define coef_block(b) (b)+16 ; JCOEFPTR coef_block -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define output_col(b) (b)+24 ; JDIMENSION output_col ++%define dct_table(b) (b)+8 ; void * dct_table ++%define coef_block(b) (b)+12 ; JCOEFPTR coef_block ++%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf ++%define output_col(b) (b)+20 ; JDIMENSION output_col + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 12 + + align 16 - global EXTN(jpeg_idct_islow_sse2) ++ global EXTN(jsimd_idct_islow_sse2) + -EXTN(jpeg_idct_islow_sse2): ++EXTN(jsimd_idct_islow_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx + ; push ecx ; unused + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input.
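+ ; (Unlike the ifast routine, the butterflies here are done with pmaddwd on word pairs interleaved by punpcklwd/punpckhwd, so each pair of products lands in a single 32-bit lane; the extra precision is kept in the wk() scratch area until the final psrad by DESCALE_P1.)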
+ + ; mov eax, [original_ebp] - mov edx, POINTER [compptr(eax)] - mov edx, POINTER [jcompinfo_dct_table(edx)] ; quantptr ++ mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + + %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz near .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm1,xmm0 + packsswb xmm1,xmm1 + packsswb xmm1,xmm1 + movd eax,xmm1 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm5, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm5, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm5,PASS1_BITS + + movdqa xmm4,xmm5 ; xmm5=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm5,xmm5 ; xmm5=(00 00 01 01 02 02 03 03) + punpckhwd xmm4,xmm4 ; xmm4=(04 04 05 05 06 06 07 07) + + pshufd xmm7,xmm5,0x00 ; xmm7=col0=(00 00 00 00 00 00 00 00) + pshufd xmm6,xmm5,0x55 ; xmm6=col1=(01 01 01 01 01 01 01 01) + pshufd xmm1,xmm5,0xAA ; xmm1=col2=(02 02 02 02 02 02 02 02) + pshufd xmm5,xmm5,0xFF ; xmm5=col3=(03 03 03 03 03 03 03 03) + pshufd xmm0,xmm4,0x00 ; xmm0=col4=(04 04 04 04 04 04 04 04) + pshufd xmm3,xmm4,0x55 ; xmm3=col5=(05 05 05 05 05 05 05 05) + pshufd xmm2,xmm4,0xAA ; xmm2=col6=(06 06 06 06 06 06 06 06) + pshufd xmm4,xmm4,0xFF ; xmm4=col7=(07 07 07 07 07 07 07 07) + + movdqa XMMWORD [wk(8)], xmm6 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm5 ; wk(9)=col3 + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + jmp near .column_end + alignx 16,7 + %endif + .columnDCT: + + ; -- Even part + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm4,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm4,xmm3 ; xmm3=in6=z3 + punpckhwd xmm5,xmm3 + movdqa xmm1,xmm4 + movdqa xmm3,xmm5 + pmaddwd xmm4,[GOTOFF(ebx,PW_F130_F054)] ; xmm4=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm3,[GOTOFF(ebx,PW_F054_MF130)] ; xmm3=tmp2H + + movdqa xmm6,xmm0 + paddw xmm0,xmm2 ; xmm0=in0+in4 + psubw xmm6,xmm2 ; xmm6=in0-in4 + + pxor xmm7,xmm7 + pxor xmm2,xmm2 + punpcklwd xmm7,xmm0 ; xmm7=tmp0L + punpckhwd xmm2,xmm0 ; xmm2=tmp0H + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + psrad xmm2,(16-CONST_BITS) ; psrad xmm2,16 & pslld xmm2,CONST_BITS + + movdqa xmm0,xmm7 + paddd xmm7,xmm4 ; xmm7=tmp10L + psubd xmm0,xmm4 ; xmm0=tmp13L + movdqa xmm4,xmm2 + paddd 
xmm2,xmm5 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp13H + + movdqa XMMWORD [wk(0)], xmm7 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm2 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm0 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm4 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm7,xmm7 + punpcklwd xmm5,xmm6 ; xmm5=tmp1L + punpckhwd xmm7,xmm6 ; xmm7=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm7,(16-CONST_BITS) ; psrad xmm7,16 & pslld xmm7,CONST_BITS + + movdqa xmm2,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm2,xmm1 ; xmm2=tmp12L + movdqa xmm0,xmm7 + paddd xmm7,xmm3 ; xmm7=tmp11H + psubd xmm0,xmm3 ; xmm0=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm7 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm0 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm4, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm6, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm6, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm1, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm5,xmm6 + movdqa xmm7,xmm4 + paddw xmm5,xmm3 ; xmm5=z3 + paddw xmm7,xmm1 ; xmm7=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm2,xmm5 + movdqa xmm0,xmm5 + punpcklwd xmm2,xmm7 + punpckhwd xmm0,xmm7 + movdqa xmm5,xmm2 + movdqa xmm7,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF078_F117)] ; xmm2=z3L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm7,[GOTOFF(ebx,PW_F117_F078)] ; xmm7=z4H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm2,xmm3 + movdqa xmm0,xmm3 + punpcklwd xmm2,xmm4 + punpckhwd xmm0,xmm4 + movdqa xmm3,xmm2 + movdqa xmm4,xmm0 + pmaddwd xmm2,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm2=tmp0L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0H + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3L + pmaddwd xmm4,[GOTOFF(ebx,PW_MF089_F060)] ; xmm4=tmp3H + + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp0L + paddd xmm0, XMMWORD [wk(11)] ; xmm0=tmp0H + paddd xmm3,xmm5 ; xmm3=tmp3L + paddd xmm4,xmm7 ; xmm4=tmp3H + + movdqa XMMWORD [wk(8)], xmm2 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm0 ; wk(9)=tmp0H + + movdqa xmm2,xmm1 + movdqa xmm0,xmm1 + punpcklwd xmm2,xmm6 + punpckhwd xmm0,xmm6 + movdqa xmm1,xmm2 + movdqa xmm6,xmm0 + pmaddwd 
xmm2,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm2=tmp1L + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF256_F050)] ; xmm1=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm2,xmm5 ; xmm2=tmp1L + paddd xmm0,xmm7 ; xmm0=tmp1H + paddd xmm1, XMMWORD [wk(10)] ; xmm1=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm2 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm0 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm7, XMMWORD [wk(1)] ; xmm7=tmp10H + + movdqa xmm2,xmm5 + movdqa xmm0,xmm7 + paddd xmm5,xmm3 ; xmm5=data0L + paddd xmm7,xmm4 ; xmm7=data0H + psubd xmm2,xmm3 ; xmm2=data7L + psubd xmm0,xmm4 ; xmm0=data7H + + movdqa xmm3,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm3=[PD_DESCALE_P1] + + paddd xmm5,xmm3 + paddd xmm7,xmm3 + psrad xmm5,DESCALE_P1 + psrad xmm7,DESCALE_P1 + paddd xmm2,xmm3 + paddd xmm0,xmm3 + psrad xmm2,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm5,xmm7 ; xmm5=data0=(00 01 02 03 04 05 06 07) + packssdw xmm2,xmm0 ; xmm2=data7=(70 71 72 73 74 75 76 77) + + movdqa xmm4, XMMWORD [wk(4)] ; xmm4=tmp11L + movdqa xmm3, XMMWORD [wk(5)] ; xmm3=tmp11H + + movdqa xmm7,xmm4 + movdqa xmm0,xmm3 + paddd xmm4,xmm1 ; xmm4=data1L + paddd xmm3,xmm6 ; xmm3=data1H + psubd xmm7,xmm1 ; xmm7=data6L + psubd xmm0,xmm6 ; xmm0=data6H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm1=[PD_DESCALE_P1] + + paddd xmm4,xmm1 + paddd xmm3,xmm1 + psrad xmm4,DESCALE_P1 + psrad xmm3,DESCALE_P1 + paddd xmm7,xmm1 + paddd xmm0,xmm1 + psrad xmm7,DESCALE_P1 + psrad xmm0,DESCALE_P1 + + packssdw xmm4,xmm3 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm7,xmm0 ; xmm7=data6=(60 61 62 63 64 65 66 67) + + movdqa xmm6,xmm5 ; transpose coefficients(phase 1) + punpcklwd xmm5,xmm4 ; xmm5=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm1,xmm7 ; transpose coefficients(phase 1) + punpcklwd xmm7,xmm2 ; xmm7=(60 70 61 71 62 72 63 73) + punpckhwd xmm1,xmm2 ; xmm1=(64 74 65 75 66 76 67 77) + + movdqa xmm3, XMMWORD [wk(6)] ; xmm3=tmp12L + movdqa xmm0, XMMWORD [wk(7)] ; xmm0=tmp12H + movdqa xmm4, XMMWORD [wk(10)] ; xmm4=tmp1L + movdqa xmm2, XMMWORD [wk(11)] ; xmm2=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 01 11 02 12 03 13) + movdqa XMMWORD [wk(1)], xmm6 ; wk(1)=(04 14 05 15 06 16 07 17) + movdqa XMMWORD [wk(4)], xmm7 ; wk(4)=(60 70 61 71 62 72 63 73) + movdqa XMMWORD [wk(5)], xmm1 ; wk(5)=(64 74 65 75 66 76 67 77) + + movdqa xmm5,xmm3 + movdqa xmm6,xmm0 + paddd xmm3,xmm4 ; xmm3=data2L + paddd xmm0,xmm2 ; xmm0=data2H + psubd xmm5,xmm4 ; xmm5=data5L + psubd xmm6,xmm2 ; xmm6=data5H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm7=[PD_DESCALE_P1] + + paddd xmm3,xmm7 + paddd xmm0,xmm7 + psrad xmm3,DESCALE_P1 + psrad xmm0,DESCALE_P1 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + psrad xmm5,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm3,xmm0 ; xmm3=data2=(20 21 22 23 24 25 26 27) + packssdw xmm5,xmm6 ; xmm5=data5=(50 51 52 53 54 55 56 57) + + movdqa xmm1, XMMWORD [wk(2)] ; xmm1=tmp13L + movdqa xmm4, XMMWORD [wk(3)] ; xmm4=tmp13H + movdqa xmm2, XMMWORD [wk(8)] ; xmm2=tmp0L + movdqa xmm7, XMMWORD [wk(9)] ; xmm7=tmp0H + + movdqa xmm0,xmm1 + movdqa xmm6,xmm4 + paddd xmm1,xmm2 ; xmm1=data3L + paddd xmm4,xmm7 ; xmm4=data3H + psubd xmm0,xmm2 ; xmm0=data4L + psubd xmm6,xmm7 ; xmm6=data4H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1)] ; xmm2=[PD_DESCALE_P1] + + paddd xmm1,xmm2 + paddd xmm4,xmm2 + psrad xmm1,DESCALE_P1 + psrad xmm4,DESCALE_P1 + paddd xmm0,xmm2 + 
paddd xmm6,xmm2 + psrad xmm0,DESCALE_P1 + psrad xmm6,DESCALE_P1 + + packssdw xmm1,xmm4 ; xmm1=data3=(30 31 32 33 34 35 36 37) + packssdw xmm0,xmm6 ; xmm0=data4=(40 41 42 43 44 45 46 47) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 01 11 02 12 03 13) + movdqa xmm2, XMMWORD [wk(1)] ; xmm2=(04 14 05 15 06 16 07 17) + + movdqa xmm4,xmm3 ; transpose coefficients(phase 1) + punpcklwd xmm3,xmm1 ; xmm3=(20 30 21 31 22 32 23 33) + punpckhwd xmm4,xmm1 ; xmm4=(24 34 25 35 26 36 27 37) + movdqa xmm6,xmm0 ; transpose coefficients(phase 1) + punpcklwd xmm0,xmm5 ; xmm0=(40 50 41 51 42 52 43 53) + punpckhwd xmm6,xmm5 ; xmm6=(44 54 45 55 46 56 47 57) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 2) + punpckldq xmm7,xmm3 ; xmm7=(00 10 20 30 01 11 21 31) + punpckhdq xmm1,xmm3 ; xmm1=(02 12 22 32 03 13 23 33) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpckldq xmm2,xmm4 ; xmm2=(04 14 24 34 05 15 25 35) + punpckhdq xmm5,xmm4 ; xmm5=(06 16 26 36 07 17 27 37) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=(60 70 61 71 62 72 63 73) + movdqa xmm4, XMMWORD [wk(5)] ; xmm4=(64 74 65 75 66 76 67 77) + + movdqa XMMWORD [wk(6)], xmm2 ; wk(6)=(04 14 24 34 05 15 25 35) + movdqa XMMWORD [wk(7)], xmm5 ; wk(7)=(06 16 26 36 07 17 27 37) + + movdqa xmm2,xmm0 ; transpose coefficients(phase 2) + punpckldq xmm0,xmm3 ; xmm0=(40 50 60 70 41 51 61 71) + punpckhdq xmm2,xmm3 ; xmm2=(42 52 62 72 43 53 63 73) + movdqa xmm5,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm4 ; xmm6=(44 54 64 74 45 55 65 75) + punpckhdq xmm5,xmm4 ; xmm5=(46 56 66 76 47 57 67 77) + + movdqa xmm3,xmm7 ; transpose coefficients(phase 3) + punpcklqdq xmm7,xmm0 ; xmm7=col0=(00 10 20 30 40 50 60 70) + punpckhqdq xmm3,xmm0 ; xmm3=col1=(01 11 21 31 41 51 61 71) + movdqa xmm4,xmm1 ; transpose coefficients(phase 3) + punpcklqdq xmm1,xmm2 ; xmm1=col2=(02 12 22 32 42 52 62 72) + punpckhqdq xmm4,xmm2 ; xmm4=col3=(03 13 23 33 43 53 63 73) + + movdqa xmm0, XMMWORD [wk(6)] ; xmm0=(04 14 24 34 05 15 25 35) + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=(06 16 26 36 07 17 27 37) + + movdqa XMMWORD [wk(8)], xmm3 ; wk(8)=col1 + movdqa XMMWORD [wk(9)], xmm4 ; wk(9)=col3 + + movdqa xmm3,xmm0 ; transpose coefficients(phase 3) + punpcklqdq xmm0,xmm6 ; xmm0=col4=(04 14 24 34 44 54 64 74) + punpckhqdq xmm3,xmm6 ; xmm3=col5=(05 15 25 35 45 55 65 75) + movdqa xmm4,xmm2 ; transpose coefficients(phase 3) + punpcklqdq xmm2,xmm5 ; xmm2=col6=(06 16 26 36 46 56 66 76) + punpckhqdq xmm4,xmm5 ; xmm4=col7=(07 17 27 37 47 57 67 77) + + movdqa XMMWORD [wk(10)], xmm3 ; wk(10)=col5 + movdqa XMMWORD [wk(11)], xmm4 ; wk(11)=col7 + .column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows from work array, store into output array. 
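+ ; (Pass 2 consumes the transposed columns left in registers and wk(); its larger shift, DESCALE_P2 = CONST_BITS+PASS1_BITS+3, removes both the PASS1_BITS scaling carried over from pass 1 and the final divide-by-8 of the 2-D IDCT, as in jidctint.c.)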
+ + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6 + + ; (Original) + ; z1 = (z2 + z3) * 0.541196100; + ; tmp2 = z1 + z3 * -1.847759065; + ; tmp3 = z1 + z2 * 0.765366865; + ; + ; (This implementation) + ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); + ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; + + movdqa xmm6,xmm1 ; xmm1=in2=z2 + movdqa xmm5,xmm1 + punpcklwd xmm6,xmm2 ; xmm2=in6=z3 + punpckhwd xmm5,xmm2 + movdqa xmm1,xmm6 + movdqa xmm2,xmm5 + pmaddwd xmm6,[GOTOFF(ebx,PW_F130_F054)] ; xmm6=tmp3L + pmaddwd xmm5,[GOTOFF(ebx,PW_F130_F054)] ; xmm5=tmp3H + pmaddwd xmm1,[GOTOFF(ebx,PW_F054_MF130)] ; xmm1=tmp2L + pmaddwd xmm2,[GOTOFF(ebx,PW_F054_MF130)] ; xmm2=tmp2H + + movdqa xmm3,xmm7 + paddw xmm7,xmm0 ; xmm7=in0+in4 + psubw xmm3,xmm0 ; xmm3=in0-in4 + + pxor xmm4,xmm4 + pxor xmm0,xmm0 + punpcklwd xmm4,xmm7 ; xmm4=tmp0L + punpckhwd xmm0,xmm7 ; xmm0=tmp0H + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + psrad xmm0,(16-CONST_BITS) ; psrad xmm0,16 & pslld xmm0,CONST_BITS + + movdqa xmm7,xmm4 + paddd xmm4,xmm6 ; xmm4=tmp10L + psubd xmm7,xmm6 ; xmm7=tmp13L + movdqa xmm6,xmm0 + paddd xmm0,xmm5 ; xmm0=tmp10H + psubd xmm6,xmm5 ; xmm6=tmp13H + + movdqa XMMWORD [wk(0)], xmm4 ; wk(0)=tmp10L + movdqa XMMWORD [wk(1)], xmm0 ; wk(1)=tmp10H + movdqa XMMWORD [wk(2)], xmm7 ; wk(2)=tmp13L + movdqa XMMWORD [wk(3)], xmm6 ; wk(3)=tmp13H + + pxor xmm5,xmm5 + pxor xmm4,xmm4 + punpcklwd xmm5,xmm3 ; xmm5=tmp1L + punpckhwd xmm4,xmm3 ; xmm4=tmp1H + psrad xmm5,(16-CONST_BITS) ; psrad xmm5,16 & pslld xmm5,CONST_BITS + psrad xmm4,(16-CONST_BITS) ; psrad xmm4,16 & pslld xmm4,CONST_BITS + + movdqa xmm0,xmm5 + paddd xmm5,xmm1 ; xmm5=tmp11L + psubd xmm0,xmm1 ; xmm0=tmp12L + movdqa xmm7,xmm4 + paddd xmm4,xmm2 ; xmm4=tmp11H + psubd xmm7,xmm2 ; xmm7=tmp12H + + movdqa XMMWORD [wk(4)], xmm5 ; wk(4)=tmp11L + movdqa XMMWORD [wk(5)], xmm4 ; wk(5)=tmp11H + movdqa XMMWORD [wk(6)], xmm0 ; wk(6)=tmp12L + movdqa XMMWORD [wk(7)], xmm7 ; wk(7)=tmp12H + + ; -- Odd part + + movdqa xmm6, XMMWORD [wk(9)] ; xmm6=col3 + movdqa xmm3, XMMWORD [wk(8)] ; xmm3=col1 + movdqa xmm1, XMMWORD [wk(11)] ; xmm1=col7 + movdqa xmm2, XMMWORD [wk(10)] ; xmm2=col5 + + movdqa xmm5,xmm6 + movdqa xmm4,xmm3 + paddw xmm5,xmm1 ; xmm5=z3 + paddw xmm4,xmm2 ; xmm4=z4 + + ; (Original) + ; z5 = (z3 + z4) * 1.175875602; + ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644; + ; z3 += z5; z4 += z5; + ; + ; (This implementation) + ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; + ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); + + movdqa xmm0,xmm5 + movdqa xmm7,xmm5 + punpcklwd xmm0,xmm4 + punpckhwd xmm7,xmm4 + movdqa xmm5,xmm0 + movdqa xmm4,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF078_F117)] ; xmm0=z3L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF078_F117)] ; xmm7=z3H + pmaddwd xmm5,[GOTOFF(ebx,PW_F117_F078)] ; xmm5=z4L + pmaddwd xmm4,[GOTOFF(ebx,PW_F117_F078)] ; xmm4=z4H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=z3L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=z3H + + ; (Original) + ; z1 = tmp0 + tmp3; z2 = tmp1 + tmp2; + ; tmp0 = tmp0 * 0.298631336; tmp1 = tmp1 * 2.053119869; + ; tmp2 = tmp2 * 3.072711026; tmp3 = tmp3 * 1.501321110; + ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447; + ; tmp0 += z1 + z3; tmp1 += z2 + z4; + ; tmp2 += z2 + z3; tmp3 += z1 + z4; + ; + ; (This implementation) + ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; + ; tmp1 = tmp1 * (2.053119869 
- 2.562915447) + tmp2 * -2.562915447; + ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); + ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); + ; tmp0 += z3; tmp1 += z4; + ; tmp2 += z3; tmp3 += z4; + + movdqa xmm0,xmm1 + movdqa xmm7,xmm1 + punpcklwd xmm0,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm1,xmm0 + movdqa xmm3,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm0=tmp0L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF089)] ; xmm7=tmp0H + pmaddwd xmm1,[GOTOFF(ebx,PW_MF089_F060)] ; xmm1=tmp3L + pmaddwd xmm3,[GOTOFF(ebx,PW_MF089_F060)] ; xmm3=tmp3H + + paddd xmm0, XMMWORD [wk(10)] ; xmm0=tmp0L + paddd xmm7, XMMWORD [wk(11)] ; xmm7=tmp0H + paddd xmm1,xmm5 ; xmm1=tmp3L + paddd xmm3,xmm4 ; xmm3=tmp3H + + movdqa XMMWORD [wk(8)], xmm0 ; wk(8)=tmp0L + movdqa XMMWORD [wk(9)], xmm7 ; wk(9)=tmp0H + + movdqa xmm0,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm0,xmm6 + punpckhwd xmm7,xmm6 + movdqa xmm2,xmm0 + movdqa xmm6,xmm7 + pmaddwd xmm0,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm0=tmp1L + pmaddwd xmm7,[GOTOFF(ebx,PW_MF050_MF256)] ; xmm7=tmp1H + pmaddwd xmm2,[GOTOFF(ebx,PW_MF256_F050)] ; xmm2=tmp2L + pmaddwd xmm6,[GOTOFF(ebx,PW_MF256_F050)] ; xmm6=tmp2H + + paddd xmm0,xmm5 ; xmm0=tmp1L + paddd xmm7,xmm4 ; xmm7=tmp1H + paddd xmm2, XMMWORD [wk(10)] ; xmm2=tmp2L + paddd xmm6, XMMWORD [wk(11)] ; xmm6=tmp2H + + movdqa XMMWORD [wk(10)], xmm0 ; wk(10)=tmp1L + movdqa XMMWORD [wk(11)], xmm7 ; wk(11)=tmp1H + + ; -- Final output stage + + movdqa xmm5, XMMWORD [wk(0)] ; xmm5=tmp10L + movdqa xmm4, XMMWORD [wk(1)] ; xmm4=tmp10H + + movdqa xmm0,xmm5 + movdqa xmm7,xmm4 + paddd xmm5,xmm1 ; xmm5=data0L + paddd xmm4,xmm3 ; xmm4=data0H + psubd xmm0,xmm1 ; xmm0=data7L + psubd xmm7,xmm3 ; xmm7=data7H + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm1=[PD_DESCALE_P2] + + paddd xmm5,xmm1 + paddd xmm4,xmm1 + psrad xmm5,DESCALE_P2 + psrad xmm4,DESCALE_P2 + paddd xmm0,xmm1 + paddd xmm7,xmm1 + psrad xmm0,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm5,xmm4 ; xmm5=data0=(00 10 20 30 40 50 60 70) + packssdw xmm0,xmm7 ; xmm0=data7=(07 17 27 37 47 57 67 77) + + movdqa xmm3, XMMWORD [wk(4)] ; xmm3=tmp11L + movdqa xmm1, XMMWORD [wk(5)] ; xmm1=tmp11H + + movdqa xmm4,xmm3 + movdqa xmm7,xmm1 + paddd xmm3,xmm2 ; xmm3=data1L + paddd xmm1,xmm6 ; xmm1=data1H + psubd xmm4,xmm2 ; xmm4=data6L + psubd xmm7,xmm6 ; xmm7=data6H + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm2=[PD_DESCALE_P2] + + paddd xmm3,xmm2 + paddd xmm1,xmm2 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm4,xmm2 + paddd xmm7,xmm2 + psrad xmm4,DESCALE_P2 + psrad xmm7,DESCALE_P2 + + packssdw xmm3,xmm1 ; xmm3=data1=(01 11 21 31 41 51 61 71) + packssdw xmm4,xmm7 ; xmm4=data6=(06 16 26 36 46 56 66 76) + + packsswb xmm5,xmm4 ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + packsswb xmm3,xmm0 ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm6, XMMWORD [wk(6)] ; xmm6=tmp12L + movdqa xmm2, XMMWORD [wk(7)] ; xmm2=tmp12H + movdqa xmm1, XMMWORD [wk(10)] ; xmm1=tmp1L + movdqa xmm7, XMMWORD [wk(11)] ; xmm7=tmp1H + + movdqa XMMWORD [wk(0)], xmm5 ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + movdqa xmm4,xmm6 + movdqa xmm0,xmm2 + paddd xmm6,xmm1 ; xmm6=data2L + paddd xmm2,xmm7 ; xmm2=data2H + psubd xmm4,xmm1 ; xmm4=data5L + psubd xmm0,xmm7 ; xmm0=data5H + + movdqa xmm5,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm5=[PD_DESCALE_P2] + + paddd xmm6,xmm5 + paddd xmm2,xmm5 + psrad xmm6,DESCALE_P2 + psrad xmm2,DESCALE_P2 + paddd xmm4,xmm5 + paddd 
xmm0,xmm5 + psrad xmm4,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + packssdw xmm6,xmm2 ; xmm6=data2=(02 12 22 32 42 52 62 72) + packssdw xmm4,xmm0 ; xmm4=data5=(05 15 25 35 45 55 65 75) + + movdqa xmm3, XMMWORD [wk(2)] ; xmm3=tmp13L + movdqa xmm1, XMMWORD [wk(3)] ; xmm1=tmp13H + movdqa xmm7, XMMWORD [wk(8)] ; xmm7=tmp0L + movdqa xmm5, XMMWORD [wk(9)] ; xmm5=tmp0H + + movdqa xmm2,xmm3 + movdqa xmm0,xmm1 + paddd xmm3,xmm7 ; xmm3=data3L + paddd xmm1,xmm5 ; xmm1=data3H + psubd xmm2,xmm7 ; xmm2=data4L + psubd xmm0,xmm5 ; xmm0=data4H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P2)] ; xmm7=[PD_DESCALE_P2] + + paddd xmm3,xmm7 + paddd xmm1,xmm7 + psrad xmm3,DESCALE_P2 + psrad xmm1,DESCALE_P2 + paddd xmm2,xmm7 + paddd xmm0,xmm7 + psrad xmm2,DESCALE_P2 + psrad xmm0,DESCALE_P2 + + movdqa xmm5,[GOTOFF(ebx,PB_CENTERJSAMP)] ; xmm5=[PB_CENTERJSAMP] + + packssdw xmm3,xmm1 ; xmm3=data3=(03 13 23 33 43 53 63 73) + packssdw xmm2,xmm0 ; xmm2=data4=(04 14 24 34 44 54 64 74) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76) + movdqa xmm1, XMMWORD [wk(1)] ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77) + + packsswb xmm6,xmm2 ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74) + packsswb xmm3,xmm4 ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75) + + paddb xmm7,xmm5 + paddb xmm1,xmm5 + paddb xmm6,xmm5 + paddb xmm3,xmm5 + + movdqa xmm0,xmm7 ; transpose coefficients(phase 1) + punpcklbw xmm7,xmm1 ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71) + punpckhbw xmm0,xmm1 ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77) + movdqa xmm2,xmm6 ; transpose coefficients(phase 1) + punpcklbw xmm6,xmm3 ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73) + punpckhbw xmm2,xmm3 ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75) + + movdqa xmm4,xmm7 ; transpose coefficients(phase 2) + punpcklwd xmm7,xmm6 ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33) + punpckhwd xmm4,xmm6 ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73) + movdqa xmm5,xmm2 ; transpose coefficients(phase 2) + punpcklwd xmm2,xmm0 ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37) + punpckhwd xmm5,xmm0 ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77) + + movdqa xmm1,xmm7 ; transpose coefficients(phase 3) + punpckldq xmm7,xmm2 ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) + punpckhdq xmm1,xmm2 ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37) + movdqa xmm3,xmm4 ; transpose coefficients(phase 3) + punpckldq xmm4,xmm5 ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57) + punpckhdq xmm3,xmm5 ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77) + + pshufd xmm6,xmm7,0x4E ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07) + pshufd xmm0,xmm1,0x4E ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27) + pshufd xmm2,xmm4,0x4E ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47) + pshufd xmm5,xmm3,0x4E ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm7 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm1 + mov edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + mov edx, JSAMPROW 
[edi+1*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0 + mov edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW] - movq _MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 - movq _MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 ++ movq XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm2 ++ movq XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm5 + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + -%endif ; JIDCT_INT_SSE2_SUPPORTED -%endif ; DCT_ISLOW_SUPPORTED diff --cc simd/jiss2red.asm index 0000000,53af6fe..b31346a mode 000000,100644..100644 --- a/simd/jiss2red.asm +++ b/simd/jiss2red.asm @@@ -1,0 -1,607 +1,591 @@@ + ; + ; jiss2red.asm - reduced-size IDCT (SSE2) + ; ++; Copyright 2009 Pierre Ossman for Cendio AB ++; ++; Based on + ; x86 SIMD extension for IJG JPEG library + ; Copyright (C) 1999-2006, MIYASAKA Masaru. + ; For conditions of distribution and use, see copyright notice in jsimdext.inc + ; + ; This file should be assembled with NASM (Netwide Assembler), + ; can *not* be assembled with Microsoft's MASM or any compatible + ; assembler (including Borland's Turbo Assembler). + ; NASM is available from http://nasm.sourceforge.net/ or + ; http://sourceforge.net/project/showfiles.php?group_id=6208 + ; + ; This file contains inverse-DCT routines that produce reduced-size + ; output: either 4x4 or 2x2 pixels from an 8x8 DCT block. + ; The following code is based directly on the IJG's original jidctred.c; + ; see the jidctred.c for more details. + ; -; Last Modified : February 4, 2006 -; + ; [TAB8] + -%include "jsimdext.inc" -%include "jdct.inc" - -%ifdef IDCT_SCALING_SUPPORTED -%ifdef JIDCT_INT_SSE2_SUPPORTED - -; This module is specialized to the case DCTSIZE = 8. -; -%if DCTSIZE != 8 -%error "Sorry, this code only copes with 8x8 DCTs." -%endif ++%include "simd/jsimdext.inc" ++%include "simd/jdct.inc" + + ; -------------------------------------------------------------------------- + + %define CONST_BITS 13 + %define PASS1_BITS 2 + + %define DESCALE_P1_4 (CONST_BITS-PASS1_BITS+1) + %define DESCALE_P2_4 (CONST_BITS+PASS1_BITS+3+1) + %define DESCALE_P1_2 (CONST_BITS-PASS1_BITS+2) + %define DESCALE_P2_2 (CONST_BITS+PASS1_BITS+3+2) + + %if CONST_BITS == 13 + F_0_211 equ 1730 ; FIX(0.211164243) + F_0_509 equ 4176 ; FIX(0.509795579) + F_0_601 equ 4926 ; FIX(0.601344887) + F_0_720 equ 5906 ; FIX(0.720959822) + F_0_765 equ 6270 ; FIX(0.765366865) + F_0_850 equ 6967 ; FIX(0.850430095) + F_0_899 equ 7373 ; FIX(0.899976223) + F_1_061 equ 8697 ; FIX(1.061594337) + F_1_272 equ 10426 ; FIX(1.272758580) + F_1_451 equ 11893 ; FIX(1.451774981) + F_1_847 equ 15137 ; FIX(1.847759065) + F_2_172 equ 17799 ; FIX(2.172734803) + F_2_562 equ 20995 ; FIX(2.562915447) + F_3_624 equ 29692 ; FIX(3.624509785) + %else + ; NASM cannot do compile-time arithmetic on floating-point constants. 
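+ ; (The same kind of spot check as in the islow module applies here: with + ; CONST_BITS == 13, DESCALE(3891787747,30-CONST_BITS) = + ; (3891787747 + (1 << 16)) >> 17 = 29692, matching the precomputed + ; F_3_624 above.)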
+ %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n)) + F_0_211 equ DESCALE( 226735879,30-CONST_BITS) ; FIX(0.211164243) + F_0_509 equ DESCALE( 547388834,30-CONST_BITS) ; FIX(0.509795579) + F_0_601 equ DESCALE( 645689155,30-CONST_BITS) ; FIX(0.601344887) + F_0_720 equ DESCALE( 774124714,30-CONST_BITS) ; FIX(0.720959822) + F_0_765 equ DESCALE( 821806413,30-CONST_BITS) ; FIX(0.765366865) + F_0_850 equ DESCALE( 913142361,30-CONST_BITS) ; FIX(0.850430095) + F_0_899 equ DESCALE( 966342111,30-CONST_BITS) ; FIX(0.899976223) + F_1_061 equ DESCALE(1139878239,30-CONST_BITS) ; FIX(1.061594337) + F_1_272 equ DESCALE(1366614119,30-CONST_BITS) ; FIX(1.272758580) + F_1_451 equ DESCALE(1558831516,30-CONST_BITS) ; FIX(1.451774981) + F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065) + F_2_172 equ DESCALE(2332956230,30-CONST_BITS) ; FIX(2.172734803) + F_2_562 equ DESCALE(2751909506,30-CONST_BITS) ; FIX(2.562915447) + F_3_624 equ DESCALE(3891787747,30-CONST_BITS) ; FIX(3.624509785) + %endif + + ; -------------------------------------------------------------------------- + SECTION SEG_CONST + + alignz 16 + global EXTN(jconst_idct_red_sse2) + + EXTN(jconst_idct_red_sse2): + + PW_F184_MF076 times 4 dw F_1_847,-F_0_765 + PW_F256_F089 times 4 dw F_2_562, F_0_899 + PW_F106_MF217 times 4 dw F_1_061,-F_2_172 + PW_MF060_MF050 times 4 dw -F_0_601,-F_0_509 + PW_F145_MF021 times 4 dw F_1_451,-F_0_211 + PW_F362_MF127 times 4 dw F_3_624,-F_1_272 + PW_F085_MF072 times 4 dw F_0_850,-F_0_720 + PD_DESCALE_P1_4 times 4 dd 1 << (DESCALE_P1_4-1) + PD_DESCALE_P2_4 times 4 dd 1 << (DESCALE_P2_4-1) + PD_DESCALE_P1_2 times 4 dd 1 << (DESCALE_P1_2-1) + PD_DESCALE_P2_2 times 4 dd 1 << (DESCALE_P2_2-1) + PB_CENTERJSAMP times 16 db CENTERJSAMPLE + + alignz 16 + + ; -------------------------------------------------------------------------- + SECTION SEG_TEXT + BITS 32 + ; + ; Perform dequantization and inverse DCT on one block of coefficients, + ; producing a reduced-size 4x4 output block. + ; + ; GLOBAL(void) -; jpeg_idct_4x4_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, -; JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) ++; jsimd_idct_4x4_sse2 (void * dct_table, JCOEFPTR coef_block, ++; JSAMPARRAY output_buf, JDIMENSION output_col) + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define coef_block(b) (b)+16 ; JCOEFPTR coef_block -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define output_col(b) (b)+24 ; JDIMENSION output_col ++%define dct_table(b) (b)+8 ; void * dct_table ++%define coef_block(b) (b)+12 ; JCOEFPTR coef_block ++%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf ++%define output_col(b) (b)+20 ; JDIMENSION output_col + + %define original_ebp ebp+0 + %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM] + %define WK_NUM 2 + + align 16 - global EXTN(jpeg_idct_4x4_sse2) ++ global EXTN(jsimd_idct_4x4_sse2) + -EXTN(jpeg_idct_4x4_sse2): ++EXTN(jsimd_idct_4x4_sse2): + push ebp + mov eax,esp ; eax = original ebp + sub esp, byte 4 + and esp, byte (-SIZEOF_XMMWORD) ; align to 128 bits + mov [esp],eax + mov ebp,esp ; ebp = aligned ebp + lea esp, [wk(0)] + pushpic ebx + ; push ecx ; unused + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. 
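++; (Pass 1 runs the 4-point column IDCT on all eight columns at once;
++; the shortcut below skips it when coefficient rows 1,2,3,5,6 and 7
++; are all zero. Row 4 never contributes to the 4x4 output.)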
+ + ; mov eax, [original_ebp] - mov edx, POINTER [compptr(eax)] - mov edx, POINTER [jcompinfo_dct_table(edx)] ; quantptr ++ mov edx, POINTER [dct_table(eax)] ; quantptr + mov esi, JCOEFPTR [coef_block(eax)] ; inptr + + %ifndef NO_ZERO_COLUMN_TEST_4X4_SSE2 + mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)] + or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)] + jnz short .columnDCT + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + por xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + por xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + por xmm0,xmm1 + packsswb xmm0,xmm0 + packsswb xmm0,xmm0 + movd eax,xmm0 + test eax,eax + jnz short .columnDCT + + ; -- AC terms all zero + + movdqa xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + psllw xmm0,PASS1_BITS + + movdqa xmm3,xmm0 ; xmm0=in0=(00 01 02 03 04 05 06 07) + punpcklwd xmm0,xmm0 ; xmm0=(00 00 01 01 02 02 03 03) + punpckhwd xmm3,xmm3 ; xmm3=(04 04 05 05 06 06 07 07) + + pshufd xmm1,xmm0,0x50 ; xmm1=[col0 col1]=(00 00 00 00 01 01 01 01) + pshufd xmm0,xmm0,0xFA ; xmm0=[col2 col3]=(02 02 02 02 03 03 03 03) + pshufd xmm6,xmm3,0x50 ; xmm6=[col4 col5]=(04 04 04 04 05 05 05 05) + pshufd xmm3,xmm3,0xFA ; xmm3=[col6 col7]=(06 06 06 06 07 07 07 07) + + jmp near .column_end + alignx 16,7 + %endif + .columnDCT: + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + movdqa xmm4,xmm0 + movdqa xmm5,xmm0 + punpcklwd xmm4,xmm1 + punpckhwd xmm5,xmm1 + movdqa xmm0,xmm4 + movdqa xmm1,xmm5 + pmaddwd xmm4,[GOTOFF(ebx,PW_F256_F089)] ; xmm4=(tmp2L) + pmaddwd xmm5,[GOTOFF(ebx,PW_F256_F089)] ; xmm5=(tmp2H) + pmaddwd xmm0,[GOTOFF(ebx,PW_F106_MF217)] ; xmm0=(tmp0L) + pmaddwd xmm1,[GOTOFF(ebx,PW_F106_MF217)] ; xmm1=(tmp0H) + + movdqa xmm6,xmm2 + movdqa xmm7,xmm2 + punpcklwd xmm6,xmm3 + punpckhwd xmm7,xmm3 + movdqa xmm2,xmm6 + movdqa xmm3,xmm7 + pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2L) + pmaddwd xmm7,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm7=(tmp2H) + pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0L) + pmaddwd xmm3,[GOTOFF(ebx,PW_F145_MF021)] ; xmm3=(tmp0H) + + paddd xmm6,xmm4 ; xmm6=tmp2L + paddd xmm7,xmm5 ; xmm7=tmp2H + paddd xmm2,xmm0 ; xmm2=tmp0L + paddd xmm3,xmm1 ; xmm3=tmp0H + + movdqa XMMWORD [wk(0)], xmm2 ; wk(0)=tmp0L + movdqa XMMWORD [wk(1)], xmm3 ; wk(1)=tmp0H + + ; -- Even part + + movdqa xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + movdqa xmm5, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)] + movdqa xmm0, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)] + pmullw xmm4, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm5, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm0, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + pxor xmm1,xmm1 + pxor xmm2,xmm2 + punpcklwd xmm1,xmm4 ; xmm1=tmp0L + punpckhwd xmm2,xmm4 ; xmm2=tmp0H + psrad xmm1,(16-CONST_BITS-1) ; psrad xmm1,16 & pslld xmm1,CONST_BITS+1 + psrad xmm2,(16-CONST_BITS-1) ; psrad xmm2,16 & pslld 
xmm2,CONST_BITS+1 + + movdqa xmm3,xmm5 ; xmm5=in2=z2 + punpcklwd xmm5,xmm0 ; xmm0=in6=z3 + punpckhwd xmm3,xmm0 + pmaddwd xmm5,[GOTOFF(ebx,PW_F184_MF076)] ; xmm5=tmp2L + pmaddwd xmm3,[GOTOFF(ebx,PW_F184_MF076)] ; xmm3=tmp2H + + movdqa xmm4,xmm1 + movdqa xmm0,xmm2 + paddd xmm1,xmm5 ; xmm1=tmp10L + paddd xmm2,xmm3 ; xmm2=tmp10H + psubd xmm4,xmm5 ; xmm4=tmp12L + psubd xmm0,xmm3 ; xmm0=tmp12H + + ; -- Final output stage + + movdqa xmm5,xmm1 + movdqa xmm3,xmm2 + paddd xmm1,xmm6 ; xmm1=data0L + paddd xmm2,xmm7 ; xmm2=data0H + psubd xmm5,xmm6 ; xmm5=data3L + psubd xmm3,xmm7 ; xmm3=data3H + + movdqa xmm6,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm6=[PD_DESCALE_P1_4] + + paddd xmm1,xmm6 + paddd xmm2,xmm6 + psrad xmm1,DESCALE_P1_4 + psrad xmm2,DESCALE_P1_4 + paddd xmm5,xmm6 + paddd xmm3,xmm6 + psrad xmm5,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm1,xmm2 ; xmm1=data0=(00 01 02 03 04 05 06 07) + packssdw xmm5,xmm3 ; xmm5=data3=(30 31 32 33 34 35 36 37) + + movdqa xmm7, XMMWORD [wk(0)] ; xmm7=tmp0L + movdqa xmm6, XMMWORD [wk(1)] ; xmm6=tmp0H + + movdqa xmm2,xmm4 + movdqa xmm3,xmm0 + paddd xmm4,xmm7 ; xmm4=data1L + paddd xmm0,xmm6 ; xmm0=data1H + psubd xmm2,xmm7 ; xmm2=data2L + psubd xmm3,xmm6 ; xmm3=data2H + + movdqa xmm7,[GOTOFF(ebx,PD_DESCALE_P1_4)] ; xmm7=[PD_DESCALE_P1_4] + + paddd xmm4,xmm7 + paddd xmm0,xmm7 + psrad xmm4,DESCALE_P1_4 + psrad xmm0,DESCALE_P1_4 + paddd xmm2,xmm7 + paddd xmm3,xmm7 + psrad xmm2,DESCALE_P1_4 + psrad xmm3,DESCALE_P1_4 + + packssdw xmm4,xmm0 ; xmm4=data1=(10 11 12 13 14 15 16 17) + packssdw xmm2,xmm3 ; xmm2=data2=(20 21 22 23 24 25 26 27) + + movdqa xmm6,xmm1 ; transpose coefficients(phase 1) + punpcklwd xmm1,xmm4 ; xmm1=(00 10 01 11 02 12 03 13) + punpckhwd xmm6,xmm4 ; xmm6=(04 14 05 15 06 16 07 17) + movdqa xmm7,xmm2 ; transpose coefficients(phase 1) + punpcklwd xmm2,xmm5 ; xmm2=(20 30 21 31 22 32 23 33) + punpckhwd xmm7,xmm5 ; xmm7=(24 34 25 35 26 36 27 37) + + movdqa xmm0,xmm1 ; transpose coefficients(phase 2) + punpckldq xmm1,xmm2 ; xmm1=[col0 col1]=(00 10 20 30 01 11 21 31) + punpckhdq xmm0,xmm2 ; xmm0=[col2 col3]=(02 12 22 32 03 13 23 33) + movdqa xmm3,xmm6 ; transpose coefficients(phase 2) + punpckldq xmm6,xmm7 ; xmm6=[col4 col5]=(04 14 24 34 05 15 25 35) + punpckhdq xmm3,xmm7 ; xmm3=[col6 col7]=(06 16 26 36 07 17 27 37) + .column_end: + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. 
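++; (Pass 2 repeats the 4-point IDCT across the rows of the transposed
++; pass-1 result, descales by DESCALE_P2_4 and packs the final 4x4
++; samples to bytes biased by CENTERJSAMPLE.)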
+ + mov eax, [original_ebp] + mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(eax)] + + ; -- Even part + + pxor xmm4,xmm4 + punpcklwd xmm4,xmm1 ; xmm4=tmp0 + psrad xmm4,(16-CONST_BITS-1) ; psrad xmm4,16 & pslld xmm4,CONST_BITS+1 + + ; -- Odd part + + punpckhwd xmm1,xmm0 + punpckhwd xmm6,xmm3 + movdqa xmm5,xmm1 + movdqa xmm2,xmm6 + pmaddwd xmm1,[GOTOFF(ebx,PW_F256_F089)] ; xmm1=(tmp2) + pmaddwd xmm6,[GOTOFF(ebx,PW_MF060_MF050)] ; xmm6=(tmp2) + pmaddwd xmm5,[GOTOFF(ebx,PW_F106_MF217)] ; xmm5=(tmp0) + pmaddwd xmm2,[GOTOFF(ebx,PW_F145_MF021)] ; xmm2=(tmp0) + + paddd xmm6,xmm1 ; xmm6=tmp2 + paddd xmm2,xmm5 ; xmm2=tmp0 + + ; -- Even part + + punpcklwd xmm0,xmm3 + pmaddwd xmm0,[GOTOFF(ebx,PW_F184_MF076)] ; xmm0=tmp2 + + movdqa xmm7,xmm4 + paddd xmm4,xmm0 ; xmm4=tmp10 + psubd xmm7,xmm0 ; xmm7=tmp12 + + ; -- Final output stage + + movdqa xmm1,[GOTOFF(ebx,PD_DESCALE_P2_4)] ; xmm1=[PD_DESCALE_P2_4] + + movdqa xmm5,xmm4 + movdqa xmm3,xmm7 + paddd xmm4,xmm6 ; xmm4=data0=(00 10 20 30) + paddd xmm7,xmm2 ; xmm7=data1=(01 11 21 31) + psubd xmm5,xmm6 ; xmm5=data3=(03 13 23 33) + psubd xmm3,xmm2 ; xmm3=data2=(02 12 22 32) + + paddd xmm4,xmm1 + paddd xmm7,xmm1 + psrad xmm4,DESCALE_P2_4 + psrad xmm7,DESCALE_P2_4 + paddd xmm5,xmm1 + paddd xmm3,xmm1 + psrad xmm5,DESCALE_P2_4 + psrad xmm3,DESCALE_P2_4 + + packssdw xmm4,xmm3 ; xmm4=(00 10 20 30 02 12 22 32) + packssdw xmm7,xmm5 ; xmm7=(01 11 21 31 03 13 23 33) + + movdqa xmm0,xmm4 ; transpose coefficients(phase 1) + punpcklwd xmm4,xmm7 ; xmm4=(00 01 10 11 20 21 30 31) + punpckhwd xmm0,xmm7 ; xmm0=(02 03 12 13 22 23 32 33) + + movdqa xmm6,xmm4 ; transpose coefficients(phase 2) + punpckldq xmm4,xmm0 ; xmm4=(00 01 02 03 10 11 12 13) + punpckhdq xmm6,xmm0 ; xmm6=(20 21 22 23 30 31 32 33) + + packsswb xmm4,xmm6 ; xmm4=(00 01 02 03 10 11 12 13 20 ..) + paddb xmm4,[GOTOFF(ebx,PB_CENTERJSAMP)] + + pshufd xmm2,xmm4,0x39 ; xmm2=(10 11 12 13 20 21 22 23 30 ..) + pshufd xmm1,xmm4,0x4E ; xmm1=(20 21 22 23 30 31 32 33 00 ..) + pshufd xmm3,xmm4,0x93 ; xmm3=(30 31 32 33 00 01 02 03 10 ..) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] - movd _DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 - movd _DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 ++ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm4 ++ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm2 + mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW] - movd _DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 - movd _DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 ++ movd XMM_DWORD [edx+eax*SIZEOF_JSAMPLE], xmm1 ++ movd XMM_DWORD [esi+eax*SIZEOF_JSAMPLE], xmm3 + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; unused + poppic ebx + mov esp,ebp ; esp <- aligned ebp + pop esp ; esp <- original ebp + pop ebp + ret + + + ; -------------------------------------------------------------------------- + ; + ; Perform dequantization and inverse DCT on one block of coefficients, + ; producing a reduced-size 2x2 output block. 
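++; (Unlike the 4x4 routine above, there is no zero-AC shortcut here;
++; only coefficient rows 0, 1, 3, 5 and 7 are ever loaded, so the
++; column pass is presumably too short for such a test to pay off.)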
+ ; + ; GLOBAL(void) -; jpeg_idct_2x2_sse2 (j_decompress_ptr cinfo, jpeg_component_info * compptr, -; JCOEFPTR coef_block, -; JSAMPARRAY output_buf, JDIMENSION output_col) ++; jsimd_idct_2x2_sse2 (void * dct_table, JCOEFPTR coef_block, ++; JSAMPARRAY output_buf, JDIMENSION output_col) + ; + -%define cinfo(b) (b)+8 ; j_decompress_ptr cinfo -%define compptr(b) (b)+12 ; jpeg_component_info * compptr -%define coef_block(b) (b)+16 ; JCOEFPTR coef_block -%define output_buf(b) (b)+20 ; JSAMPARRAY output_buf -%define output_col(b) (b)+24 ; JDIMENSION output_col ++%define dct_table(b) (b)+8 ; void * dct_table ++%define coef_block(b) (b)+12 ; JCOEFPTR coef_block ++%define output_buf(b) (b)+16 ; JSAMPARRAY output_buf ++%define output_col(b) (b)+20 ; JDIMENSION output_col + + align 16 - global EXTN(jpeg_idct_2x2_sse2) ++ global EXTN(jsimd_idct_2x2_sse2) + -EXTN(jpeg_idct_2x2_sse2): ++EXTN(jsimd_idct_2x2_sse2): + push ebp + mov ebp,esp + push ebx + ; push ecx ; need not be preserved + ; push edx ; need not be preserved + push esi + push edi + + get_GOT ebx ; get GOT address + + ; ---- Pass 1: process columns from input. + - mov edx, POINTER [compptr(ebp)] - mov edx, POINTER [jcompinfo_dct_table(edx)] ; quantptr ++ mov edx, POINTER [dct_table(ebp)] ; quantptr + mov esi, JCOEFPTR [coef_block(ebp)] ; inptr + + ; | input: | result: | + ; | 00 01 ** 03 ** 05 ** 07 | | + ; | 10 11 ** 13 ** 15 ** 17 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 30 31 ** 33 ** 35 ** 37 | A0 A1 A3 A5 A7 | + ; | ** ** ** ** ** ** ** ** | B0 B1 B3 B5 B7 | + ; | 50 51 ** 53 ** 55 ** 57 | | + ; | ** ** ** ** ** ** ** ** | | + ; | 70 71 ** 73 ** 75 ** 77 | | + + ; -- Odd part + + movdqa xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)] + movdqa xmm1, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)] + pmullw xmm0, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + movdqa xmm2, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)] + movdqa xmm3, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)] + pmullw xmm2, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + pmullw xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm0=(10 11 ** 13 ** 15 ** 17), xmm1=(30 31 ** 33 ** 35 ** 37) + ; xmm2=(50 51 ** 53 ** 55 ** 57), xmm3=(70 71 ** 73 ** 75 ** 77) + + pcmpeqd xmm7,xmm7 + pslld xmm7,WORD_BIT ; xmm7={0x0000 0xFFFF 0x0000 0xFFFF ..} + + movdqa xmm4,xmm0 ; xmm4=(10 11 ** 13 ** 15 ** 17) + movdqa xmm5,xmm2 ; xmm5=(50 51 ** 53 ** 55 ** 57) + punpcklwd xmm4,xmm1 ; xmm4=(10 30 11 31 ** ** 13 33) + punpcklwd xmm5,xmm3 ; xmm5=(50 70 51 71 ** ** 53 73) + pmaddwd xmm4,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm5,[GOTOFF(ebx,PW_F085_MF072)] + + psrld xmm0,WORD_BIT ; xmm0=(11 -- 13 -- 15 -- 17 --) + pand xmm1,xmm7 ; xmm1=(-- 31 -- 33 -- 35 -- 37) + psrld xmm2,WORD_BIT ; xmm2=(51 -- 53 -- 55 -- 57 --) + pand xmm3,xmm7 ; xmm3=(-- 71 -- 73 -- 75 -- 77) + por xmm0,xmm1 ; xmm0=(11 31 13 33 15 35 17 37) + por xmm2,xmm3 ; xmm2=(51 71 53 73 55 75 57 77) + pmaddwd xmm0,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm2,[GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm4,xmm5 ; xmm4=tmp0[col0 col1 **** col3] + paddd xmm0,xmm2 ; xmm0=tmp0[col1 col3 col5 col7] + + ; -- Even part + + movdqa xmm6, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)] + pmullw xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_ISLOW_MULT_TYPE)] + + ; xmm6=(00 01 ** 03 ** 05 ** 07) + + movdqa xmm1,xmm6 ; xmm1=(00 01 ** 03 ** 05 ** 07) + pslld xmm6,WORD_BIT ; xmm6=(-- 00 -- ** -- ** -- **) + pand xmm1,xmm7 ; xmm1=(-- 01 -- 03 -- 05 -- 07) + psrad xmm6,(WORD_BIT-CONST_BITS-2) ; 
xmm6=tmp10[col0 **** **** ****] + psrad xmm1,(WORD_BIT-CONST_BITS-2) ; xmm1=tmp10[col1 col3 col5 col7] + + ; -- Final output stage + + movdqa xmm3,xmm6 + movdqa xmm5,xmm1 + paddd xmm6,xmm4 ; xmm6=data0[col0 **** **** ****]=(A0 ** ** **) + paddd xmm1,xmm0 ; xmm1=data0[col1 col3 col5 col7]=(A1 A3 A5 A7) + psubd xmm3,xmm4 ; xmm3=data1[col0 **** **** ****]=(B0 ** ** **) + psubd xmm5,xmm0 ; xmm5=data1[col1 col3 col5 col7]=(B1 B3 B5 B7) + + movdqa xmm2,[GOTOFF(ebx,PD_DESCALE_P1_2)] ; xmm2=[PD_DESCALE_P1_2] + + punpckldq xmm6,xmm3 ; xmm6=(A0 B0 ** **) + + movdqa xmm7,xmm1 + punpcklqdq xmm1,xmm5 ; xmm1=(A1 A3 B1 B3) + punpckhqdq xmm7,xmm5 ; xmm7=(A5 A7 B5 B7) + + paddd xmm6,xmm2 + psrad xmm6,DESCALE_P1_2 + + paddd xmm1,xmm2 + paddd xmm7,xmm2 + psrad xmm1,DESCALE_P1_2 + psrad xmm7,DESCALE_P1_2 + + ; -- Prefetch the next coefficient block + + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32] + prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32] + + ; ---- Pass 2: process rows, store into output array. + + mov edi, JSAMPARRAY [output_buf(ebp)] ; (JSAMPROW *) + mov eax, JDIMENSION [output_col(ebp)] + + ; | input:| result:| + ; | A0 B0 | | + ; | A1 B1 | C0 C1 | + ; | A3 B3 | D0 D1 | + ; | A5 B5 | | + ; | A7 B7 | | + + ; -- Odd part + + packssdw xmm1,xmm1 ; xmm1=(A1 A3 B1 B3 A1 A3 B1 B3) + packssdw xmm7,xmm7 ; xmm7=(A5 A7 B5 B7 A5 A7 B5 B7) + pmaddwd xmm1,[GOTOFF(ebx,PW_F362_MF127)] + pmaddwd xmm7,[GOTOFF(ebx,PW_F085_MF072)] + + paddd xmm1,xmm7 ; xmm1=tmp0[row0 row1 row0 row1] + + ; -- Even part + + pslld xmm6,(CONST_BITS+2) ; xmm6=tmp10[row0 row1 **** ****] + + ; -- Final output stage + + movdqa xmm4,xmm6 + paddd xmm6,xmm1 ; xmm6=data0[row0 row1 **** ****]=(C0 C1 ** **) + psubd xmm4,xmm1 ; xmm4=data1[row0 row1 **** ****]=(D0 D1 ** **) + + punpckldq xmm6,xmm4 ; xmm6=(C0 D0 C1 D1) + + paddd xmm6,[GOTOFF(ebx,PD_DESCALE_P2_2)] + psrad xmm6,DESCALE_P2_2 + + packssdw xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1) + packsswb xmm6,xmm6 ; xmm6=(C0 D0 C1 D1 C0 D0 C1 D1 ..) + paddb xmm6,[GOTOFF(ebx,PB_CENTERJSAMP)] + + pextrw ebx,xmm6,0x00 ; ebx=(C0 D0 -- --) + pextrw ecx,xmm6,0x01 ; ecx=(C1 D1 -- --) + + mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW] + mov esi, JSAMPROW [edi+1*SIZEOF_JSAMPROW] + mov WORD [edx+eax*SIZEOF_JSAMPLE], bx + mov WORD [esi+eax*SIZEOF_JSAMPLE], cx + + pop edi + pop esi + ; pop edx ; need not be preserved + ; pop ecx ; need not be preserved + pop ebx + pop ebp + ret + -%endif ; JIDCT_INT_SSE2_SUPPORTED -%endif ; IDCT_SCALING_SUPPORTED diff --cc simd/jsimd.h index 98bcebc,0000000..371586e mode 100644,000000..100644 --- a/simd/jsimd.h +++ b/simd/jsimd.h @@@ -1,162 -1,0 +1,285 @@@ +/* + * simd/jsimd.h + * + * Copyright 2009 Pierre Ossman for Cendio AB + * + * Based on the x86 SIMD extension for IJG JPEG library, + * Copyright (C) 1999-2006, MIYASAKA Masaru. + * + */ + +/* Bitmask for supported acceleration methods */ + +#define JSIMD_NONE 0x00 +#define JSIMD_MMX 0x01 +#define JSIMD_3DNOW 0x02 +#define JSIMD_SSE 0x04 ++#define JSIMD_SSE2 0x08 + +/* Short forms of external names for systems with brain-damaged linkers. 
*/
+
+#ifdef NEED_SHORT_EXTERNAL_NAMES
+#define jpeg_simd_cpu_support jSiCpuSupport
+#define jsimd_rgb_ycc_convert_mmx jSRGBYCCM
+#define jsimd_ycc_rgb_convert_mmx jSYCCRGBM
++#define jconst_rgb_ycc_convert_sse2 jSCRGBYCCS2
++#define jsimd_rgb_ycc_convert_sse2 jSRGBYCCS2
++#define jconst_ycc_rgb_convert_sse2 jSCYCCRGBS2
++#define jsimd_ycc_rgb_convert_sse2 jSYCCRGBS2
+#define jsimd_h2v2_downsample_mmx jSDnH2V2M
+#define jsimd_h2v1_downsample_mmx jSDnH2V1M
++#define jsimd_h2v2_downsample_sse2 jSDnH2V2S2
++#define jsimd_h2v1_downsample_sse2 jSDnH2V1S2
+#define jsimd_h2v2_upsample_mmx jSUpH2V2M
+#define jsimd_h2v1_upsample_mmx jSUpH2V1M
+#define jsimd_h2v2_fancy_upsample_mmx jSFUpH2V2M
+#define jsimd_h2v1_fancy_upsample_mmx jSFUpH2V1M
+#define jsimd_h2v2_merged_upsample_mmx jSMUpH2V2M
+#define jsimd_h2v1_merged_upsample_mmx jSMUpH2V1M
++#define jsimd_h2v2_upsample_sse2 jSUpH2V2S2
++#define jsimd_h2v1_upsample_sse2 jSUpH2V1S2
++#define jconst_fancy_upsample_sse2 jSCFUpS2
++#define jsimd_h2v2_fancy_upsample_sse2 jSFUpH2V2S2
++#define jsimd_h2v1_fancy_upsample_sse2 jSFUpH2V1S2
++#define jconst_merged_upsample_sse2 jSCMUpS2
++#define jsimd_h2v2_merged_upsample_sse2 jSMUpH2V2S2
++#define jsimd_h2v1_merged_upsample_sse2 jSMUpH2V1S2
+#define jsimd_convsamp_mmx jSConvM
++#define jsimd_convsamp_sse2 jSConvS2
+#define jsimd_convsamp_float_3dnow jSConvF3D
+#define jsimd_convsamp_float_sse jSConvFS
++#define jsimd_convsamp_float_sse2 jSConvFS2
+#define jsimd_fdct_islow_mmx jSFDMIS
+#define jsimd_fdct_ifast_mmx jSFDMIF
++#define jconst_fdct_islow_sse2 jSCFDS2IS
++#define jsimd_fdct_islow_sse2 jSFDS2IS
++#define jconst_fdct_ifast_sse2 jSCFDS2IF
++#define jsimd_fdct_ifast_sse2 jSFDS2IF
+#define jsimd_fdct_float_3dnow jSFD3DF
+#define jconst_fdct_float_sse jSCFDSF
+#define jsimd_fdct_float_sse jSFDSF
+#define jsimd_quantize_mmx jSQuantM
++#define jsimd_quantize_sse2 jSQuantS2
+#define jsimd_quantize_float_3dnow jSQuantF3D
+#define jsimd_quantize_float_sse jSQuantFS
++#define jsimd_quantize_float_sse2 jSQuantFS2
+#define jsimd_idct_2x2_mmx jSIDM22
+#define jsimd_idct_4x4_mmx jSIDM44
++#define jconst_idct_red_sse2 jSCIDS2R
++#define jsimd_idct_2x2_sse2 jSIDS222
++#define jsimd_idct_4x4_sse2 jSIDS244
+#define jsimd_idct_islow_mmx jSIDMIS
+#define jsimd_idct_ifast_mmx jSIDMIF
++#define jconst_idct_islow_sse2 jSCIDS2IS
++#define jsimd_idct_islow_sse2 jSIDS2IS
++#define jconst_idct_ifast_sse2 jSCIDS2IF
++#define jsimd_idct_ifast_sse2 jSIDS2IF
+#define jsimd_idct_float_3dnow jSID3DF
+#define jconst_idct_float_sse jSCIDSF
+#define jsimd_idct_float_sse jSIDSF
++#define jconst_idct_float_sse2 jSCIDS2F
++#define jsimd_idct_float_sse2 jSIDS2F
+#endif /* NEED_SHORT_EXTERNAL_NAMES */
+
+/* SIMD Ext: retrieve SIMD/CPU information */
+EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
+
+/* SIMD Color Space Conversion */
+EXTERN(void) jsimd_rgb_ycc_convert_mmx
+        JPP((JDIMENSION img_width,
+             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+             JDIMENSION output_row, int num_rows));
+EXTERN(void) jsimd_ycc_rgb_convert_mmx
+        JPP((JDIMENSION out_width,
+             JSAMPIMAGE input_buf, JDIMENSION input_row,
+             JSAMPARRAY output_buf, int num_rows));
+
++extern const int jconst_rgb_ycc_convert_sse2[];
++EXTERN(void) jsimd_rgb_ycc_convert_sse2
++        JPP((JDIMENSION img_width,
++             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
++             JDIMENSION output_row, int num_rows));
++extern const int jconst_ycc_rgb_convert_sse2[];
++EXTERN(void) jsimd_ycc_rgb_convert_sse2
++        JPP((JDIMENSION out_width,
++             JSAMPIMAGE input_buf, JDIMENSION input_row,
++             JSAMPARRAY
output_buf, int num_rows)); ++ +/* SIMD Downsample */ +EXTERN(void) jsimd_h2v2_downsample_mmx + JPP((JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data)); +EXTERN(void) jsimd_h2v1_downsample_mmx + JPP((JDIMENSION image_width, int max_v_samp_factor, + JDIMENSION v_samp_factor, JDIMENSION width_blocks, + JSAMPARRAY input_data, JSAMPARRAY output_data)); + ++EXTERN(void) jsimd_h2v2_downsample_sse2 ++ JPP((JDIMENSION image_width, int max_v_samp_factor, ++ JDIMENSION v_samp_factor, JDIMENSION width_blocks, ++ JSAMPARRAY input_data, JSAMPARRAY output_data)); ++EXTERN(void) jsimd_h2v1_downsample_sse2 ++ JPP((JDIMENSION image_width, int max_v_samp_factor, ++ JDIMENSION v_samp_factor, JDIMENSION width_blocks, ++ JSAMPARRAY input_data, JSAMPARRAY output_data)); ++ +/* SIMD Upsample */ +EXTERN(void) jsimd_h2v2_upsample_mmx + JPP((int max_v_samp_factor, JDIMENSION output_width, + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); +EXTERN(void) jsimd_h2v1_upsample_mmx + JPP((int max_v_samp_factor, JDIMENSION output_width, + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); + +EXTERN(void) jsimd_h2v2_fancy_upsample_mmx + JPP((int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); +EXTERN(void) jsimd_h2v1_fancy_upsample_mmx + JPP((int max_v_samp_factor, JDIMENSION downsampled_width, + JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); + +EXTERN(void) jsimd_h2v2_merged_upsample_mmx + JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); +EXTERN(void) jsimd_h2v1_merged_upsample_mmx + JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, + JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); + ++EXTERN(void) jsimd_h2v2_upsample_sse2 ++ JPP((int max_v_samp_factor, JDIMENSION output_width, ++ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); ++EXTERN(void) jsimd_h2v1_upsample_sse2 ++ JPP((int max_v_samp_factor, JDIMENSION output_width, ++ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); ++ ++extern const int jconst_fancy_upsample_sse2[]; ++EXTERN(void) jsimd_h2v2_fancy_upsample_sse2 ++ JPP((int max_v_samp_factor, JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); ++EXTERN(void) jsimd_h2v1_fancy_upsample_sse2 ++ JPP((int max_v_samp_factor, JDIMENSION downsampled_width, ++ JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr)); ++ ++extern const int jconst_merged_upsample_sse2[]; ++EXTERN(void) jsimd_h2v2_merged_upsample_sse2 ++ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); ++EXTERN(void) jsimd_h2v1_merged_upsample_sse2 ++ JPP((JDIMENSION output_width, JSAMPIMAGE input_buf, ++ JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)); ++ +/* SIMD Sample Conversion */ +EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data, + JDIMENSION start_col, + DCTELEM * workspace)); + ++EXTERN(void) jsimd_convsamp_sse2 JPP((JSAMPARRAY sample_data, ++ JDIMENSION start_col, ++ DCTELEM * workspace)); ++ +EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data, + JDIMENSION start_col, + FAST_FLOAT * workspace)); + +EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data, + JDIMENSION start_col, + FAST_FLOAT * workspace)); + ++EXTERN(void) jsimd_convsamp_float_sse2 JPP((JSAMPARRAY sample_data, ++ JDIMENSION start_col, ++ FAST_FLOAT * workspace)); ++ +/* SIMD Forward DCT */ 
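++/* (Each routine below transforms a DCTSIZE2-element workspace in
++   place; on this SIMD implementation DCTELEM is a 16-bit short,
++   as noted in simd/jsimdcfg.inc.h.) */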
+EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
+EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
+
++extern const int jconst_fdct_islow_sse2[];
++EXTERN(void) jsimd_fdct_islow_sse2 JPP((DCTELEM * data));
++extern const int jconst_fdct_ifast_sse2[];
++EXTERN(void) jsimd_fdct_ifast_sse2 JPP((DCTELEM * data));
++
+EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
+
+extern const int jconst_fdct_float_sse[];
+EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * data));
+
+/* SIMD Quantization */
+EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
+                                     DCTELEM * divisors,
+                                     DCTELEM * workspace));
+
++EXTERN(void) jsimd_quantize_sse2 JPP((JCOEFPTR coef_block,
++                                      DCTELEM * divisors,
++                                      DCTELEM * workspace));
++
+EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
+                                             FAST_FLOAT * divisors,
+                                             FAST_FLOAT * workspace));
+
+EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block,
+                                           FAST_FLOAT * divisors,
+                                           FAST_FLOAT * workspace));
+
++EXTERN(void) jsimd_quantize_float_sse2 JPP((JCOEFPTR coef_block,
++                                            FAST_FLOAT * divisors,
++                                            FAST_FLOAT * workspace));
++
+/* SIMD Reduced Inverse DCT */
+EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
+                                     JCOEFPTR coef_block,
+                                     JSAMPARRAY output_buf,
+                                     JDIMENSION output_col));
+
++extern const int jconst_idct_red_sse2[];
++EXTERN(void) jsimd_idct_2x2_sse2 JPP((void * dct_table,
++                                      JCOEFPTR coef_block,
++                                      JSAMPARRAY output_buf,
++                                      JDIMENSION output_col));
++EXTERN(void) jsimd_idct_4x4_sse2 JPP((void * dct_table,
++                                      JCOEFPTR coef_block,
++                                      JSAMPARRAY output_buf,
++                                      JDIMENSION output_col));
++
+/* SIMD Inverse DCT */
+EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
++extern const int jconst_idct_islow_sse2[];
++EXTERN(void) jsimd_idct_islow_sse2 JPP((void * dct_table,
++                                        JCOEFPTR coef_block,
++                                        JSAMPARRAY output_buf,
++                                        JDIMENSION output_col));
++extern const int jconst_idct_ifast_sse2[];
++EXTERN(void) jsimd_idct_ifast_sse2 JPP((void * dct_table,
++                                        JCOEFPTR coef_block,
++                                        JSAMPARRAY output_buf,
++                                        JDIMENSION output_col));
++
+EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
+                                         JCOEFPTR coef_block,
+                                         JSAMPARRAY output_buf,
+                                         JDIMENSION output_col));
+
+extern const int jconst_idct_float_sse[];
+EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table,
+                                       JCOEFPTR coef_block,
+                                       JSAMPARRAY output_buf,
+                                       JDIMENSION output_col));
+
++extern const int jconst_idct_float_sse2[];
++EXTERN(void) jsimd_idct_float_sse2 JPP((void * dct_table,
++                                        JCOEFPTR coef_block,
++                                        JSAMPARRAY output_buf,
++                                        JDIMENSION output_col));
++
diff --cc simd/jsimdcfg.inc.h
index 2779565,0000000..4876038
mode 100644,000000..100644
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h
@@@ -1,136 -1,0 +1,168 @@@
+// This file generates the include file for the assembly
+// implementations by abusing the C preprocessor.
+//
+// Note: Some things are manually defined as they need to
+// be mapped to NASM types.
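+//
+// (definev(FOO) below expands to "%define _cpp_protection_FOO <value>":
+// the token pasting keeps the macro name itself from being expanded
+// while its value is, and the _cpp_protection_ prefix is stripped
+// when the real include file is generated from this one.)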
+ +; +; Automatically generated include file from jsimdcfg.inc.h +; + +#define JPEG_INTERNALS + +#include "../jpeglib.h" +#include "../jconfig.h" +#include "../jmorecfg.h" +#include "jsimd.h" + +#define define(var) %define _cpp_protection_##var +#define definev(var) %define _cpp_protection_##var var + +; +; -- jpeglib.h +; + +definev(DCTSIZE) +definev(DCTSIZE2) + +; +; -- jmorecfg.h +; + +definev(RGB_RED) +definev(RGB_GREEN) +definev(RGB_BLUE) + +definev(RGB_PIXELSIZE) + +; Representation of a single sample (pixel element value). +; On this SIMD implementation, this must be 'unsigned char'. +; + +%define JSAMPLE byte ; unsigned char +%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) + +definev(CENTERJSAMPLE) + +; Representation of a DCT frequency coefficient. +; On this SIMD implementation, this must be 'short'. +; +%define JCOEF word ; short +%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) + +; Datatype used for image dimensions. +; On this SIMD implementation, this must be 'unsigned int'. +; +%define JDIMENSION dword ; unsigned int +%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) + +%define JSAMPROW POINTER ; JSAMPLE FAR * (jpeglib.h) +%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) +%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) +%define JCOEFPTR POINTER ; JCOEF FAR * (jpeglib.h) +%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) +%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) +%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) +%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) + +; +; -- jdct.h +; + +; A forward DCT routine is given a pointer to a work area of type DCTELEM[]; +; the DCT is to be performed in-place in that buffer. +; To maximize parallelism, Type DCTELEM is changed to short (originally, int). +; +%define DCTELEM word ; short +%define SIZEOF_DCTELEM SIZEOF_WORD ; sizeof(DCTELEM) + +%define FAST_FLOAT FP32 ; float +%define SIZEOF_FAST_FLOAT SIZEOF_FP32 ; sizeof(FAST_FLOAT) + +; To maximize parallelism, Type MULTIPLIER is changed to short. +; +%define ISLOW_MULT_TYPE word ; must be short +%define SIZEOF_ISLOW_MULT_TYPE SIZEOF_WORD ; sizeof(ISLOW_MULT_TYPE) + +%define IFAST_MULT_TYPE word ; must be short +%define SIZEOF_IFAST_MULT_TYPE SIZEOF_WORD ; sizeof(IFAST_MULT_TYPE) +%define IFAST_SCALE_BITS 2 ; fractional bits in scale factors + +%define FLOAT_MULT_TYPE FP32 ; must be float +%define SIZEOF_FLOAT_MULT_TYPE SIZEOF_FP32 ; sizeof(FLOAT_MULT_TYPE) + +; +; -- jsimd.h +; + +definev(JSIMD_NONE) +definev(JSIMD_MMX) +definev(JSIMD_3DNOW) +definev(JSIMD_SSE) ++definev(JSIMD_SSE2) + +; Short forms of external names for systems with brain-damaged linkers. 
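+; (This list must stay in sync with the NEED_SHORT_EXTERNAL_NAMES
+; block in simd/jsimd.h, so that the assembly modules export the
+; same shortened symbols the C wrappers reference.)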
+; +#ifdef NEED_SHORT_EXTERNAL_NAMES +definev(jpeg_simd_cpu_support) +definev(jsimd_rgb_ycc_convert_mmx) +definev(jsimd_ycc_rgb_convert_mmx) ++definev(jconst_rgb_ycc_convert_sse2) ++definev(jsimd_rgb_ycc_convert_sse2) ++definev(jconst_ycc_rgb_convert_sse2) ++definev(jsimd_ycc_rgb_convert_sse2) +definev(jsimd_h2v2_downsample_mmx) +definev(jsimd_h2v1_downsample_mmx) ++definev(jsimd_h2v2_downsample_sse2) ++definev(jsimd_h2v1_downsample_sse2) +definev(jsimd_h2v2_upsample_mmx) +definev(jsimd_h2v1_upsample_mmx) +definev(jsimd_h2v1_fancy_upsample_mmx) +definev(jsimd_h2v2_fancy_upsample_mmx) +definev(jsimd_h2v1_merged_upsample_mmx) +definev(jsimd_h2v2_merged_upsample_mmx) ++definev(jsimd_h2v2_upsample_sse2) ++definev(jsimd_h2v1_upsample_sse2) ++definev(jconst_fancy_upsample_sse2) ++definev(jsimd_h2v1_fancy_upsample_sse2) ++definev(jsimd_h2v2_fancy_upsample_sse2) ++definev(jconst_merged_upsample_sse2) ++definev(jsimd_h2v1_merged_upsample_sse2) ++definev(jsimd_h2v2_merged_upsample_sse2) +definev(jsimd_convsamp_mmx) ++definev(jsimd_convsamp_sse2) +definev(jsimd_convsamp_float_3dnow) +definev(jsimd_convsamp_float_sse) ++definev(jsimd_convsamp_float_sse2) +definev(jsimd_fdct_islow_mmx) +definev(jsimd_fdct_ifast_mmx) ++definev(jconst_fdct_islow_sse2) ++definev(jsimd_fdct_islow_sse2) ++definev(jconst_fdct_ifast_sse2) ++definev(jsimd_fdct_ifast_sse2) +definev(jsimd_fdct_float_3dnow) +definev(jconst_fdct_float_sse) +definev(jsimd_fdct_float_sse) +definev(jsimd_quantize_mmx) ++definev(jsimd_quantize_sse2) +definev(jsimd_quantize_float_3dnow) +definev(jsimd_quantize_float_sse) ++definev(jsimd_quantize_float_sse2) +definev(jsimd_idct_2x2_mmx) +definev(jsimd_idct_4x4_mmx) ++definev(jconst_idct_red_sse2) ++definev(jsimd_idct_2x2_sse2) ++definev(jsimd_idct_4x4_sse2) +definev(jsimd_idct_islow_mmx) +definev(jsimd_idct_ifast_mmx) ++definev(jconst_idct_islow_sse2) ++definev(jsimd_idct_islow_sse2) ++definev(jconst_idct_ifast_sse2) ++definev(jsimd_idct_ifast_sse2) +definev(jsimd_idct_float_3dnow) +definev(jconst_idct_float_sse) +definev(jsimd_idct_float_sse) ++definev(jconst_idct_float_sse2) ++definev(jsimd_idct_float_sse2) +#endif /* NEED_SHORT_EXTERNAL_NAMES */ + diff --cc simd/jsimdext.inc index 5fcd7be,a502c07..8bbf64e --- a/simd/jsimdext.inc +++ b/simd/jsimdext.inc @@@ -100,42 -103,119 +100,46 @@@ ; ========================================================================== -; ---- jpeglib.h ----------------------------------------------------------- - -%define DCTSIZE 8 ; The basic DCT block is 8x8 samples -%define DCTSIZE2 64 ; DCTSIZE squared; # of elements in a block - -%define JSIMD_NONE 0x00 ; bitflags for jpeg_simd_*_support() -%define JSIMD_MMX 0x01 -%define JSIMD_3DNOW 0x02 -%define JSIMD_SSE 0x04 -%define JSIMD_SSE2 0x08 -%define JSIMD_ALL (JSIMD_MMX | JSIMD_3DNOW | JSIMD_SSE | JSIMD_SSE2) - -; ---- jpegint.h ----------------------------------------------------------- - -; Short forms of external names for systems with brain-damaged linkers. -; -%ifdef NEED_SHORT_EXTERNAL_NAMES -%define jpeg_simd_cpu_support jSiCpuSupport -%define jpeg_simd_os_support jSiOsSupport -%endif ; NEED_SHORT_EXTERNAL_NAMES - -; ---- jmorecfg.h ---------------------------------------------------------- -; -; BITS_IN_JSAMPLE==8 (8-bit sample values) is the only valid setting -; on this SIMD implementation. -; -%define BITS_IN_JSAMPLE 8 ; Caution: Cannot be changed - -; Representation of a single sample (pixel element value). -; On this SIMD implementation, this must be 'unsigned char'. 
-; -%define JSAMPLE byte ; unsigned char -%define SIZEOF_JSAMPLE SIZEOF_BYTE ; sizeof(JSAMPLE) -%define MAXJSAMPLE 255 -%define CENTERJSAMPLE 128 - -; Representation of a DCT frequency coefficient. -; On this SIMD implementation, this must be 'short'. -; -%define JCOEF word ; short -%define SIZEOF_JCOEF SIZEOF_WORD ; sizeof(JCOEF) - -; INT32 must hold at least signed 32-bit values. -; On this SIMD implementation, this must be 'long'. -; -%define INT32 dword ; long -%define SIZEOF_INT32 SIZEOF_DWORD ; sizeof(INT32) - -; Datatype used for image dimensions. -; On this SIMD implementation, this must be 'unsigned int'. -; -%define JDIMENSION dword ; unsigned int -%define SIZEOF_JDIMENSION SIZEOF_DWORD ; sizeof(JDIMENSION) - ; -------------------------------------------------------------------------- - -%define JSAMPROW POINTER ; JSAMPLE FAR * (jpeglib.h) -%define JSAMPARRAY POINTER ; JSAMPROW * (jpeglib.h) -%define JSAMPIMAGE POINTER ; JSAMPARRAY * (jpeglib.h) -%define JCOEFPTR POINTER ; JCOEF FAR * (jpeglib.h) -%define SIZEOF_JSAMPROW SIZEOF_POINTER ; sizeof(JSAMPROW) -%define SIZEOF_JSAMPARRAY SIZEOF_POINTER ; sizeof(JSAMPARRAY) -%define SIZEOF_JSAMPIMAGE SIZEOF_POINTER ; sizeof(JSAMPIMAGE) -%define SIZEOF_JCOEFPTR SIZEOF_POINTER ; sizeof(JCOEFPTR) - -%define POINTER dword ; general pointer type -%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) -%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT - -%define INT dword ; signed integer type -%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) -%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT - -%define FP32 dword ; IEEE754 single -%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) -%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT - -%define FP64 qword ; IEEE754 double -%define SIZEOF_FP64 SIZEOF_QWORD ; sizeof(FP64) -%define FP64_BIT QWORD_BIT ; sizeof(FP64)*BYTE_BIT - -%define FP80 tword ; IEEE754 double-extended(x86) -%define SIZEOF_FP80 SIZEOF_TWORD ; sizeof(FP80) -%define FP80_BIT TWORD_BIT ; sizeof(FP80)*BYTE_BIT - -%define MMWORD qword ; int64 (MMX register) -%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) -%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT - -%define XMMWORD dqword ; int128 (SSE register) -%define SIZEOF_XMMWORD SIZEOF_DQWORD ; sizeof(XMMWORD) -%define XMMWORD_BIT DQWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT - -%define SIZEOF_BYTE 1 ; sizeof(BYTE) -%define SIZEOF_WORD 2 ; sizeof(WORD) -%define SIZEOF_DWORD 4 ; sizeof(DWORD) -%define SIZEOF_QWORD 8 ; sizeof(QWORD) -%define SIZEOF_TBYTE 10 ; sizeof(TBYTE) -%define SIZEOF_TWORD 10 ; sizeof(TWORD) -%define SIZEOF_DQWORD 16 ; sizeof(DQWORD) - -%define BYTE_BIT 8 ; CHAR_BIT in C -%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT -%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT -%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT -%define TBYTE_BIT 80 ; sizeof(TBYTE)*BYTE_BIT -%define TWORD_BIT 80 ; sizeof(TWORD)*BYTE_BIT -%define DQWORD_BIT 128 ; sizeof(DQWORD)*BYTE_BIT - -%idefine TBYTE TWORD ; NASM uses the keyword 'TWORD' instead of 'TBYTE' -%idefine DQWORD ; currently not supported by NASM -%idefine _MMWORD ; -%idefine _DWORD ; +; Common types +; +%define POINTER dword ; general pointer type +%define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER) +%define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT + +%define INT dword ; signed integer type +%define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT) +%define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT + +%define FP32 dword ; IEEE754 single +%define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32) +%define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT + 
+%define MMWORD qword ; int64 (MMX register) +%define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD) +%define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT + +; NASM is buggy and doesn't properly handle operand sizes for SSE +; instructions, so for now we have to define XMMWORD as blank. +%define XMMWORD ; int128 (SSE register) +%define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD) +%define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT + ++; Similar hacks for when we load a dword or MMWORD into an xmm# register ++%define XMM_DWORD ++%define XMM_MMWORD ++ +%define SIZEOF_BYTE 1 ; sizeof(BYTE) +%define SIZEOF_WORD 2 ; sizeof(WORD) +%define SIZEOF_DWORD 4 ; sizeof(DWORD) +%define SIZEOF_QWORD 8 ; sizeof(QWORD) +%define SIZEOF_OWORD 16 ; sizeof(OWORD) + +%define BYTE_BIT 8 ; CHAR_BIT in C +%define WORD_BIT 16 ; sizeof(WORD)*BYTE_BIT +%define DWORD_BIT 32 ; sizeof(DWORD)*BYTE_BIT +%define QWORD_BIT 64 ; sizeof(QWORD)*BYTE_BIT +%define OWORD_BIT 128 ; sizeof(OWORD)*BYTE_BIT ; -------------------------------------------------------------------------- ; External Symbol Name