From: Pierre Ossman <ossman@cendio.se>
Date: Mon, 9 Mar 2009 13:31:56 +0000 (+0000)
Subject: Add SSE SIMD implementation of computationally intensive routines.
X-Git-Tag: 0.0.90~133
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=018fc42974f125bb8791eb81390137c562d15693;p=libjpeg-turbo

Add SSE SIMD implementation of computationally intensive routines.


git-svn-id: svn+ssh://svn.code.sf.net/p/libjpeg-turbo/code/trunk@21 632fc199-4ca6-4c93-a231-07263d6284db
---

018fc42974f125bb8791eb81390137c562d15693
diff --cc Makefile.am
index 9df4160,0000000..655e207
mode 100644,000000..100644
--- a/Makefile.am
+++ b/Makefile.am
@@@ -1,38 -1,0 +1,39 @@@
 +noinst_LTLIBRARIES = libjpeg.la
 +
 +HDRS = jchuff.h jdct.h jdhuff.h jerror.h jinclude.h jmemsys.h jmorecfg.h \
 +	jpegint.h jpeglib.h jversion.h jsimd.h jsimddct.h
 +
 +libjpeg_la_SOURCES = $(HDRS) jcapimin.c jcapistd.c jccoefct.c jccolor.c \
 +	jcdctmgr.c jchuff.c jcinit.c jcmainct.c jcmarker.c jcmaster.c \
 +	jcomapi.c jcparam.c jcphuff.c jcprepct.c jcsample.c jctrans.c \
 +	jdapimin.c jdapistd.c jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c \
 +	jddctmgr.c jdhuff.c jdinput.c jdmainct.c jdmarker.c jdmaster.c \
 +	jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c \
 +	jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c \
 +	jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c \
 +	jsimd.c
 +
 +if WITH_SIMD
 +
 +BUILT_SOURCES = simd/jsimdcfg.inc
 +
 +EXTRA_DIST = nasm_lt.sh
 +
 +libjpeg_la_SOURCES += simd/jsimd.h simd/jsimdcfg.inc.h \
 +	simd/jsimdext.inc simd/jcolsamp.inc simd/jdct.inc \
 +	simd/jsimdcpu.asm \
 +	simd/jccolmmx.asm simd/jdcolmmx.asm \
 +	simd/jcsammmx.asm simd/jdsammmx.asm simd/jdmermmx.asm \
 +	simd/jcqntmmx.asm simd/jfmmxfst.asm simd/jfmmxint.asm \
 +	simd/jimmxred.asm simd/jimmxint.asm simd/jimmxfst.asm \
- 	simd/jcqnt3dn.asm simd/jf3dnflt.asm simd/ji3dnflt.asm
++	simd/jcqnt3dn.asm simd/jf3dnflt.asm simd/ji3dnflt.asm \
++	simd/jcqntsse.asm simd/jfsseflt.asm simd/jisseflt.asm
 +
 +endif
 +
 +.asm.lo:
 +	$(LIBTOOL) --mode=compile --tag NASM ./nasm_lt.sh $(NASM) $(NAFLAGS) $< -o $@
 +
 +simd/jsimdcfg.inc: simd/jsimdcfg.inc.h jpeglib.h jconfig.h jmorecfg.h
 +	$(CPP) $< | grep ^[\;%] | sed 's%_cpp_protection_%%' > $@
 +
diff --cc jsimd.c
index d8acba5,0000000..6c60b5b
mode 100644,000000..100644
--- a/jsimd.c
+++ b/jsimd.c
@@@ -1,712 -1,0 +1,741 @@@
 +/*
 + * jsimd.c
 + *
 + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 + * 
 + * Based on the x86 SIMD extension for IJG JPEG library,
 + * Copyright (C) 1999-2006, MIYASAKA Masaru.
 + *
 + * This file contains the interface between the "normal" portions
 + * of the library and the SIMD implementations.
 + */
 +
 +#define JPEG_INTERNALS
 +#include "jinclude.h"
 +#include "jpeglib.h"
 +#include "jsimd.h"
 +#include "jdct.h"
 +#include "jsimddct.h"
 +#include "simd/jsimd.h"
 +
++/*
++ * In PIC builds constants may lose their alignment, so check it at runtime:
++ * true if ptr is a multiple of 2^order bytes (always 0 without WITH_SIMD).
++ */
++#ifdef WITH_SIMD
++#define IS_ALIGNED(ptr, order) (((size_t)(ptr) & (((size_t)1 << (order)) - 1)) == 0)
++#else
++#define IS_ALIGNED(ptr, order) (0)
++#endif
++
++#define IS_ALIGNED_SSE(ptr) (IS_ALIGNED((ptr), 4)) /* 16 byte alignment */
++
 +static unsigned int simd_support = ~0;
 +
 +/*
 + * Check what SIMD accelerations are supported.
 + *
 + * FIXME: This code is racy under a multi-threaded environment.
 + */
 +LOCAL(void)
 +init_simd (void)
 +{
 +  if (simd_support != ~0)
 +    return;
 +
 +#ifdef WITH_SIMD
 +  simd_support = jpeg_simd_cpu_support();
 +#else
 +  simd_support = JSIMD_NONE;
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_rgb_ycc (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_ycc_rgb (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
 +                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
 +                       JDIMENSION output_row, int num_rows)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_rgb_ycc_convert_mmx(cinfo->image_width, input_buf,
 +        output_buf, output_row, num_rows);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
 +                       JSAMPIMAGE input_buf, JDIMENSION input_row,
 +                       JSAMPARRAY output_buf, int num_rows)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_ycc_rgb_convert_mmx(cinfo->output_width, input_buf,
 +        input_row, output_buf, num_rows);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v2_downsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v1_downsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 +                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v2_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
 +        compptr->v_samp_factor, compptr->width_in_blocks,
 +        input_data, output_data);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
 +                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v1_downsample_mmx(cinfo->image_width, cinfo->max_v_samp_factor,
 +        compptr->v_samp_factor, compptr->width_in_blocks,
 +        input_data, output_data);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v2_upsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v1_upsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v2_upsample (j_decompress_ptr cinfo,
 +                     jpeg_component_info * compptr, 
 +                     JSAMPARRAY input_data,
 +                     JSAMPARRAY * output_data_ptr)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v2_upsample_mmx(cinfo->max_v_samp_factor,
 +        cinfo->output_width, input_data, output_data_ptr);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v1_upsample (j_decompress_ptr cinfo,
 +                     jpeg_component_info * compptr, 
 +                     JSAMPARRAY input_data,
 +                     JSAMPARRAY * output_data_ptr)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v1_upsample_mmx(cinfo->max_v_samp_factor,
 +        cinfo->output_width, input_data, output_data_ptr);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v2_fancy_upsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v1_fancy_upsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
 +                           jpeg_component_info * compptr, 
 +                           JSAMPARRAY input_data,
 +                           JSAMPARRAY * output_data_ptr)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v2_fancy_upsample_mmx(cinfo->max_v_samp_factor,
 +        compptr->downsampled_width, input_data, output_data_ptr);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
 +                           jpeg_component_info * compptr, 
 +                           JSAMPARRAY input_data,
 +                           JSAMPARRAY * output_data_ptr)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v1_fancy_upsample_mmx(cinfo->max_v_samp_factor,
 +        compptr->downsampled_width, input_data, output_data_ptr);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v2_merged_upsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_h2v1_merged_upsample (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
 +                            JSAMPIMAGE input_buf,
 +                            JDIMENSION in_row_group_ctr,
 +                            JSAMPARRAY output_buf)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v2_merged_upsample_mmx(cinfo->output_width, input_buf,
 +        in_row_group_ctr, output_buf);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
 +                            JSAMPIMAGE input_buf,
 +                            JDIMENSION in_row_group_ctr,
 +                            JSAMPARRAY output_buf)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_h2v1_merged_upsample_mmx(cinfo->output_width, input_buf,
 +        in_row_group_ctr, output_buf);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_convsamp (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if (sizeof(DCTELEM) != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_convsamp_float (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if (sizeof(FAST_FLOAT) != 4)
 +    return 0;
 +
++  if (simd_support & JSIMD_SSE)
++    return 1;
 +  if (simd_support & JSIMD_3DNOW)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
 +                DCTELEM * workspace)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_convsamp_mmx(sample_data, start_col, workspace);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
 +                      FAST_FLOAT * workspace)
 +{
 +#ifdef WITH_SIMD
-   if (simd_support & JSIMD_3DNOW)
++  if (simd_support & JSIMD_SSE)
++    jsimd_convsamp_float_sse(sample_data, start_col, workspace);
++  else if (simd_support & JSIMD_3DNOW)
 +    jsimd_convsamp_float_3dnow(sample_data, start_col, workspace);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_fdct_islow (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(DCTELEM) != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_fdct_ifast (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(DCTELEM) != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_fdct_float (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(FAST_FLOAT) != 4)
 +    return 0;
 +
++  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
++    return 1;
 +  if (simd_support & JSIMD_3DNOW)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_fdct_islow (DCTELEM * data)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_fdct_islow_mmx(data);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_fdct_ifast (DCTELEM * data)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_fdct_ifast_mmx(data);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_fdct_float (FAST_FLOAT * data)
 +{
 +#ifdef WITH_SIMD
-   if (simd_support & JSIMD_3DNOW)
++  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_fdct_float_sse))
++    jsimd_fdct_float_sse(data);
++  else if (simd_support & JSIMD_3DNOW)
 +    jsimd_fdct_float_3dnow(data);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_quantize (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(JCOEF) != 2)
 +    return 0;
 +  if (sizeof(DCTELEM) != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_quantize_float (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(JCOEF) != 2)
 +    return 0;
 +  if (sizeof(FAST_FLOAT) != 4)
 +    return 0;
 +
++  if (simd_support & JSIMD_SSE)
++    return 1;
 +  if (simd_support & JSIMD_3DNOW)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
 +                DCTELEM * workspace)
 +{
 +#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_quantize_mmx(coef_block, divisors, workspace);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
 +                      FAST_FLOAT * workspace)
 +{
 +#ifdef WITH_SIMD
-   if (simd_support & JSIMD_3DNOW)
++  if (simd_support & JSIMD_SSE)
++    jsimd_quantize_float_sse(coef_block, divisors, workspace);
++  else if (simd_support & JSIMD_3DNOW)
 +    jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_idct_2x2 (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(JCOEF) != 2)
 +    return 0;
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if (sizeof(ISLOW_MULT_TYPE) != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_idct_4x4 (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(JCOEF) != 2)
 +    return 0;
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if (sizeof(ISLOW_MULT_TYPE) != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 +                JCOEFPTR coef_block, JSAMPARRAY output_buf,
 +                JDIMENSION output_col)
 +{
- #if WITH_SIMD
++#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_idct_2x2_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 +                JCOEFPTR coef_block, JSAMPARRAY output_buf,
 +                JDIMENSION output_col)
 +{
- #if WITH_SIMD
++#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_idct_4x4_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 +#endif
 +}
 +
 +GLOBAL(int)
 +jsimd_can_idct_islow (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(JCOEF) != 2)
 +    return 0;
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if (sizeof(ISLOW_MULT_TYPE) != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_idct_ifast (void)
 +{
 +  init_simd();
 +
 +  /* The code is optimised for these values only */
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(JCOEF) != 2)
 +    return 0;
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if (sizeof(IFAST_MULT_TYPE) != 2)
 +    return 0;
 +  if (IFAST_SCALE_BITS != 2)
 +    return 0;
 +
 +  if (simd_support & JSIMD_MMX)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(int)
 +jsimd_can_idct_float (void)
 +{
 +  init_simd();
 +
 +  if (DCTSIZE != 8)
 +    return 0;
 +  if (sizeof(JCOEF) != 2)
 +    return 0;
 +  if (BITS_IN_JSAMPLE != 8)
 +    return 0;
 +  if (sizeof(JDIMENSION) != 4)
 +    return 0;
 +  if (sizeof(FAST_FLOAT) != 4)
 +    return 0;
 +  if (sizeof(FLOAT_MULT_TYPE) != 4)
 +    return 0;
 +
++  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
++    return 1;
 +  if (simd_support & JSIMD_3DNOW)
 +    return 1;
 +
 +  return 0;
 +}
 +
 +GLOBAL(void)
 +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 +                JCOEFPTR coef_block, JSAMPARRAY output_buf,
 +                JDIMENSION output_col)
 +{
- #if WITH_SIMD
++#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_idct_islow_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 +                JCOEFPTR coef_block, JSAMPARRAY output_buf,
 +                JDIMENSION output_col)
 +{
- #if WITH_SIMD
++#ifdef WITH_SIMD
 +  if (simd_support & JSIMD_MMX)
 +    jsimd_idct_ifast_mmx(compptr->dct_table, coef_block, output_buf, output_col);
 +#endif
 +}
 +
 +GLOBAL(void)
 +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 +                JCOEFPTR coef_block, JSAMPARRAY output_buf,
 +                JDIMENSION output_col)
 +{
- #if WITH_SIMD
++#ifdef WITH_SIMD
-   if (simd_support & JSIMD_3DNOW)
++  if ((simd_support & JSIMD_SSE) && IS_ALIGNED_SSE(jconst_idct_float_sse))
++    jsimd_idct_float_sse(compptr->dct_table, coef_block,
++        output_buf, output_col);
++  else if (simd_support & JSIMD_3DNOW)
 +    jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
 +        output_buf, output_col);
 +#endif
 +}
 +
diff --cc simd/jcqntsse.asm
index 0000000,fe99a20..da7a3ff
mode 000000,100644..100644
--- a/simd/jcqntsse.asm
+++ b/simd/jcqntsse.asm
@@@ -1,0 -1,218 +1,208 @@@
+ ;
+ ; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
+ ;
++; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
++;
++; Based on
+ ; x86 SIMD extension for IJG JPEG library
+ ; Copyright (C) 1999-2006, MIYASAKA Masaru.
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc
+ ;
+ ; This file should be assembled with NASM (Netwide Assembler),
+ ; can *not* be assembled with Microsoft's MASM or any compatible
+ ; assembler (including Borland's Turbo Assembler).
+ ; NASM is available from http://nasm.sourceforge.net/ or
+ ; http://sourceforge.net/project/showfiles.php?group_id=6208
+ ;
 -; Last Modified : January 12, 2005
 -;
+ ; [TAB8]
+ 
 -%include "jsimdext.inc"
 -%include "jdct.inc"
 -
 -%ifdef DCT_FLOAT_SUPPORTED
 -%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
 -
 -; This module is specialized to the case DCTSIZE = 8.
 -;
 -%if DCTSIZE != 8
 -%error "Sorry, this code only copes with 8x8 DCTs."
 -%endif
++%include "simd/jsimdext.inc"
++%include "simd/jdct.inc"
+ 
+ ; --------------------------------------------------------------------------
+ 	SECTION	SEG_TEXT
+ 	BITS	32
+ ;
+ ; Load data into workspace, applying unsigned->signed conversion
+ ;
+ ; GLOBAL(void)
 -; jpeg_convsamp_flt_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
 -;                        FAST_FLOAT * workspace);
++; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
++;                           FAST_FLOAT * workspace);
+ ;
+ 
+ %define sample_data	ebp+8		; JSAMPARRAY sample_data
+ %define start_col	ebp+12		; JDIMENSION start_col
+ %define workspace	ebp+16		; FAST_FLOAT * workspace
+ 
+ 	align	16
 -	global	EXTN(jpeg_convsamp_flt_sse)
++	global	EXTN(jsimd_convsamp_float_sse)
+ 
 -EXTN(jpeg_convsamp_flt_sse):
++EXTN(jsimd_convsamp_float_sse):
+ 	push	ebp
+ 	mov	ebp,esp
+ 	push	ebx
+ ;	push	ecx		; need not be preserved
+ ;	push	edx		; need not be preserved
+ 	push	esi
+ 	push	edi
+ 
+ 	pcmpeqw  mm7,mm7
+ 	psllw    mm7,7
+ 	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
+ 
+ 	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
+ 	mov	eax, JDIMENSION [start_col]
 -	mov	edi, POINTER [workspace]	; (DCTELEM *)
++	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
+ 	mov	ecx, DCTSIZE/2
+ 	alignx	16,7
+ .convloop:
+ 	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+ 	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
+ 
+ 	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
+ 	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
+ 
+ 	psubb	mm0,mm7				; mm0=(01234567)
+ 	psubb	mm1,mm7				; mm1=(89ABCDEF)
+ 
+ 	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
+ 	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
+ 	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
+ 	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
+ 
+ 	punpcklwd mm4,mm2			; mm4=(***0***1)
+ 	punpckhwd mm2,mm2			; mm2=(***2***3)
+ 	punpcklwd mm5,mm0			; mm5=(***4***5)
+ 	punpckhwd mm0,mm0			; mm0=(***6***7)
+ 
+ 	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
+ 	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
+ 	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
+ 	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
+ 	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
+ 	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
+ 	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
+ 	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
+ 
+ 	punpcklwd mm6,mm3			; mm6=(***8***9)
+ 	punpckhwd mm3,mm3			; mm3=(***A***B)
+ 	punpcklwd mm4,mm1			; mm4=(***C***D)
+ 	punpckhwd mm1,mm1			; mm1=(***E***F)
+ 
+ 	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
+ 	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
+ 	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
+ 	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
+ 	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
+ 	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
+ 	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
+ 	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
+ 
+ 	movlhps   xmm0,xmm1			; xmm0=(0123)
+ 	movlhps   xmm2,xmm3			; xmm2=(4567)
+ 	movlhps   xmm4,xmm5			; xmm4=(89AB)
+ 	movlhps   xmm6,xmm7			; xmm6=(CDEF)
+ 
+ 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
+ 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ 
+ 	add	esi, byte 2*SIZEOF_JSAMPROW
+ 	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
+ 	dec	ecx
+ 	jnz	near .convloop
+ 
+ 	emms		; empty MMX state
+ 
+ 	pop	edi
+ 	pop	esi
+ ;	pop	edx		; need not be preserved
+ ;	pop	ecx		; need not be preserved
+ 	pop	ebx
+ 	pop	ebp
+ 	ret
+ 
+ 
+ ; --------------------------------------------------------------------------
+ ;
+ ; Quantize/descale the coefficients, and store into coef_block
+ ;
+ ; GLOBAL(void)
 -; jpeg_quantize_flt_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
 -;                        FAST_FLOAT * workspace);
++; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
++;                           FAST_FLOAT * workspace);
+ ;
+ 
+ %define coef_block	ebp+8		; JCOEFPTR coef_block
+ %define divisors	ebp+12		; FAST_FLOAT * divisors
+ %define workspace	ebp+16		; FAST_FLOAT * workspace
+ 
+ 	align	16
 -	global	EXTN(jpeg_quantize_flt_sse)
++	global	EXTN(jsimd_quantize_float_sse)
+ 
 -EXTN(jpeg_quantize_flt_sse):
++EXTN(jsimd_quantize_float_sse):
+ 	push	ebp
+ 	mov	ebp,esp
+ ;	push	ebx		; unused
+ ;	push	ecx		; unused
+ ;	push	edx		; need not be preserved
+ 	push	esi
+ 	push	edi
+ 
+ 	mov	esi, POINTER [workspace]
+ 	mov	edx, POINTER [divisors]
+ 	mov	edi, JCOEFPTR [coef_block]
+ 	mov	eax, DCTSIZE2/16
+ 	alignx	16,7
+ .quantloop:
+ 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
+ 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ 	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
+ 	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ 	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ 
+ 	movhlps  xmm4,xmm0
+ 	movhlps  xmm5,xmm1
+ 
+ 	cvtps2pi mm0,xmm0
+ 	cvtps2pi mm1,xmm1
+ 	cvtps2pi mm4,xmm4
+ 	cvtps2pi mm5,xmm5
+ 
+ 	movhlps  xmm6,xmm2
+ 	movhlps  xmm7,xmm3
+ 
+ 	cvtps2pi mm2,xmm2
+ 	cvtps2pi mm3,xmm3
+ 	cvtps2pi mm6,xmm6
+ 	cvtps2pi mm7,xmm7
+ 
+ 	packssdw mm0,mm4
+ 	packssdw mm1,mm5
+ 	packssdw mm2,mm6
+ 	packssdw mm3,mm7
+ 
+ 	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
+ 	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
+ 	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
+ 	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
+ 
+ 	add	esi, byte 16*SIZEOF_FAST_FLOAT
+ 	add	edx, byte 16*SIZEOF_FAST_FLOAT
+ 	add	edi, byte 16*SIZEOF_JCOEF
+ 	dec	eax
+ 	jnz	short .quantloop
+ 
+ 	emms		; empty MMX state
+ 
+ 	pop	edi
+ 	pop	esi
+ ;	pop	edx		; need not be preserved
+ ;	pop	ecx		; unused
+ ;	pop	ebx		; unused
+ 	pop	ebp
+ 	ret
+ 
 -%endif ; JFDCT_FLT_SSE_MMX_SUPPORTED
 -%endif ; DCT_FLOAT_SUPPORTED
diff --cc simd/jdct.inc
index e53f66e,0000000..cc62704
mode 100644,000000..100644
--- a/simd/jdct.inc
+++ b/simd/jdct.inc
@@@ -1,27 -1,0 +1,28 @@@
 +;
 +; jdct.inc - private declarations for forward & reverse DCT subsystems
 +;
 +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 +;
 +; Based on
 +; x86 SIMD extension for IJG JPEG library
 +; Copyright (C) 1999-2006, MIYASAKA Masaru.
 +; For conditions of distribution and use, see copyright notice in jsimdext.inc
 +;
 +; [TAB8]
 +
 +; Each IDCT routine is responsible for range-limiting its results and
 +; converting them to unsigned form (0..MAXJSAMPLE).  The raw outputs could
 +; be quite far out of range if the input data is corrupt, so a bulletproof
 +; range-limiting step is required.  We use a mask-and-table-lookup method
 +; to do the combined operations quickly.
 +;
 +%define RANGE_MASK  (MAXJSAMPLE * 4 + 3)  ; 2 bits wider than legal samples
 +
 +%define ROW(n,b,s)		((b)+(n)*(s))
 +%define COL(n,b,s)		((b)+(n)*(s)*DCTSIZE)
 +
 +%define DWBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_DWORD)
 +%define MMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_MMWORD)
++%define XMMBLOCK(m,n,b,s)	((b)+(m)*DCTSIZE*(s)+(n)*SIZEOF_XMMWORD)
 +
 +; --------------------------------------------------------------------------
diff --cc simd/jfsseflt.asm
index 0000000,98b0973..6469f3c
mode 000000,100644..100644
--- a/simd/jfsseflt.asm
+++ b/simd/jfsseflt.asm
@@@ -1,0 -1,383 +1,367 @@@
+ ;
+ ; jfsseflt.asm - floating-point FDCT (SSE)
+ ;
++; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
++;
++; Based on
+ ; x86 SIMD extension for IJG JPEG library
+ ; Copyright (C) 1999-2006, MIYASAKA Masaru.
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc
+ ;
+ ; This file should be assembled with NASM (Netwide Assembler); it
+ ; can *not* be assembled with Microsoft's MASM or any compatible
+ ; assembler (including Borland's Turbo Assembler).
+ ; NASM is available from http://nasm.sourceforge.net/ or
+ ; http://sourceforge.net/project/showfiles.php?group_id=6208
+ ;
+ ; This file contains a floating-point implementation of the forward DCT
+ ; (Discrete Cosine Transform). The following code is based directly on
+ ; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
+ ;
 -; Last Modified : February 4, 2006
 -;
+ ; [TAB8]
+ 
 -%include "jsimdext.inc"
 -%include "jdct.inc"
 -
 -%ifdef DCT_FLOAT_SUPPORTED
 -%ifdef JFDCT_FLT_SSE_MMX_SUPPORTED
 -%define JFDCT_FLT_SSE_SUPPORTED
 -%endif
 -%ifdef JFDCT_FLT_SSE_SSE2_SUPPORTED
 -%define JFDCT_FLT_SSE_SUPPORTED
 -%endif
 -%ifdef JFDCT_FLT_SSE_SUPPORTED
 -
 -; This module is specialized to the case DCTSIZE = 8.
 -;
 -%if DCTSIZE != 8
 -%error "Sorry, this code only copes with 8x8 DCTs."
 -%endif
++%include "simd/jsimdext.inc"
++%include "simd/jdct.inc"
+ 
+ ; --------------------------------------------------------------------------
+ 
+ %macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ 	shufps	%1,%2,0x44	; imm 0x44 picks lanes %1[0],%1[1],%2[0],%2[1]
+ %endmacro
+ 
+ %macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ 	shufps	%1,%2,0xEE	; imm 0xEE picks lanes %1[2],%1[3],%2[2],%2[3]
+ %endmacro
+ 
+ ; --------------------------------------------------------------------------
+ 	SECTION	SEG_CONST
+ 
+ 	alignz	16		; constants are read as 16-byte memory operands of mulps
+ 	global	EXTN(jconst_fdct_float_sse)
+ 
+ EXTN(jconst_fdct_float_sse):
+ 
+ PD_0_382	times 4 dd  0.382683432365089771728460	; cos(3*pi/8), one copy per lane
+ PD_0_707	times 4 dd  0.707106781186547524400844	; cos(pi/4), one copy per lane
+ PD_0_541	times 4 dd  0.541196100146196984399723	; cos(pi/8)-cos(3*pi/8), one copy per lane
+ PD_1_306	times 4 dd  1.306562964876376527856643	; cos(pi/8)+cos(3*pi/8), one copy per lane
+ 
+ 	alignz	16
+ 
+ ; --------------------------------------------------------------------------
+ 	SECTION	SEG_TEXT
+ 	BITS	32
+ ;
+ ; Perform the forward DCT on one block of samples.
+ ;
+ ; GLOBAL(void)
 -; jpeg_fdct_float_sse (FAST_FLOAT * data)
++; jsimd_fdct_float_sse (FAST_FLOAT * data)
+ ;
+ 
+ %define data(b)		(b)+8		; FAST_FLOAT * data
+ 
+ %define original_ebp	ebp+0
+ %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+ %define WK_NUM		2
+ 
+ 	align	16
 -	global	EXTN(jpeg_fdct_float_sse)
++	global	EXTN(jsimd_fdct_float_sse)
+ 
 -EXTN(jpeg_fdct_float_sse):
++EXTN(jsimd_fdct_float_sse):
+ 	push	ebp
+ 	mov	eax,esp				; eax = original ebp (address of the saved ebp slot)
+ 	sub	esp, byte 4			; make room for the original-ebp slot
+ 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+ 	mov	[esp],eax			; [aligned esp] = original ebp
+ 	mov	ebp,esp				; ebp = aligned ebp
+ 	lea	esp, [wk(0)]			; reserve wk[0..WK_NUM-1] below aligned ebp
+ 	pushpic	ebx
+ ;	push	ecx		; need not be preserved
+ ;	push	edx		; need not be preserved
+ ;	push	esi		; unused
+ ;	push	edi		; unused
+ 
+ 	get_GOT	ebx		; get GOT address
+ 
+ 	; ---- Pass 1: process rows.
+ 
+ 	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
+ 	mov	ecx, DCTSIZE/4			; ctr: each iteration handles 4 rows
+ 	alignx	16,7
+ .rowloop:
+ 
+ 	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm2, XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm3, XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)]
+ 
+ 	; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
+ 	; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
+ 
+ 	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+ 	unpcklps xmm0,xmm1		; xmm0=(20 30 21 31)
+ 	unpckhps xmm4,xmm1		; xmm4=(22 32 23 33)
+ 	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+ 	unpcklps xmm2,xmm3		; xmm2=(24 34 25 35)
+ 	unpckhps xmm5,xmm3		; xmm5=(26 36 27 37)
+ 
+ 	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
+ 
+ 	; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
+ 	; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
+ 
+ 	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 32 23 33)
+ 	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(24 34 25 35)
+ 
+ 	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+ 	unpcklps xmm6,xmm7		; xmm6=(00 10 01 11)
+ 	unpckhps xmm4,xmm7		; xmm4=(02 12 03 13)
+ 	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+ 	unpcklps xmm1,xmm3		; xmm1=(04 14 05 15)
+ 	unpckhps xmm2,xmm3		; xmm2=(06 16 07 17)
+ 
+ 	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm6,xmm0		; xmm6=(00 10 20 30)=data0
+ 	unpckhps2 xmm7,xmm0		; xmm7=(01 11 21 31)=data1
+ 	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm2,xmm5		; xmm2=(06 16 26 36)=data6
+ 	unpckhps2 xmm3,xmm5		; xmm3=(07 17 27 37)=data7
+ 
+ 	movaps	xmm0,xmm7
+ 	movaps	xmm5,xmm6
+ 	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+ 	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+ 	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+ 	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+ 
+ 	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 32 23 33)
+ 	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(24 34 25 35)
+ 	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+ 	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+ 
+ 	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm4,xmm2		; xmm4=(02 12 22 32)=data2
+ 	unpckhps2 xmm7,xmm2		; xmm7=(03 13 23 33)=data3
+ 	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm1,xmm3		; xmm1=(04 14 24 34)=data4
+ 	unpckhps2 xmm6,xmm3		; xmm6=(05 15 25 35)=data5
+ 
+ 	movaps	xmm2,xmm7
+ 	movaps	xmm3,xmm4
+ 	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+ 	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+ 	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+ 	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+ 
+ 	; -- Even part
+ 
+ 	movaps	xmm1,xmm5
+ 	movaps	xmm6,xmm0
+ 	subps	xmm5,xmm7		; xmm5=tmp13
+ 	subps	xmm0,xmm4		; xmm0=tmp12
+ 	addps	xmm1,xmm7		; xmm1=tmp10
+ 	addps	xmm6,xmm4		; xmm6=tmp11
+ 
+ 	addps	xmm0,xmm5		; xmm0=tmp12+tmp13
+ 	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+ 
+ 	movaps	xmm7,xmm1
+ 	movaps	xmm4,xmm5
+ 	subps	xmm1,xmm6		; xmm1=data4
+ 	subps	xmm5,xmm0		; xmm5=data6
+ 	addps	xmm7,xmm6		; xmm7=data0
+ 	addps	xmm4,xmm0		; xmm4=data2
+ 
+ 	movaps	XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], xmm1
+ 	movaps	XMMWORD [XMMBLOCK(2,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ 	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ 	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+ 
+ 	; -- Odd part
+ 
+ 	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+ 	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+ 
+ 	addps	xmm2,xmm3		; xmm2=tmp10
+ 	addps	xmm3,xmm6		; xmm3=tmp11
+ 	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+ 
+ 	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+ 
+ 	movaps	xmm1,xmm2		; xmm1=tmp10
+ 	subps	xmm2,xmm6		; xmm2=tmp10-tmp12
+ 	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ 	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ 	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ 	addps	xmm1,xmm2		; xmm1=z2
+ 	addps	xmm6,xmm2		; xmm6=z4
+ 
+ 	movaps	xmm5,xmm0
+ 	subps	xmm0,xmm3		; xmm0=z13
+ 	addps	xmm5,xmm3		; xmm5=z11
+ 
+ 	movaps	xmm7,xmm0
+ 	movaps	xmm4,xmm5
+ 	subps	xmm0,xmm1		; xmm0=data3
+ 	subps	xmm5,xmm6		; xmm5=data7
+ 	addps	xmm7,xmm1		; xmm7=data5
+ 	addps	xmm4,xmm6		; xmm4=data1
+ 
+ 	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ 	movaps	XMMWORD [XMMBLOCK(3,1,edx,SIZEOF_FAST_FLOAT)], xmm5
+ 	movaps	XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], xmm7
+ 	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+ 
+ 	add	edx, 4*DCTSIZE*SIZEOF_FAST_FLOAT	; step to the next 4 rows
+ 	dec	ecx
+ 	jnz	near .rowloop
+ 
+ 	; ---- Pass 2: process columns.
+ 
+ 	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *), eax is unchanged since the prologue
+ 	mov	ecx, DCTSIZE/4			; ctr: each iteration handles 4 columns
+ 	alignx	16,7
+ .columnloop:
+ 
+ 	movaps	xmm0, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm1, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm2, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm3, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
+ 
+ 	; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
+ 	; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
+ 
+ 	movaps   xmm4,xmm0		; transpose coefficients(phase 1)
+ 	unpcklps xmm0,xmm1		; xmm0=(02 03 12 13)
+ 	unpckhps xmm4,xmm1		; xmm4=(22 23 32 33)
+ 	movaps   xmm5,xmm2		; transpose coefficients(phase 1)
+ 	unpcklps xmm2,xmm3		; xmm2=(42 43 52 53)
+ 	unpckhps xmm5,xmm3		; xmm5=(62 63 72 73)
+ 
+ 	movaps	xmm6, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm7, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm1, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm3, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
+ 
+ 	; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
+ 	; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
+ 
+ 	movaps	XMMWORD [wk(0)], xmm4	; wk(0)=(22 23 32 33)
+ 	movaps	XMMWORD [wk(1)], xmm2	; wk(1)=(42 43 52 53)
+ 
+ 	movaps   xmm4,xmm6		; transpose coefficients(phase 1)
+ 	unpcklps xmm6,xmm7		; xmm6=(00 01 10 11)
+ 	unpckhps xmm4,xmm7		; xmm4=(20 21 30 31)
+ 	movaps   xmm2,xmm1		; transpose coefficients(phase 1)
+ 	unpcklps xmm1,xmm3		; xmm1=(40 41 50 51)
+ 	unpckhps xmm2,xmm3		; xmm2=(60 61 70 71)
+ 
+ 	movaps    xmm7,xmm6		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm6,xmm0		; xmm6=(00 01 02 03)=data0
+ 	unpckhps2 xmm7,xmm0		; xmm7=(10 11 12 13)=data1
+ 	movaps    xmm3,xmm2		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm2,xmm5		; xmm2=(60 61 62 63)=data6
+ 	unpckhps2 xmm3,xmm5		; xmm3=(70 71 72 73)=data7
+ 
+ 	movaps	xmm0,xmm7
+ 	movaps	xmm5,xmm6
+ 	subps	xmm7,xmm2		; xmm7=data1-data6=tmp6
+ 	subps	xmm6,xmm3		; xmm6=data0-data7=tmp7
+ 	addps	xmm0,xmm2		; xmm0=data1+data6=tmp1
+ 	addps	xmm5,xmm3		; xmm5=data0+data7=tmp0
+ 
+ 	movaps	xmm2, XMMWORD [wk(0)]	; xmm2=(22 23 32 33)
+ 	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=(42 43 52 53)
+ 	movaps	XMMWORD [wk(0)], xmm7	; wk(0)=tmp6
+ 	movaps	XMMWORD [wk(1)], xmm6	; wk(1)=tmp7
+ 
+ 	movaps    xmm7,xmm4		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm4,xmm2		; xmm4=(20 21 22 23)=data2
+ 	unpckhps2 xmm7,xmm2		; xmm7=(30 31 32 33)=data3
+ 	movaps    xmm6,xmm1		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm1,xmm3		; xmm1=(40 41 42 43)=data4
+ 	unpckhps2 xmm6,xmm3		; xmm6=(50 51 52 53)=data5
+ 
+ 	movaps	xmm2,xmm7
+ 	movaps	xmm3,xmm4
+ 	addps	xmm7,xmm1		; xmm7=data3+data4=tmp3
+ 	addps	xmm4,xmm6		; xmm4=data2+data5=tmp2
+ 	subps	xmm2,xmm1		; xmm2=data3-data4=tmp4
+ 	subps	xmm3,xmm6		; xmm3=data2-data5=tmp5
+ 
+ 	; -- Even part
+ 
+ 	movaps	xmm1,xmm5
+ 	movaps	xmm6,xmm0
+ 	subps	xmm5,xmm7		; xmm5=tmp13
+ 	subps	xmm0,xmm4		; xmm0=tmp12
+ 	addps	xmm1,xmm7		; xmm1=tmp10
+ 	addps	xmm6,xmm4		; xmm6=tmp11
+ 
+ 	addps	xmm0,xmm5		; xmm0=tmp12+tmp13
+ 	mulps	xmm0,[GOTOFF(ebx,PD_0_707)] ; xmm0=z1
+ 
+ 	movaps	xmm7,xmm1
+ 	movaps	xmm4,xmm5
+ 	subps	xmm1,xmm6		; xmm1=data4
+ 	subps	xmm5,xmm0		; xmm5=data6
+ 	addps	xmm7,xmm6		; xmm7=data0
+ 	addps	xmm4,xmm0		; xmm4=data2
+ 
+ 	movaps	XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], xmm1
+ 	movaps	XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ 	movaps	XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ 	movaps	XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+ 
+ 	; -- Odd part
+ 
+ 	movaps	xmm6, XMMWORD [wk(0)]	; xmm6=tmp6
+ 	movaps	xmm0, XMMWORD [wk(1)]	; xmm0=tmp7
+ 
+ 	addps	xmm2,xmm3		; xmm2=tmp10
+ 	addps	xmm3,xmm6		; xmm3=tmp11
+ 	addps	xmm6,xmm0		; xmm6=tmp12, xmm0=tmp7
+ 
+ 	mulps	xmm3,[GOTOFF(ebx,PD_0_707)] ; xmm3=z3
+ 
+ 	movaps	xmm1,xmm2		; xmm1=tmp10
+ 	subps	xmm2,xmm6		; xmm2=tmp10-tmp12
+ 	mulps	xmm2,[GOTOFF(ebx,PD_0_382)] ; xmm2=z5
+ 	mulps	xmm1,[GOTOFF(ebx,PD_0_541)] ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
+ 	mulps	xmm6,[GOTOFF(ebx,PD_1_306)] ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
+ 	addps	xmm1,xmm2		; xmm1=z2
+ 	addps	xmm6,xmm2		; xmm6=z4
+ 
+ 	movaps	xmm5,xmm0
+ 	subps	xmm0,xmm3		; xmm0=z13
+ 	addps	xmm5,xmm3		; xmm5=z11
+ 
+ 	movaps	xmm7,xmm0
+ 	movaps	xmm4,xmm5
+ 	subps	xmm0,xmm1		; xmm0=data3
+ 	subps	xmm5,xmm6		; xmm5=data7
+ 	addps	xmm7,xmm1		; xmm7=data5
+ 	addps	xmm4,xmm6		; xmm4=data1
+ 
+ 	movaps	XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], xmm0
+ 	movaps	XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], xmm5
+ 	movaps	XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], xmm7
+ 	movaps	XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], xmm4
+ 
+ 	add	edx, byte 4*SIZEOF_FAST_FLOAT	; step to the next 4 columns
+ 	dec	ecx
+ 	jnz	near .columnloop
+ 
+ ;	pop	edi		; unused
+ ;	pop	esi		; unused
+ ;	pop	edx		; need not be preserved
+ ;	pop	ecx		; need not be preserved
+ 	poppic	ebx
+ 	mov	esp,ebp		; esp <- aligned ebp
+ 	pop	esp		; esp <- original ebp
+ 	pop	ebp		; restore the caller's ebp
+ 	ret
+ 
 -%endif ; JFDCT_FLT_SSE_SUPPORTED
 -%endif ; DCT_FLOAT_SUPPORTED
diff --cc simd/jisseflt.asm
index 0000000,20eaeeb..6de93ad
mode 000000,100644..100644
--- a/simd/jisseflt.asm
+++ b/simd/jisseflt.asm
@@@ -1,0 -1,582 +1,569 @@@
+ ;
+ ; jisseflt.asm - floating-point IDCT (SSE & MMX)
+ ;
++; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
++;
++; Based on
+ ; x86 SIMD extension for IJG JPEG library
+ ; Copyright (C) 1999-2006, MIYASAKA Masaru.
+ ; For conditions of distribution and use, see copyright notice in jsimdext.inc
+ ;
+ ; This file should be assembled with NASM (Netwide Assembler); it
+ ; can *not* be assembled with Microsoft's MASM or any compatible
+ ; assembler (including Borland's Turbo Assembler).
+ ; NASM is available from http://nasm.sourceforge.net/ or
+ ; http://sourceforge.net/project/showfiles.php?group_id=6208
+ ;
+ ; This file contains a floating-point implementation of the inverse DCT
+ ; (Discrete Cosine Transform). The following code is based directly on
+ ; the IJG's original jidctflt.c; see jidctflt.c for more details.
+ ;
 -; Last Modified : February 4, 2006
 -;
+ ; [TAB8]
+ 
 -%include "jsimdext.inc"
 -%include "jdct.inc"
 -
 -%ifdef DCT_FLOAT_SUPPORTED
 -%ifdef JIDCT_FLT_SSE_MMX_SUPPORTED
 -
 -; This module is specialized to the case DCTSIZE = 8.
 -;
 -%if DCTSIZE != 8
 -%error "Sorry, this code only copes with 8x8 DCTs."
 -%endif
++%include "simd/jsimdext.inc"
++%include "simd/jdct.inc"
+ 
+ ; --------------------------------------------------------------------------
+ 
+ %macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
+ 	shufps	%1,%2,0x44	; imm 0x44 picks lanes %1[0],%1[1],%2[0],%2[1]
+ %endmacro
+ 
+ %macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
+ 	shufps	%1,%2,0xEE	; imm 0xEE picks lanes %1[2],%1[3],%2[2],%2[3]
+ %endmacro
+ 
+ ; --------------------------------------------------------------------------
+ 	SECTION	SEG_CONST
+ 
+ 	alignz	16		; constants are read as 16-byte memory operands (mulps/movaps)
+ 	global	EXTN(jconst_idct_float_sse)
+ 
+ EXTN(jconst_idct_float_sse):
+ 
+ PD_1_414	times 4 dd  1.414213562373095048801689	; sqrt(2), one copy per lane
+ PD_1_847	times 4 dd  1.847759065022573512256366	; 2*cos(pi/8), one copy per lane
+ PD_1_082	times 4 dd  1.082392200292393968799446	; 2*(cos(pi/8)-cos(3*pi/8)), one copy per lane
+ PD_M2_613	times 4 dd -2.613125929752753055713286	; -2*(cos(pi/8)+cos(3*pi/8)), one copy per lane
+ PD_0_125	times 4 dd  0.125	; 1/8
+ PB_CENTERJSAMP	times 8 db  CENTERJSAMPLE	; one copy per byte of an MMX word (added with paddb)
+ 
+ 	alignz	16
+ 
+ ; --------------------------------------------------------------------------
+ 	SECTION	SEG_TEXT
+ 	BITS	32
+ ;
+ ; Perform dequantization and inverse DCT on one block of coefficients.
+ ;
+ ; GLOBAL(void)
 -; jpeg_idct_float_sse (j_decompress_ptr cinfo, jpeg_component_info * compptr,
 -;                      JCOEFPTR coef_block,
 -;                      JSAMPARRAY output_buf, JDIMENSION output_col)
++; jsimd_idct_float_sse (void * dct_table, JCOEFPTR coef_block,
++;                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ ;
+ 
 -%define cinfo(b)	(b)+8		; j_decompress_ptr cinfo
 -%define compptr(b)	(b)+12		; jpeg_component_info * compptr
 -%define coef_block(b)	(b)+16		; JCOEFPTR coef_block
 -%define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
 -%define output_col(b)	(b)+24		; JDIMENSION output_col
++%define dct_table(b)	(b)+8			; void * dct_table
++%define coef_block(b)	(b)+12		; JCOEFPTR coef_block
++%define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
++%define output_col(b)	(b)+20		; JDIMENSION output_col
+ 
+ %define original_ebp	ebp+0
+ %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
+ %define WK_NUM		2
+ %define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
+ 					; FAST_FLOAT workspace[DCTSIZE2]
+ 
+ 	align	16
 -	global	EXTN(jpeg_idct_float_sse)
++	global	EXTN(jsimd_idct_float_sse)
+ 
 -EXTN(jpeg_idct_float_sse):
++EXTN(jsimd_idct_float_sse):
+ 	push	ebp
+ 	mov	eax,esp				; eax = original ebp (address of the saved ebp slot)
+ 	sub	esp, byte 4			; make room for the original-ebp slot
+ 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
+ 	mov	[esp],eax			; [aligned esp] = original ebp
+ 	mov	ebp,esp				; ebp = aligned ebp
+ 	lea	esp, [workspace]		; reserve wk[] and workspace[] below aligned ebp
+ 	push	ebx
+ ;	push	ecx		; need not be preserved
+ ;	push	edx		; need not be preserved
+ 	push	esi
+ 	push	edi
+ 
+ 	get_GOT	ebx		; get GOT address
+ 
+ 	; ---- Pass 1: process columns from input, store into work array.
+ 
+ ;	mov	eax, [original_ebp]
 -	mov	edx, POINTER [compptr(eax)]
 -	mov	edx, POINTER [jcompinfo_dct_table(edx)]	; quantptr
++	mov	edx, POINTER [dct_table(eax)]	; quantptr
+ 	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
+ 	lea	edi, [workspace]			; FAST_FLOAT * wsptr
+ 	mov	ecx, DCTSIZE/4				; ctr
+ 	alignx	16,7
+ .columnloop:
+ %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
+ 	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]	; quick test: first dword of row 1 ...
+ 	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]	; ... ORed with first dword of row 2
+ 	jnz	near .columnDCT				; nonzero: do the full IDCT
+ 
+ 	movq	mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]	; OR together rows 1..7 of these 4 columns
+ 	movq	mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ 	por	mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ 	por	mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ 	por	mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ 	por	mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ 	por	mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ 	por	mm1,mm0				; mm1 = OR of rows 1..7
+ 	packsswb mm1,mm1			; 4 words -> 4 bytes; a nonzero word stays nonzero
+ 	movd	eax,mm1				; eax = those 4 bytes
+ 	test	eax,eax
+ 	jnz	short .columnDCT		; any row 1..7 term nonzero: do the full IDCT
+ 
+ 	; -- AC terms all zero
+ 
+ 	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ 
+ 	punpckhwd mm1,mm0			; mm1=(** 02 ** 03)
+ 	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+ 	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in0H=(02 03)
+ 	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+ 	cvtpi2ps  xmm3,mm1			; xmm3=(02 03 ** **)
+ 	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+ 	movlhps   xmm0,xmm3			; xmm0=in0=(00 01 02 03)
+ 
+ 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]	; scale by row 0 of the table at quantptr
+ 
+ 	movaps	xmm1,xmm0
+ 	movaps	xmm2,xmm0
+ 	movaps	xmm3,xmm0
+ 
+ 	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
+ 	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
+ 	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
+ 	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
+ 
+ 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0	; workspace row k = scaled in0[k] in all 8 slots
+ 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
+ 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
+ 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
+ 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
+ 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ 	jmp	near .nextcolumn
+ 	alignx	16,7
+ %endif
+ .columnDCT:
+ 
+ 	; -- Even part
+ 
+ 	movq      mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
+ 	movq      mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
+ 	movq      mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
+ 	movq      mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
+ 
+ 	punpckhwd mm4,mm0			; mm4=(** 02 ** 03)
+ 	punpcklwd mm0,mm0			; mm0=(00 00 01 01)
+ 	punpckhwd mm5,mm1			; mm5=(** 22 ** 23)
+ 	punpcklwd mm1,mm1			; mm1=(20 20 21 21)
+ 
+ 	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in0H=(02 03)
+ 	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in0L=(00 01)
+ 	cvtpi2ps  xmm4,mm4			; xmm4=(02 03 ** **)
+ 	cvtpi2ps  xmm0,mm0			; xmm0=(00 01 ** **)
+ 	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in2H=(22 23)
+ 	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in2L=(20 21)
+ 	cvtpi2ps  xmm5,mm5			; xmm5=(22 23 ** **)
+ 	cvtpi2ps  xmm1,mm1			; xmm1=(20 21 ** **)
+ 
+ 	punpckhwd mm6,mm2			; mm6=(** 42 ** 43)
+ 	punpcklwd mm2,mm2			; mm2=(40 40 41 41)
+ 	punpckhwd mm7,mm3			; mm7=(** 62 ** 63)
+ 	punpcklwd mm3,mm3			; mm3=(60 60 61 61)
+ 
+ 	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in4H=(42 43)
+ 	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in4L=(40 41)
+ 	cvtpi2ps  xmm6,mm6			; xmm6=(42 43 ** **)
+ 	cvtpi2ps  xmm2,mm2			; xmm2=(40 41 ** **)
+ 	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in6H=(62 63)
+ 	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in6L=(60 61)
+ 	cvtpi2ps  xmm7,mm7			; xmm7=(62 63 ** **)
+ 	cvtpi2ps  xmm3,mm3			; xmm3=(60 61 ** **)
+ 
+ 	movlhps   xmm0,xmm4			; xmm0=in0=(00 01 02 03)
+ 	movlhps   xmm1,xmm5			; xmm1=in2=(20 21 22 23)
+ 	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 
+ 	movlhps   xmm2,xmm6			; xmm2=in4=(40 41 42 43)
+ 	movlhps   xmm3,xmm7			; xmm3=in6=(60 61 62 63)
+ 	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 
+ 	movaps	xmm4,xmm0
+ 	movaps	xmm5,xmm1
+ 	subps	xmm0,xmm2		; xmm0=tmp11
+ 	subps	xmm1,xmm3
+ 	addps	xmm4,xmm2		; xmm4=tmp10
+ 	addps	xmm5,xmm3		; xmm5=tmp13
+ 
+ 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+ 	subps	xmm1,xmm5		; xmm1=tmp12
+ 
+ 	movaps	xmm6,xmm4
+ 	movaps	xmm7,xmm0
+ 	subps	xmm4,xmm5		; xmm4=tmp3
+ 	subps	xmm0,xmm1		; xmm0=tmp2
+ 	addps	xmm6,xmm5		; xmm6=tmp0
+ 	addps	xmm7,xmm1		; xmm7=tmp1
+ 
+ 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+ 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+ 
+ 	; -- Odd part
+ 
+ 	movq      mm4, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
+ 	movq      mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
+ 	movq      mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
+ 	movq      mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
+ 
+ 	punpckhwd mm6,mm4			; mm6=(** 12 ** 13)
+ 	punpcklwd mm4,mm4			; mm4=(10 10 11 11)
+ 	punpckhwd mm2,mm0			; mm2=(** 32 ** 33)
+ 	punpcklwd mm0,mm0			; mm0=(30 30 31 31)
+ 
+ 	psrad     mm6,(DWORD_BIT-WORD_BIT)	; mm6=in1H=(12 13)
+ 	psrad     mm4,(DWORD_BIT-WORD_BIT)	; mm4=in1L=(10 11)
+ 	cvtpi2ps  xmm4,mm6			; xmm4=(12 13 ** **)
+ 	cvtpi2ps  xmm2,mm4			; xmm2=(10 11 ** **)
+ 	psrad     mm2,(DWORD_BIT-WORD_BIT)	; mm2=in3H=(32 33)
+ 	psrad     mm0,(DWORD_BIT-WORD_BIT)	; mm0=in3L=(30 31)
+ 	cvtpi2ps  xmm0,mm2			; xmm0=(32 33 ** **)
+ 	cvtpi2ps  xmm3,mm0			; xmm3=(30 31 ** **)
+ 
+ 	punpckhwd mm7,mm5			; mm7=(** 52 ** 53)
+ 	punpcklwd mm5,mm5			; mm5=(50 50 51 51)
+ 	punpckhwd mm3,mm1			; mm3=(** 72 ** 73)
+ 	punpcklwd mm1,mm1			; mm1=(70 70 71 71)
+ 
+ 	movlhps   xmm2,xmm4			; xmm2=in1=(10 11 12 13)
+ 	movlhps   xmm3,xmm0			; xmm3=in3=(30 31 32 33)
+ 
+ 	psrad     mm7,(DWORD_BIT-WORD_BIT)	; mm7=in5H=(52 53)
+ 	psrad     mm5,(DWORD_BIT-WORD_BIT)	; mm5=in5L=(50 51)
+ 	cvtpi2ps  xmm4,mm7			; xmm4=(52 53 ** **)
+ 	cvtpi2ps  xmm5,mm5			; xmm5=(50 51 ** **)
+ 	psrad     mm3,(DWORD_BIT-WORD_BIT)	; mm3=in7H=(72 73)
+ 	psrad     mm1,(DWORD_BIT-WORD_BIT)	; mm1=in7L=(70 71)
+ 	cvtpi2ps  xmm0,mm3			; xmm0=(72 73 ** **)
+ 	cvtpi2ps  xmm1,mm1			; xmm1=(70 71 ** **)
+ 
+ 	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 
+ 	movlhps   xmm5,xmm4			; xmm5=in5=(50 51 52 53)
+ 	movlhps   xmm1,xmm0			; xmm1=in7=(70 71 72 73)
+ 	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
+ 
+ 	movaps	xmm4,xmm2
+ 	movaps	xmm0,xmm5
+ 	addps	xmm2,xmm1		; xmm2=z11
+ 	addps	xmm5,xmm3		; xmm5=z13
+ 	subps	xmm4,xmm1		; xmm4=z12
+ 	subps	xmm0,xmm3		; xmm0=z10
+ 
+ 	movaps	xmm1,xmm2
+ 	subps	xmm2,xmm5
+ 	addps	xmm1,xmm5		; xmm1=tmp7
+ 
+ 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+ 
+ 	movaps	xmm3,xmm0
+ 	addps	xmm0,xmm4
+ 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+ 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+ 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+ 	addps	xmm3,xmm0		; xmm3=tmp12
+ 	subps	xmm4,xmm0		; xmm4=tmp10
+ 
+ 	; -- Final output stage
+ 
+ 	subps	xmm3,xmm1		; xmm3=tmp6
+ 	movaps	xmm5,xmm6
+ 	movaps	xmm0,xmm7
+ 	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
+ 	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
+ 	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
+ 	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
+ 	subps	xmm2,xmm3		; xmm2=tmp5
+ 
+ 	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
+ 	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
+ 	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
+ 	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
+ 	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
+ 	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
+ 
+ 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
+ 	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
+ 
+ 	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
+ 	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
+ 
+ 	addps	xmm4,xmm2		; xmm4=tmp4
+ 	movaps	xmm0,xmm7
+ 	movaps	xmm3,xmm5
+ 	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
+ 	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
+ 	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
+ 	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
+ 
+ 	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
+ 	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
+ 	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
+ 	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
+ 	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
+ 	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
+ 
+ 	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
+ 	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
+ 	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
+ 	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
+ 
+ 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
+ 	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
+ 
+ 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
+ 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
+ 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
+ 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
+ 
+ 	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
+ 	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
+ 	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
+ 	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
+ 	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
+ 
+ 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
+ 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
+ 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
+ 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
+ 
+ .nextcolumn:
+ 	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
+ 	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
+ 	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
+ 	dec	ecx					; ctr
+ 	jnz	near .columnloop
+ 
+ 	; -- Prefetch the next coefficient block
+ 
+ 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]	; esi is now 8 coefficients in (for DCTSIZE=8), so this is the end of the block
+ 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
+ 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
+ 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
+ 
+ 	; ---- Pass 2: process rows from work array, store into output array.
+ 
+ 	mov	eax, [original_ebp]			; reload: eax was used as scratch in pass 1
+ 	lea	esi, [workspace]			; FAST_FLOAT * wsptr
+ 	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
+ 	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
+ 	mov	ecx, DCTSIZE/4				; ctr
+ 	alignx	16,7
+ .rowloop:
+ 
+ 	; -- Even part
+ 
+ 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
+ 
+ 	movaps	xmm4,xmm0
+ 	movaps	xmm5,xmm1
+ 	subps	xmm0,xmm2		; xmm0=tmp11
+ 	subps	xmm1,xmm3
+ 	addps	xmm4,xmm2		; xmm4=tmp10
+ 	addps	xmm5,xmm3		; xmm5=tmp13
+ 
+ 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
+ 	subps	xmm1,xmm5		; xmm1=tmp12
+ 
+ 	movaps	xmm6,xmm4
+ 	movaps	xmm7,xmm0
+ 	subps	xmm4,xmm5		; xmm4=tmp3
+ 	subps	xmm0,xmm1		; xmm0=tmp2
+ 	addps	xmm6,xmm5		; xmm6=tmp0
+ 	addps	xmm7,xmm1		; xmm7=tmp1
+ 
+ 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
+ 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
+ 
+ 	; -- Odd part
+ 
+ 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
+ 	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
+ 
+ 	movaps	xmm4,xmm2
+ 	movaps	xmm0,xmm5
+ 	addps	xmm2,xmm1		; xmm2=z11
+ 	addps	xmm5,xmm3		; xmm5=z13
+ 	subps	xmm4,xmm1		; xmm4=z12
+ 	subps	xmm0,xmm3		; xmm0=z10
+ 
+ 	movaps	xmm1,xmm2
+ 	subps	xmm2,xmm5
+ 	addps	xmm1,xmm5		; xmm1=tmp7
+ 
+ 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
+ 
+ 	movaps	xmm3,xmm0
+ 	addps	xmm0,xmm4
+ 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
+ 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
+ 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
+ 	addps	xmm3,xmm0		; xmm3=tmp12
+ 	subps	xmm4,xmm0		; xmm4=tmp10
+ 
+ 	; -- Final output stage
+ 
+ 	subps	xmm3,xmm1		; xmm3=tmp6
+ 	movaps	xmm5,xmm6
+ 	movaps	xmm0,xmm7
+ 	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
+ 	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
+ 	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
+ 	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
+ 	subps	xmm2,xmm3		; xmm2=tmp5
+ 
+ 	movaps	xmm1,[GOTOFF(ebx,PD_0_125)]	; xmm1=[PD_0_125]
+ 
+ 	mulps	xmm6,xmm1		; descale(1/8)
+ 	mulps	xmm7,xmm1		; descale(1/8)
+ 	mulps	xmm5,xmm1		; descale(1/8)
+ 	mulps	xmm0,xmm1		; descale(1/8)
+ 
+ 	movhlps   xmm3,xmm6
+ 	movhlps   xmm1,xmm7
+ 	cvtps2pi  mm0,xmm6		; round to int32, mm0=data0L=(00 10)
+ 	cvtps2pi  mm1,xmm7		; round to int32, mm1=data1L=(01 11)
+ 	cvtps2pi  mm2,xmm3		; round to int32, mm2=data0H=(20 30)
+ 	cvtps2pi  mm3,xmm1		; round to int32, mm3=data1H=(21 31)
+ 	packssdw  mm0,mm2		; mm0=data0=(00 10 20 30)
+ 	packssdw  mm1,mm3		; mm1=data1=(01 11 21 31)
+ 
+ 	movhlps   xmm6,xmm5
+ 	movhlps   xmm7,xmm0
+ 	cvtps2pi  mm4,xmm5		; round to int32, mm4=data7L=(07 17)
+ 	cvtps2pi  mm5,xmm0		; round to int32, mm5=data6L=(06 16)
+ 	cvtps2pi  mm6,xmm6		; round to int32, mm6=data7H=(27 37)
+ 	cvtps2pi  mm7,xmm7		; round to int32, mm7=data6H=(26 36)
+ 	packssdw  mm4,mm6		; mm4=data7=(07 17 27 37)
+ 	packssdw  mm5,mm7		; mm5=data6=(06 16 26 36)
+ 
+ 	packsswb  mm0,mm5		; mm0=(00 10 20 30 06 16 26 36)
+ 	packsswb  mm1,mm4		; mm1=(01 11 21 31 07 17 27 37)
+ 
+ 	movaps	xmm3, XMMWORD [wk(0)]	; xmm3=tmp2
+ 	movaps	xmm1, XMMWORD [wk(1)]	; xmm1=tmp3
+ 
+ 	movaps	xmm6,[GOTOFF(ebx,PD_0_125)]	; xmm6=[PD_0_125]
+ 
+ 	addps	xmm4,xmm2		; xmm4=tmp4
+ 	movaps	xmm5,xmm3
+ 	movaps	xmm0,xmm1
+ 	addps	xmm3,xmm2		; xmm3=data2=(02 12 22 32)
+ 	addps	xmm1,xmm4		; xmm1=data4=(04 14 24 34)
+ 	subps	xmm5,xmm2		; xmm5=data5=(05 15 25 35)
+ 	subps	xmm0,xmm4		; xmm0=data3=(03 13 23 33)
+ 
+ 	mulps	xmm3,xmm6		; descale(1/8)
+ 	mulps	xmm1,xmm6		; descale(1/8)
+ 	mulps	xmm5,xmm6		; descale(1/8)
+ 	mulps	xmm0,xmm6		; descale(1/8)
+ 
+ 	movhlps   xmm7,xmm3
+ 	movhlps   xmm2,xmm1
+ 	cvtps2pi  mm2,xmm3		; round to int32, mm2=data2L=(02 12)
+ 	cvtps2pi  mm3,xmm1		; round to int32, mm3=data4L=(04 14)
+ 	cvtps2pi  mm6,xmm7		; round to int32, mm6=data2H=(22 32)
+ 	cvtps2pi  mm7,xmm2		; round to int32, mm7=data4H=(24 34)
+ 	packssdw  mm2,mm6		; mm2=data2=(02 12 22 32)
+ 	packssdw  mm3,mm7		; mm3=data4=(04 14 24 34)
+ 
+ 	movhlps   xmm4,xmm5
+ 	movhlps   xmm6,xmm0
+ 	cvtps2pi  mm5,xmm5		; round to int32, mm5=data5L=(05 15)
+ 	cvtps2pi  mm4,xmm0		; round to int32, mm4=data3L=(03 13)
+ 	cvtps2pi  mm6,xmm4		; round to int32, mm6=data5H=(25 35)
+ 	cvtps2pi  mm7,xmm6		; round to int32, mm7=data3H=(23 33)
+ 	packssdw  mm5,mm6		; mm5=data5=(05 15 25 35)
+ 	packssdw  mm4,mm7		; mm4=data3=(03 13 23 33)
+ 
+ 	movq      mm6,[GOTOFF(ebx,PB_CENTERJSAMP)]	; mm6=[PB_CENTERJSAMP]
+ 
+ 	packsswb  mm2,mm3		; mm2=(02 12 22 32 04 14 24 34)
+ 	packsswb  mm4,mm5		; mm4=(03 13 23 33 05 15 25 35)
+ 
+ 	paddb     mm0,mm6
+ 	paddb     mm1,mm6
+ 	paddb     mm2,mm6
+ 	paddb     mm4,mm6
+ 
+ 	movq      mm7,mm0		; transpose coefficients(phase 1)
+ 	punpcklbw mm0,mm1		; mm0=(00 01 10 11 20 21 30 31)
+ 	punpckhbw mm7,mm1		; mm7=(06 07 16 17 26 27 36 37)
+ 	movq      mm3,mm2		; transpose coefficients(phase 1)
+ 	punpcklbw mm2,mm4		; mm2=(02 03 12 13 22 23 32 33)
+ 	punpckhbw mm3,mm4		; mm3=(04 05 14 15 24 25 34 35)
+ 
+ 	movq      mm5,mm0		; transpose coefficients(phase 2)
+ 	punpcklwd mm0,mm2		; mm0=(00 01 02 03 10 11 12 13)
+ 	punpckhwd mm5,mm2		; mm5=(20 21 22 23 30 31 32 33)
+ 	movq      mm6,mm3		; transpose coefficients(phase 2)
+ 	punpcklwd mm3,mm7		; mm3=(04 05 06 07 14 15 16 17)
+ 	punpckhwd mm6,mm7		; mm6=(24 25 26 27 34 35 36 37)
+ 
+ 	movq      mm1,mm0		; transpose coefficients(phase 3)
+ 	punpckldq mm0,mm3		; mm0=(00 01 02 03 04 05 06 07)
+ 	punpckhdq mm1,mm3		; mm1=(10 11 12 13 14 15 16 17)
+ 	movq      mm4,mm5		; transpose coefficients(phase 3)
+ 	punpckldq mm5,mm6		; mm5=(20 21 22 23 24 25 26 27)
+ 	punpckhdq mm4,mm6		; mm4=(30 31 32 33 34 35 36 37)
+ 
+ 	pushpic	ebx			; save GOT address
+ 
+ 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; row pointers 0 and 1 of this group of 4
+ 	mov	ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
+ 	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm0	; store 8 samples at column offset eax
+ 	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
+ 	mov	edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]	; row pointers 2 and 3 of this group of 4
+ 	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
+ 	movq	MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
+ 	movq	MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm4
+ 
+ 	poppic	ebx			; restore GOT address
+ 
+ 	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
+ 	add	edi, byte 4*SIZEOF_JSAMPROW	; next 4 output row pointers
+ 	dec	ecx				; ctr
+ 	jnz	near .rowloop
+ 
+ 	emms		; empty MMX state
+ 
+ 	pop	edi
+ 	pop	esi
+ ;	pop	edx		; need not be preserved
+ ;	pop	ecx		; need not be preserved
+ 	pop	ebx
+ 	mov	esp,ebp		; esp <- aligned ebp
+ 	pop	esp		; esp <- original ebp
+ 	pop	ebp		; restore the caller's ebp
+ 	ret
+ 
 -%endif ; JIDCT_FLT_SSE_MMX_SUPPORTED
 -%endif ; DCT_FLOAT_SUPPORTED
diff --cc simd/jsimd.h
index 8e78eab,0000000..98bcebc
mode 100644,000000..100644
--- a/simd/jsimd.h
+++ b/simd/jsimd.h
@@@ -1,138 -1,0 +1,162 @@@
 +/*
 + * simd/jsimd.h
 + *
 + * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 + * 
 + * Based on the x86 SIMD extension for IJG JPEG library,
 + * Copyright (C) 1999-2006, MIYASAKA Masaru.
 + *
 + */
 +
 +/* Bitmask for supported acceleration methods */
 +
 +#define JSIMD_NONE    0x00
 +#define JSIMD_MMX     0x01
 +#define JSIMD_3DNOW   0x02
++#define JSIMD_SSE     0x04
 +
 +/* Short forms of external names for systems with brain-damaged linkers. */
 +
 +#ifdef NEED_SHORT_EXTERNAL_NAMES
 +#define jpeg_simd_cpu_support                 jSiCpuSupport
 +#define jsimd_rgb_ycc_convert_mmx             jSRGBYCCM
 +#define jsimd_ycc_rgb_convert_mmx             jSYCCRGBM
 +#define jsimd_h2v2_downsample_mmx             jSDnH2V2M
 +#define jsimd_h2v1_downsample_mmx             jSDnH2V1M
 +#define jsimd_h2v2_upsample_mmx               jSUpH2V2M
 +#define jsimd_h2v1_upsample_mmx               jSUpH2V1M
 +#define jsimd_h2v2_fancy_upsample_mmx         jSFUpH2V2M
 +#define jsimd_h2v1_fancy_upsample_mmx         jSFUpH2V1M
 +#define jsimd_h2v2_merged_upsample_mmx        jSMUpH2V2M
 +#define jsimd_h2v1_merged_upsample_mmx        jSMUpH2V1M
 +#define jsimd_convsamp_mmx                    jSConvM
 +#define jsimd_convsamp_float_3dnow            jSConvF3D
++#define jsimd_convsamp_float_sse              jSConvFS
 +#define jsimd_fdct_islow_mmx                  jSFDMIS
 +#define jsimd_fdct_ifast_mmx                  jSFDMIF
 +#define jsimd_fdct_float_3dnow                jSFD3DF
++#define jconst_fdct_float_sse                 jSCFDSF
++#define jsimd_fdct_float_sse                  jSFDSF
 +#define jsimd_quantize_mmx                    jSQuantM
 +#define jsimd_quantize_float_3dnow            jSQuantF3D
++#define jsimd_quantize_float_sse              jSQuantFS
 +#define jsimd_idct_2x2_mmx                    jSIDM22
 +#define jsimd_idct_4x4_mmx                    jSIDM44
 +#define jsimd_idct_islow_mmx                  jSIDMIS
 +#define jsimd_idct_ifast_mmx                  jSIDMIF
 +#define jsimd_idct_float_3dnow                jSID3DF
++#define jconst_idct_float_sse                 jSCIDSF
++#define jsimd_idct_float_sse                  jSIDSF
 +#endif /* NEED_SHORT_EXTERNAL_NAMES */
 +
 +/* SIMD Ext: retrieve SIMD/CPU information */
 +EXTERN(unsigned int) jpeg_simd_cpu_support JPP((void));
 +
 +/* SIMD Color Space Conversion */
 +EXTERN(void) jsimd_rgb_ycc_convert_mmx
 +        JPP((JDIMENSION img_width,
 +             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
 +             JDIMENSION output_row, int num_rows));
 +EXTERN(void) jsimd_ycc_rgb_convert_mmx
 +        JPP((JDIMENSION out_width,
 +             JSAMPIMAGE input_buf, JDIMENSION input_row,
 +             JSAMPARRAY output_buf, int num_rows));
 +
 +/* SIMD Downsample */
 +EXTERN(void) jsimd_h2v2_downsample_mmx
 +        JPP((JDIMENSION image_width, int max_v_samp_factor,
 +             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
 +             JSAMPARRAY input_data, JSAMPARRAY output_data));
 +EXTERN(void) jsimd_h2v1_downsample_mmx
 +        JPP((JDIMENSION image_width, int max_v_samp_factor,
 +             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
 +             JSAMPARRAY input_data, JSAMPARRAY output_data));
 +
 +/* SIMD Upsample */
 +EXTERN(void) jsimd_h2v2_upsample_mmx
 +        JPP((int max_v_samp_factor, JDIMENSION output_width,
 +             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 +EXTERN(void) jsimd_h2v1_upsample_mmx
 +        JPP((int max_v_samp_factor, JDIMENSION output_width,
 +             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 +
 +EXTERN(void) jsimd_h2v2_fancy_upsample_mmx
 +        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
 +             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 +EXTERN(void) jsimd_h2v1_fancy_upsample_mmx
 +        JPP((int max_v_samp_factor, JDIMENSION downsampled_width,
 +             JSAMPARRAY input_data, JSAMPARRAY * output_data_ptr));
 +
 +EXTERN(void) jsimd_h2v2_merged_upsample_mmx
 +        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
 +             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 +EXTERN(void) jsimd_h2v1_merged_upsample_mmx
 +        JPP((JDIMENSION output_width, JSAMPIMAGE input_buf,
 +             JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf));
 +
 +/* SIMD Sample Conversion */
 +EXTERN(void) jsimd_convsamp_mmx JPP((JSAMPARRAY sample_data,
 +                                     JDIMENSION start_col,
 +                                     DCTELEM * workspace));
 +
 +EXTERN(void) jsimd_convsamp_float_3dnow JPP((JSAMPARRAY sample_data,
 +                                             JDIMENSION start_col,
 +                                             FAST_FLOAT * workspace));
 +
++EXTERN(void) jsimd_convsamp_float_sse JPP((JSAMPARRAY sample_data,
++                                           JDIMENSION start_col,
++                                           FAST_FLOAT * workspace));
++
 +/* SIMD Forward DCT */
 +EXTERN(void) jsimd_fdct_islow_mmx JPP((DCTELEM * data));
 +EXTERN(void) jsimd_fdct_ifast_mmx JPP((DCTELEM * data));
 +
 +EXTERN(void) jsimd_fdct_float_3dnow JPP((FAST_FLOAT * data));
 +
++extern const int jconst_fdct_float_sse[];
++EXTERN(void) jsimd_fdct_float_sse JPP((FAST_FLOAT * data));
++
 +/* SIMD Quantization */
 +EXTERN(void) jsimd_quantize_mmx JPP((JCOEFPTR coef_block,
 +                                     DCTELEM * divisors,
 +                                     DCTELEM * workspace));
 +
 +EXTERN(void) jsimd_quantize_float_3dnow JPP((JCOEFPTR coef_block,
 +                                             FAST_FLOAT * divisors,
 +                                             FAST_FLOAT * workspace));
 +
++EXTERN(void) jsimd_quantize_float_sse JPP((JCOEFPTR coef_block,
++                                           FAST_FLOAT * divisors,
++                                           FAST_FLOAT * workspace));
++
 +/* SIMD Reduced Inverse DCT */
 +EXTERN(void) jsimd_idct_2x2_mmx JPP((void * dct_table,
 +                                     JCOEFPTR coef_block,
 +                                     JSAMPARRAY output_buf,
 +                                     JDIMENSION output_col));
 +EXTERN(void) jsimd_idct_4x4_mmx JPP((void * dct_table,
 +                                     JCOEFPTR coef_block,
 +                                     JSAMPARRAY output_buf,
 +                                     JDIMENSION output_col));
 +
 +/* SIMD Inverse DCT */
 +EXTERN(void) jsimd_idct_islow_mmx JPP((void * dct_table,
 +                                       JCOEFPTR coef_block,
 +                                       JSAMPARRAY output_buf,
 +                                       JDIMENSION output_col));
 +EXTERN(void) jsimd_idct_ifast_mmx JPP((void * dct_table,
 +                                       JCOEFPTR coef_block,
 +                                       JSAMPARRAY output_buf,
 +                                       JDIMENSION output_col));
 +
 +EXTERN(void) jsimd_idct_float_3dnow JPP((void * dct_table,
 +                                         JCOEFPTR coef_block,
 +                                         JSAMPARRAY output_buf,
 +                                         JDIMENSION output_col));
 +
++extern const int jconst_idct_float_sse[];
++EXTERN(void) jsimd_idct_float_sse JPP((void * dct_table,
++                                       JCOEFPTR coef_block,
++                                       JSAMPARRAY output_buf,
++                                       JDIMENSION output_col));
++
diff --cc simd/jsimdcfg.inc.h
index e1f2bdd,0000000..2779565
mode 100644,000000..100644
--- a/simd/jsimdcfg.inc.h
+++ b/simd/jsimdcfg.inc.h
@@@ -1,129 -1,0 +1,136 @@@
 +// This file generates the include file for the assembly
 +// implementations by abusing the C preprocessor.
 +//
 +// Note: Some things are manually defined as they need to
 +// be mapped to NASM types.
 +
 +;
 +; Automatically generated include file from jsimdcfg.inc.h
 +;
 +
 +#define JPEG_INTERNALS
 +
 +#include "../jpeglib.h"
 +#include "../jconfig.h"
 +#include "../jmorecfg.h"
 +#include "jsimd.h"
 +
 +#define define(var) %define _cpp_protection_##var
 +#define definev(var) %define _cpp_protection_##var var
 +
 +;
 +; -- jpeglib.h
 +;
 +
 +definev(DCTSIZE)
 +definev(DCTSIZE2)
 +
 +;
 +; -- jmorecfg.h
 +;
 +
 +definev(RGB_RED)
 +definev(RGB_GREEN)
 +definev(RGB_BLUE)
 +
 +definev(RGB_PIXELSIZE)
 +
 +; Representation of a single sample (pixel element value).
 +; On this SIMD implementation, this must be 'unsigned char'.
 +;
 +
 +%define JSAMPLE                 byte          ; unsigned char
 +%define SIZEOF_JSAMPLE          SIZEOF_BYTE   ; sizeof(JSAMPLE)
 +
 +definev(CENTERJSAMPLE)
 +
 +; Representation of a DCT frequency coefficient.
 +; On this SIMD implementation, this must be 'short'.
 +;
 +%define JCOEF                   word          ; short
 +%define SIZEOF_JCOEF            SIZEOF_WORD   ; sizeof(JCOEF)
 +
 +; Datatype used for image dimensions.
 +; On this SIMD implementation, this must be 'unsigned int'.
 +;
 +%define JDIMENSION              dword         ; unsigned int
 +%define SIZEOF_JDIMENSION       SIZEOF_DWORD  ; sizeof(JDIMENSION)
 +
 +%define JSAMPROW                POINTER       ; JSAMPLE FAR * (jpeglib.h)
 +%define JSAMPARRAY              POINTER       ; JSAMPROW *    (jpeglib.h)
 +%define JSAMPIMAGE              POINTER       ; JSAMPARRAY *  (jpeglib.h)
 +%define JCOEFPTR                POINTER       ; JCOEF FAR *   (jpeglib.h)
 +%define SIZEOF_JSAMPROW         SIZEOF_POINTER  ; sizeof(JSAMPROW)
 +%define SIZEOF_JSAMPARRAY       SIZEOF_POINTER  ; sizeof(JSAMPARRAY)
 +%define SIZEOF_JSAMPIMAGE       SIZEOF_POINTER  ; sizeof(JSAMPIMAGE)
 +%define SIZEOF_JCOEFPTR         SIZEOF_POINTER  ; sizeof(JCOEFPTR)
 +
 +;
 +; -- jdct.h
 +;
 +
 +; A forward DCT routine is given a pointer to a work area of type DCTELEM[];
 +; the DCT is to be performed in-place in that buffer.
 +; To maximize parallelism, Type DCTELEM is changed to short (originally, int).
 +;
 +%define DCTELEM                 word          ; short
 +%define SIZEOF_DCTELEM          SIZEOF_WORD   ; sizeof(DCTELEM)
 +
 +%define FAST_FLOAT              FP32            ; float
 +%define SIZEOF_FAST_FLOAT       SIZEOF_FP32     ; sizeof(FAST_FLOAT)
 +
 +; To maximize parallelism, Type MULTIPLIER is changed to short.
 +;
 +%define ISLOW_MULT_TYPE         word          ; must be short
 +%define SIZEOF_ISLOW_MULT_TYPE  SIZEOF_WORD   ; sizeof(ISLOW_MULT_TYPE)
 +
 +%define IFAST_MULT_TYPE         word          ; must be short
 +%define SIZEOF_IFAST_MULT_TYPE  SIZEOF_WORD   ; sizeof(IFAST_MULT_TYPE)
 +%define IFAST_SCALE_BITS        2             ; fractional bits in scale factors
 +
 +%define FLOAT_MULT_TYPE         FP32          ; must be float
 +%define SIZEOF_FLOAT_MULT_TYPE  SIZEOF_FP32   ; sizeof(FLOAT_MULT_TYPE)
 +
 +;
 +; -- jsimd.h
 +;
 +
 +definev(JSIMD_NONE)
 +definev(JSIMD_MMX)
 +definev(JSIMD_3DNOW)
++definev(JSIMD_SSE)
 +
 +; Short forms of external names for systems with brain-damaged linkers.
 +;
 +#ifdef NEED_SHORT_EXTERNAL_NAMES
 +definev(jpeg_simd_cpu_support)
 +definev(jsimd_rgb_ycc_convert_mmx)
 +definev(jsimd_ycc_rgb_convert_mmx)
 +definev(jsimd_h2v2_downsample_mmx)
 +definev(jsimd_h2v1_downsample_mmx)
 +definev(jsimd_h2v2_upsample_mmx)
 +definev(jsimd_h2v1_upsample_mmx)
 +definev(jsimd_h2v1_fancy_upsample_mmx)
 +definev(jsimd_h2v2_fancy_upsample_mmx)
 +definev(jsimd_h2v1_merged_upsample_mmx)
 +definev(jsimd_h2v2_merged_upsample_mmx)
 +definev(jsimd_convsamp_mmx)
 +definev(jsimd_convsamp_float_3dnow)
++definev(jsimd_convsamp_float_sse)
 +definev(jsimd_fdct_islow_mmx)
 +definev(jsimd_fdct_ifast_mmx)
 +definev(jsimd_fdct_float_3dnow)
++definev(jconst_fdct_float_sse)
++definev(jsimd_fdct_float_sse)
 +definev(jsimd_quantize_mmx)
 +definev(jsimd_quantize_float_3dnow)
++definev(jsimd_quantize_float_sse)
 +definev(jsimd_idct_2x2_mmx)
 +definev(jsimd_idct_4x4_mmx)
 +definev(jsimd_idct_islow_mmx)
 +definev(jsimd_idct_ifast_mmx)
 +definev(jsimd_idct_float_3dnow)
++definev(jconst_idct_float_sse)
++definev(jsimd_idct_float_sse)
 +#endif /* NEED_SHORT_EXTERNAL_NAMES */
 +
diff --cc simd/jsimdcpu.asm
index b65699e,1c851d1..3561b08
--- a/simd/jsimdcpu.asm
+++ b/simd/jsimdcpu.asm
@@@ -66,6 -76,14 +66,10 @@@ EXTN(jpeg_simd_cpu_support)
  	jz	short .no_mmx
  	or	edi, byte JSIMD_MMX
  .no_mmx:
+ 	test	eax, 1<<25		; bit25:SSE
+ 	jz	short .no_sse
+ 	or	edi, byte JSIMD_SSE
+ .no_sse:
 -	test	eax, 1<<26		; bit26:SSE2
 -	jz	short .no_sse2
 -	or	edi, byte JSIMD_SSE2
 -.no_sse2:
  
  	; Check for 3DNow! instruction support
  	mov	eax, 0x80000000
diff --cc simd/jsimdext.inc
index 509e95d,a502c07..5fcd7be
--- a/simd/jsimdext.inc
+++ b/simd/jsimdext.inc
@@@ -100,34 -103,119 +100,42 @@@
  
  ; ==========================================================================
  
 -; ---- jpeglib.h -----------------------------------------------------------
 -
 -%define DCTSIZE		8	; The basic DCT block is 8x8 samples
 -%define DCTSIZE2	64	; DCTSIZE squared; # of elements in a block
 -
 -%define JSIMD_NONE	0x00	; bitflags for jpeg_simd_*_support()
 -%define JSIMD_MMX	0x01
 -%define JSIMD_3DNOW	0x02
 -%define JSIMD_SSE	0x04
 -%define JSIMD_SSE2	0x08
 -%define JSIMD_ALL	(JSIMD_MMX | JSIMD_3DNOW | JSIMD_SSE | JSIMD_SSE2)
 -
 -; ---- jpegint.h -----------------------------------------------------------
 -
 -; Short forms of external names for systems with brain-damaged linkers.
 -;
 -%ifdef NEED_SHORT_EXTERNAL_NAMES
 -%define jpeg_simd_cpu_support	jSiCpuSupport
 -%define jpeg_simd_os_support	jSiOsSupport
 -%endif ; NEED_SHORT_EXTERNAL_NAMES
 -
 -; ---- jmorecfg.h ----------------------------------------------------------
 -;
 -; BITS_IN_JSAMPLE==8 (8-bit sample values) is the only valid setting
 -; on this SIMD implementation.
 -;
 -%define BITS_IN_JSAMPLE	8	; Caution: Cannot be changed
 -
 -; Representation of a single sample (pixel element value).
 -; On this SIMD implementation, this must be 'unsigned char'.
 -;
 -%define JSAMPLE		byte		; unsigned char
 -%define SIZEOF_JSAMPLE	SIZEOF_BYTE	; sizeof(JSAMPLE)
 -%define MAXJSAMPLE	255
 -%define CENTERJSAMPLE	128
 -
 -; Representation of a DCT frequency coefficient.
 -; On this SIMD implementation, this must be 'short'.
 -;
 -%define JCOEF		word		; short
 -%define SIZEOF_JCOEF	SIZEOF_WORD	; sizeof(JCOEF)
 -
 -; INT32 must hold at least signed 32-bit values.
 -; On this SIMD implementation, this must be 'long'.
 -;
 -%define INT32		dword		; long
 -%define SIZEOF_INT32	SIZEOF_DWORD	; sizeof(INT32)
 -
 -; Datatype used for image dimensions.
 -; On this SIMD implementation, this must be 'unsigned int'.
 -;
 -%define JDIMENSION		dword		; unsigned int
 -%define SIZEOF_JDIMENSION	SIZEOF_DWORD	; sizeof(JDIMENSION)
 -
  ; --------------------------------------------------------------------------
 -
 -%define JSAMPROW		POINTER		; JSAMPLE FAR * (jpeglib.h)
 -%define JSAMPARRAY		POINTER		; JSAMPROW *    (jpeglib.h)
 -%define JSAMPIMAGE		POINTER		; JSAMPARRAY *  (jpeglib.h)
 -%define JCOEFPTR		POINTER		; JCOEF FAR *   (jpeglib.h)
 -%define SIZEOF_JSAMPROW		SIZEOF_POINTER	; sizeof(JSAMPROW)
 -%define SIZEOF_JSAMPARRAY	SIZEOF_POINTER	; sizeof(JSAMPARRAY)
 -%define SIZEOF_JSAMPIMAGE	SIZEOF_POINTER	; sizeof(JSAMPIMAGE)
 -%define SIZEOF_JCOEFPTR		SIZEOF_POINTER	; sizeof(JCOEFPTR)
 -
 -%define POINTER			dword		; general pointer type
 -%define SIZEOF_POINTER		SIZEOF_DWORD	; sizeof(POINTER)
 -%define POINTER_BIT		DWORD_BIT	; sizeof(POINTER)*BYTE_BIT
 -
 -%define INT			dword		; signed integer type
 -%define SIZEOF_INT		SIZEOF_DWORD	; sizeof(INT)
 -%define INT_BIT			DWORD_BIT	; sizeof(INT)*BYTE_BIT
 -
 -%define FP32			dword		; IEEE754 single
 -%define SIZEOF_FP32		SIZEOF_DWORD	; sizeof(FP32)
 -%define FP32_BIT		DWORD_BIT	; sizeof(FP32)*BYTE_BIT
 -
 -%define FP64			qword		; IEEE754 double
 -%define SIZEOF_FP64		SIZEOF_QWORD	; sizeof(FP64)
 -%define FP64_BIT		QWORD_BIT	; sizeof(FP64)*BYTE_BIT
 -
 -%define FP80			tword		; IEEE754 double-extended(x86)
 -%define SIZEOF_FP80		SIZEOF_TWORD	; sizeof(FP80)
 -%define FP80_BIT		TWORD_BIT	; sizeof(FP80)*BYTE_BIT
 -
 -%define MMWORD			qword		; int64  (MMX register)
 -%define SIZEOF_MMWORD		SIZEOF_QWORD	; sizeof(MMWORD)
 -%define MMWORD_BIT		QWORD_BIT	; sizeof(MMWORD)*BYTE_BIT
 -
 -%define XMMWORD			dqword		; int128 (SSE register)
 -%define SIZEOF_XMMWORD		SIZEOF_DQWORD	; sizeof(XMMWORD)
 -%define XMMWORD_BIT		DQWORD_BIT	; sizeof(XMMWORD)*BYTE_BIT
 -
 -%define SIZEOF_BYTE		1		; sizeof(BYTE)
 -%define SIZEOF_WORD		2		; sizeof(WORD)
 -%define SIZEOF_DWORD		4		; sizeof(DWORD)
 -%define SIZEOF_QWORD		8		; sizeof(QWORD)
 -%define SIZEOF_TBYTE		10		; sizeof(TBYTE)
 -%define SIZEOF_TWORD		10		; sizeof(TWORD)
 -%define SIZEOF_DQWORD		16		; sizeof(DQWORD)
 -
 -%define BYTE_BIT		8		; CHAR_BIT in C
 -%define WORD_BIT		16		; sizeof(WORD)*BYTE_BIT
 -%define DWORD_BIT		32		; sizeof(DWORD)*BYTE_BIT
 -%define QWORD_BIT		64		; sizeof(QWORD)*BYTE_BIT
 -%define TBYTE_BIT		80		; sizeof(TBYTE)*BYTE_BIT
 -%define TWORD_BIT		80		; sizeof(TWORD)*BYTE_BIT
 -%define DQWORD_BIT		128		; sizeof(DQWORD)*BYTE_BIT
 -
 -%idefine TBYTE	TWORD	; NASM uses the keyword 'TWORD' instead of 'TBYTE'
 -%idefine DQWORD		; currently not supported by NASM
 -%idefine _MMWORD	;
 -%idefine _DWORD		;
 +;  Common types
 +;
 +%define POINTER                 dword           ; general pointer type
 +%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
 +%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
 +
 +%define INT                     dword           ; signed integer type
 +%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
 +%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT
 +
 +%define FP32                    dword           ; IEEE754 single
 +%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
 +%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
 +
 +%define MMWORD                  qword           ; int64  (MMX register)
 +%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
 +%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
 +
++; NASM is buggy and doesn't properly handle operand sizes for SSE
++; instructions, so for now we have to define XMMWORD as blank.
++%define XMMWORD                                 ; int128 (SSE register)
++%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
++%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
++
 +%define SIZEOF_BYTE             1               ; sizeof(BYTE)
 +%define SIZEOF_WORD             2               ; sizeof(WORD)
 +%define SIZEOF_DWORD            4               ; sizeof(DWORD)
 +%define SIZEOF_QWORD            8               ; sizeof(QWORD)
++%define SIZEOF_OWORD            16              ; sizeof(OWORD)
 +
 +%define BYTE_BIT                8               ; CHAR_BIT in C
 +%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
 +%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
 +%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
++%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT
  
  ; --------------------------------------------------------------------------
  ;  External Symbol Name