From: DRC
Date: Wed, 30 Jan 2019 20:12:06 +0000 (-0600)
Subject: MMI: Use aligned store instructions when possible
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=db84125fcb05bbc48430b53d45b89392a8f3609e;p=libjpeg-turbo

MMI: Use aligned store instructions when possible

This improves decompression performance by 2-5%.
---

diff --git a/simd/loongson/jdcolext-mmi.c b/simd/loongson/jdcolext-mmi.c
index 560d9b0..7f44649 100644
--- a/simd/loongson/jdcolext-mmi.c
+++ b/simd/loongson/jdcolext-mmi.c
@@ -2,8 +2,8 @@
  * Loongson MMI optimizations for libjpeg-turbo
  *
  * Copyright 2009 Pierre Ossman for Cendio AB
- * Copyright (C) 2015, D. R. Commander. All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2015, 2019, D. R. Commander. All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  * All Rights Reserved.
  * Authors: ZhuChen
  *          SunZhangzhi
@@ -251,9 +251,15 @@ void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
       mmC = _mm_unpacklo_pi32(mmC, mmF); /* mmC=(15 25 06 16 26 07 17 27) */
 
       if (num_cols >= 8) {
-        _mm_store_si64((__m64 *)outptr, mmA);
-        _mm_store_si64((__m64 *)(outptr + 8), mmE);
-        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmE);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        }
         outptr += RGB_PIXELSIZE * 8;
       } else {
         col = num_cols * 3;
diff --git a/simd/loongson/loongson-mmintrin.h b/simd/loongson/loongson-mmintrin.h
index 50d166b..db9b35a 100644
--- a/simd/loongson/loongson-mmintrin.h
+++ b/simd/loongson/loongson-mmintrin.h
@@ -1217,14 +1217,24 @@ _mm_store_pi32(__m32 *dest, __m64 src)
 extern __inline void FUNCTION_ATTRIBS
 _mm_store_si64(__m64 *dest, __m64 src)
 {
-  asm("gssdlc1 %1, 7+%0\n\t"
-      "gssdrc1 %1, %0\n\t"
+  asm("sdc1 %1, %0 \n\t"
       : "=m" (*dest)
       : "f" (src)
       : "memory"
      );
 }
 
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"
+      "gssdrc1 %1, 0(%0) \n\t"
+      :
+      : "r" (dest), "f" (src)
+      : "memory"
+     );
+}
+
 extern __inline __m64 FUNCTION_ATTRIBS
 _mm_load_si32(const __m32 *src)
 {