From: Henrik Gramner <henrik@gramner.com>
Date: Sat, 31 Mar 2018 11:49:56 +0000 (+0200)
Subject: x86inc: Optimize VEX instruction encoding
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8badb910847e94abb66686009e424bdce355c9f4;p=libx264

x86inc: Optimize VEX instruction encoding

Most VEX-encoded instructions require an additional byte to encode when src2
is a high register (e.g. x|ymm8..15). If the instruction is commutative we
can swap src1 and src2 when doing so reduces the instruction length, e.g.

    vpaddw xmm0, xmm0, xmm8 -> vpaddw xmm0, xmm8, xmm0
---

diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 49e73d65..280a9955 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -1240,9 +1240,40 @@ INIT_XMM
     %elif %0 >= 9
         __instr %6, %7, %8, %9
     %elif %0 == 8
-        __instr %6, %7, %8
+        %if avx_enabled && %5
+            %xdefine __src1 %7
+            %xdefine __src2 %8
+            %ifnum regnumof%7
+                %ifnum regnumof%8
+                    %if regnumof%7 < 8 && regnumof%8 >= 8 && regnumof%8 < 16 && sizeof%8 <= 32
+                        ; Most VEX-encoded instructions require an additional byte to encode when
+                        ; src2 is a high register (e.g. m8..15). If the instruction is commutative
+                        ; we can swap src1 and src2 when doing so reduces the instruction length.
+                        %xdefine __src1 %8
+                        %xdefine __src2 %7
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7, %8
+        %endif
     %elif %0 == 7
-        __instr %6, %7
+        %if avx_enabled && %5
+            %xdefine __src1 %6
+            %xdefine __src2 %7
+            %ifnum regnumof%6
+                %ifnum regnumof%7
+                    %if regnumof%6 < 8 && regnumof%7 >= 8 && regnumof%7 < 16 && sizeof%7 <= 32
+                        %xdefine __src1 %7
+                        %xdefine __src2 %6
+                    %endif
+                %endif
+            %endif
+            __instr %6, __src1, __src2
+        %else
+            __instr %6, %7
+        %endif
     %else
         __instr %6
     %endif