// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
"ror{b}\t$dst",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 7)))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
def ROR16r1 : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"ror{w}\t$dst",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 15)))]>, OpSize16;
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize16;
def ROR32r1 : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"ror{l}\t$dst",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 31)))]>, OpSize32;
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>, OpSize32;
def ROR64r1 : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
"ror{q}\t$dst",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 63)))]>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
} // Constraints = "$src = $dst", SchedRW
let SchedRW = [WriteRotateLd, WriteRMW] in {
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
"ror{b}\t$dst",
- [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)]>;
+ [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
def ROR16m1 : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
"ror{w}\t$dst",
- [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)]>,
+ [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
OpSize16;
def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
"ror{l}\t$dst",
- [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)]>,
+ [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>,
OpSize32;
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
- [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)]>,
+ [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
} // Defs = [EFLAGS]
+// Use the opposite rotate if it allows us to use the rotate by 1 instruction.
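+// (A rotate by bitwidth-1 in one direction is the same operation as a
+// rotate by 1 in the other: for i8, rotl(0b00000001, 7) = 0b10000000 =
+// rotr(0b00000001, 1). The rotate-by-1 forms above, opcodes 0xD0/0xD1,
+// take no immediate byte, so their encoding is one byte shorter.)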
+def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>;
+def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>;
+def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>;
+def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>;
+def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>;
+def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>;
+def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>;
+def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>;
+
+def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROR8m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROR16m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROR32m1 addr:$dst)>;
+def : Pat<(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROR64m1 addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(store (rotr (loadi8 addr:$dst), (i8 7)), addr:$dst),
+ (ROL8m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi16 addr:$dst), (i8 15)), addr:$dst),
+ (ROL16m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst),
+ (ROL32m1 addr:$dst)>;
+def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst),
+ (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>;
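+// The same equivalence applies to the RMW forms above: a memory rotate by
+// bitwidth-1 now selects the rotate-by-1 encoding, e.g. "rorb (%rdi)"
+// instead of "rorb $7, (%rdi)".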
+
// Sandy Bridge and newer Intel processors support faster rotates using
// SHLD to avoid a partial flag update on the normal rotate instructions.
let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
          (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
(SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+
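+ // SHRD with both inputs tied to the same register performs a rotate
+ // right, mirroring the SHLD-based rotl patterns above; e.g. rotr(x, 7)
+ // on i32 becomes "shrdl $7, %eax, %eax" (see the SHLD-prefixed tests
+ // below).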
+ def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
+ (SHRD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+ def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
+ (SHRD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
}
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
  // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
  return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
}]>;

def ROT64L2R_imm8 : SDNodeXForm<imm, [{
  // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
  return getI8Imm(64 - N->getZExtValue(), SDLoc(N));
}]>;

let Predicates = [HasBMI2] in {
// Prefer RORX which is non-destructive and doesn't update EFLAGS.
let AddedComplexity = 10 in {
+ def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, imm:$shamt)>;
+ def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, imm:$shamt)>;
+
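+ // rotl is handled by converting the amount: ROT32L2R_imm8 maps shamt to
+ // 32 - shamt, so rotl(x, 7) on i32 becomes e.g. "rorxl $25, %edi, %eax"
+ // (25 = 32 - 7); ROT64L2R_imm8 does the same modulo 64.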
def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
(RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
(RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
}
+ def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, imm:$shamt)>;
+ def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, imm:$shamt)>;
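+ // RORX also folds the load operand directly, e.g.
+ // "rorxl $7, (%rdi), %eax" in the BMI264 fshr_load test below, so no
+ // separate mov is needed.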
+
def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
(RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
// Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
- // immedidate shift, i.e. the following code is considered better
+ // immediate shift, i.e. the following code is considered better
//
// mov %edi, %esi
// shl $imm, %esi
; X32-SSE2-LABEL: rotr_i8_const_shift7:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %al
-; X32-SSE2-NEXT: rorb $7, %al
+; X32-SSE2-NEXT: rolb %al
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: rotr_i8_const_shift7:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edi, %eax
-; X64-AVX2-NEXT: rorb $7, %al
+; X64-AVX2-NEXT: rolb %al
; X64-AVX2-NEXT: # kill: def $al killed $al killed $eax
; X64-AVX2-NEXT: retq
%f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 7)
ret i8 %f
}
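; NOTE: fshr(x, x, 7) on i8 is a rotate right by 7, which equals a rotate
; left by 1, so the rotate-by-1 pattern now selects the immediate-free
; "rolb" form above.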
define i32 @fshr(i32 %x) nounwind {
-; CHECK32-LABEL: fshr:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: rorl $7, %eax
-; CHECK32-NEXT: retl
+; X86-LABEL: fshr:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: rorl $7, %eax
+; X86-NEXT: retl
;
-; CHECK64-LABEL: fshr:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movl %edi, %eax
-; CHECK64-NEXT: rorl $7, %eax
-; CHECK64-NEXT: retq
+; SHLD-LABEL: fshr:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: shrdl $7, %eax, %eax
+; SHLD-NEXT: retl
+;
+; BMI2-LABEL: fshr:
+; BMI2: # %bb.0:
+; BMI2-NEXT: rorxl $7, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; X64-LABEL: fshr:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: rorl $7, %eax
+; X64-NEXT: retq
+;
+; SHLD64-LABEL: fshr:
+; SHLD64: # %bb.0:
+; SHLD64-NEXT: movl %edi, %eax
+; SHLD64-NEXT: shrdl $7, %edi, %eax
+; SHLD64-NEXT: retq
+;
+; BMI264-LABEL: fshr:
+; BMI264: # %bb.0:
+; BMI264-NEXT: rorxl $7, %edi, %eax
+; BMI264-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 7)
ret i32 %f
}
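; The SHLD/SHLD64 and BMI2/BMI264 prefixes presumably correspond to RUN
; lines with the fast-shld-rotate and bmi2 features enabled; they exercise
; the SHRD and RORX paths added in the .td changes above.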
declare i32 @llvm.fshr.i32(i32, i32, i32)
define i32 @fshr1(i32 %x) nounwind {
-; CHECK32-LABEL: fshr1:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: rorl $1, %eax
-; CHECK32-NEXT: retl
+; X86-LABEL: fshr1:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: rorl $1, %eax
+; X86-NEXT: retl
;
-; CHECK64-LABEL: fshr1:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movl %edi, %eax
-; CHECK64-NEXT: rorl $1, %eax
-; CHECK64-NEXT: retq
+; SHLD-LABEL: fshr1:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: shrdl $1, %eax, %eax
+; SHLD-NEXT: retl
+;
+; BMI2-LABEL: fshr1:
+; BMI2: # %bb.0:
+; BMI2-NEXT: rorxl $1, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; X64-LABEL: fshr1:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: rorl $1, %eax
+; X64-NEXT: retq
+;
+; SHLD64-LABEL: fshr1:
+; SHLD64: # %bb.0:
+; SHLD64-NEXT: movl %edi, %eax
+; SHLD64-NEXT: shrdl $1, %edi, %eax
+; SHLD64-NEXT: retq
+;
+; BMI264-LABEL: fshr1:
+; BMI264: # %bb.0:
+; BMI264-NEXT: rorxl $1, %edi, %eax
+; BMI264-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 1)
ret i32 %f
}
define i32 @fshr31(i32 %x) nounwind {
-; CHECK32-LABEL: fshr31:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: rorl $31, %eax
-; CHECK32-NEXT: retl
+; X86-LABEL: fshr31:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: roll %eax
+; X86-NEXT: retl
;
-; CHECK64-LABEL: fshr31:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movl %edi, %eax
-; CHECK64-NEXT: rorl $31, %eax
-; CHECK64-NEXT: retq
+; SHLD-LABEL: fshr31:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: shrdl $31, %eax, %eax
+; SHLD-NEXT: retl
+;
+; BMI2-LABEL: fshr31:
+; BMI2: # %bb.0:
+; BMI2-NEXT: rorxl $31, {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: retl
+;
+; X64-LABEL: fshr31:
+; X64: # %bb.0:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: roll %eax
+; X64-NEXT: retq
+;
+; SHLD64-LABEL: fshr31:
+; SHLD64: # %bb.0:
+; SHLD64-NEXT: movl %edi, %eax
+; SHLD64-NEXT: shrdl $31, %edi, %eax
+; SHLD64-NEXT: retq
+;
+; BMI264-LABEL: fshr31:
+; BMI264: # %bb.0:
+; BMI264-NEXT: rorxl $31, %edi, %eax
+; BMI264-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 31)
ret i32 %f
}
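; "roll %eax" (opcode D1 /0) is one byte shorter than "rorl $31, %eax"
; (C1 /1 ib), which is the point of the new bitwidth-1 patterns.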
define i32 @fshr_load(i32* %p) nounwind {
-; CHECK32-LABEL: fshr_load:
-; CHECK32: # %bb.0:
-; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK32-NEXT: movl (%eax), %eax
-; CHECK32-NEXT: rorl $7, %eax
-; CHECK32-NEXT: retl
+; X86-LABEL: fshr_load:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: rorl $7, %eax
+; X86-NEXT: retl
;
-; CHECK64-LABEL: fshr_load:
-; CHECK64: # %bb.0:
-; CHECK64-NEXT: movl (%rdi), %eax
-; CHECK64-NEXT: rorl $7, %eax
-; CHECK64-NEXT: retq
+; SHLD-LABEL: fshr_load:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SHLD-NEXT: movl (%eax), %eax
+; SHLD-NEXT: shrdl $7, %eax, %eax
+; SHLD-NEXT: retl
+;
+; BMI2-LABEL: fshr_load:
+; BMI2: # %bb.0:
+; BMI2-NEXT: movl {{[0-9]+}}(%esp), %eax
+; BMI2-NEXT: rorxl $7, (%eax), %eax
+; BMI2-NEXT: retl
+;
+; X64-LABEL: fshr_load:
+; X64: # %bb.0:
+; X64-NEXT: movl (%rdi), %eax
+; X64-NEXT: rorl $7, %eax
+; X64-NEXT: retq
+;
+; SHLD64-LABEL: fshr_load:
+; SHLD64: # %bb.0:
+; SHLD64-NEXT: movl (%rdi), %eax
+; SHLD64-NEXT: shrdl $7, %eax, %eax
+; SHLD64-NEXT: retq
+;
+; BMI264-LABEL: fshr_load:
+; BMI264: # %bb.0:
+; BMI264-NEXT: rorxl $7, (%rdi), %eax
+; BMI264-NEXT: retq
%x = load i32, i32* %p
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 7)
ret i32 %f
}
define i64 @fshr(i64 %x) nounwind {
-; ALL-LABEL: fshr:
-; ALL: # %bb.0:
-; ALL-NEXT: movq %rdi, %rax
-; ALL-NEXT: rorq $7, %rax
-; ALL-NEXT: retq
+; X64-LABEL: fshr:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: rorq $7, %rax
+; X64-NEXT: retq
+;
+; SHLD-LABEL: fshr:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movq %rdi, %rax
+; SHLD-NEXT: shrdq $7, %rdi, %rax
+; SHLD-NEXT: retq
+;
+; BMI2-LABEL: fshr:
+; BMI2: # %bb.0:
+; BMI2-NEXT: rorxq $7, %rdi, %rax
+; BMI2-NEXT: retq
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 7)
ret i64 %f
}
declare i64 @llvm.fshr.i64(i64, i64, i64)
define i64 @fshr1(i64 %x) nounwind {
-; ALL-LABEL: fshr1:
-; ALL: # %bb.0:
-; ALL-NEXT: movq %rdi, %rax
-; ALL-NEXT: rorq $1, %rax
-; ALL-NEXT: retq
+; X64-LABEL: fshr1:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: rorq $1, %rax
+; X64-NEXT: retq
+;
+; SHLD-LABEL: fshr1:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movq %rdi, %rax
+; SHLD-NEXT: shrdq $1, %rdi, %rax
+; SHLD-NEXT: retq
+;
+; BMI2-LABEL: fshr1:
+; BMI2: # %bb.0:
+; BMI2-NEXT: rorxq $1, %rdi, %rax
+; BMI2-NEXT: retq
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 1)
ret i64 %f
}
define i64 @fshr63(i64 %x) nounwind {
-; ALL-LABEL: fshr63:
-; ALL: # %bb.0:
-; ALL-NEXT: movq %rdi, %rax
-; ALL-NEXT: rorq $63, %rax
-; ALL-NEXT: retq
+; X64-LABEL: fshr63:
+; X64: # %bb.0:
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: rolq %rax
+; X64-NEXT: retq
+;
+; SHLD-LABEL: fshr63:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movq %rdi, %rax
+; SHLD-NEXT: shrdq $63, %rdi, %rax
+; SHLD-NEXT: retq
+;
+; BMI2-LABEL: fshr63:
+; BMI2: # %bb.0:
+; BMI2-NEXT: rorxq $63, %rdi, %rax
+; BMI2-NEXT: retq
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 63)
ret i64 %f
}
define i64 @fshr_load(i64* %p) nounwind {
-; ALL-LABEL: fshr_load:
-; ALL: # %bb.0:
-; ALL-NEXT: movq (%rdi), %rax
-; ALL-NEXT: rorq $7, %rax
-; ALL-NEXT: retq
+; X64-LABEL: fshr_load:
+; X64: # %bb.0:
+; X64-NEXT: movq (%rdi), %rax
+; X64-NEXT: rorq $7, %rax
+; X64-NEXT: retq
+;
+; SHLD-LABEL: fshr_load:
+; SHLD: # %bb.0:
+; SHLD-NEXT: movq (%rdi), %rax
+; SHLD-NEXT: shrdq $7, %rax, %rax
+; SHLD-NEXT: retq
+;
+; BMI2-LABEL: fshr_load:
+; BMI2: # %bb.0:
+; BMI2-NEXT: rorxq $7, (%rdi), %rax
+; BMI2-NEXT: retq
%x = load i64, i64* %p
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 7)
ret i64 %f