; TODO stack_fold_pextrb
+; pextrw must not be naively folded into a stack spill: its memory form only
+; writes a 16-bit location, even though its register form can store to a
+; 32-bit register. The patterns below therefore expect the extract to target
+; a 32-bit GPR, followed by a separate 4-byte spill and a 4-byte reload,
+; rather than a single folded spill.
+define i16 @stack_fold_pextrw(<8 x i16> %a0) {
+; CHECK-LABEL: stack_fold_pextrw
+; CHECK: pextrw $1, {{%xmm[0-9][0-9]*}}, %[[GPR32:(e[a-z]+|r[0-9]+d)]]
+; CHECK: movl %[[GPR32]], {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Spill
+; CHECK: movl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+entry:
+; The vector add forces the execution domain.
+  %sum = add <8 x i16> %a0, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
+; Pull out lane 1; this is the value the test expects to see spilled.
+  %lane1 = extractelement <8 x i16> %sum, i32 1
+; The side-effecting nop clobbers every general-purpose register named in its
+; constraint list, so the extracted value is expected to live on the stack
+; across it.
+  %clobber = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i16 %lane1
+}
+
define i32 @stack_fold_pextrd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_pextrd
;CHECK: pextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
ret i64 %1
}
-; TODO stack_fold_pextrw
-
define <4 x i32> @stack_fold_phaddd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_phaddd
;CHECK: phaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload