return NewLd;
};
- // LOAD - all consecutive load/undefs (must start/end with a load).
- // If we have found an entire vector of loads and undefs, then return a large
- // load of the entire vector width starting at the base pointer.
- // If the vector contains zeros, then attempt to shuffle those elements.
- if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
+ // Check if the base load is entirely dereferenceable.
+ bool IsDereferenceable =
+ LDBase &&
+ LDBase->getPointerInfo().isDereferenceable(
+ VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
+
+ // LOAD - all consecutive load/undefs (must start/end with a load or be
+ // entirely dereferenceable). If we have found an entire vector of loads and
+ // undefs, then return a large load of the entire vector width starting at the
+ // base pointer. If the vector contains zeros, then attempt to shuffle those
+ // elements.
+ if (FirstLoadedElt == 0 &&
+ (LastLoadedElt == (int)(NumElems - 1) || IsDereferenceable) &&
(IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
assert(LDBase && "Did not find base load for merging consecutive loads");
EVT EltVT = LDBase->getValueType(0);
if (NumElems == 1)
return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
- if (IsConsecutiveLoad)
+ if (!ZeroMask)
return CreateLoad(VT, LDBase);
// IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
// vector and a zero vector to clear out the zero elements.
- if (!isAfterLegalize && VT.isVector() && NumElems == VT.getVectorNumElements()) {
+ if (!isAfterLegalize && VT.isVector()) {
SmallVector<int, 4> ClearMask(NumElems, -1);
for (unsigned i = 0; i < NumElems; ++i) {
if (ZeroMask[i])
          ClearMask[i] = i + NumElems;
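
For context, a minimal IR sketch (illustrative names, mirroring the load_float4_float3 test below) of the pattern the new dereferenceability check enables: three consecutive scalar loads, with the last vector lane left undef. Previously the fold required the last element to be a real load; since the base pointer is known dereferenceable for the full 16 bytes, the whole <4 x float> can now be fetched with a single vector load (movups on SSE, vmovups on AVX).

  define <4 x float> @widen_deref_load(<4 x float>* dereferenceable(16) %p) {
    %p0 = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 0
    %p1 = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 1
    %p2 = getelementptr inbounds <4 x float>, <4 x float>* %p, i64 0, i64 2
    %f0 = load float, float* %p0, align 4
    %f1 = load float, float* %p1, align 4
    %f2 = load float, float* %p2, align 4
    ; lane 3 stays undef; dereferenceable(16) guarantees the trailing 4 bytes are readable
    %v0 = insertelement <4 x float> undef, float %f0, i32 0
    %v1 = insertelement <4 x float> %v0, float %f1, i32 1
    %v2 = insertelement <4 x float> %v1, float %f2, i32 2
    ret <4 x float> %v2
  }
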
define <4 x float> @load_float4_float3(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE2-LABEL: load_float4_float3:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_float4_float3:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_float4_float3:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT: retq
+; SSE-LABEL: load_float4_float3:
+; SSE: # %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: load_float4_float3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
%p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
%p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
}
define <8 x float> @load_float8_float3(<4 x float>* nocapture readonly dereferenceable(16)) {
-; SSE2-LABEL: load_float8_float3:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_float8_float3:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_float8_float3:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; SSE41-NEXT: retq
+; SSE-LABEL: load_float8_float3:
+; SSE: # %bb.0:
+; SSE-NEXT: movups (%rdi), %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: load_float8_float3:
; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
%p0 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 0
%p1 = getelementptr inbounds <4 x float>, <4 x float>* %0, i64 0, i64 1
;
; AVX-LABEL: load_double4_0u2u:
; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX-NEXT: vmovddup {{.*#+}} ymm0 = mem[0,0,2,2]
; AVX-NEXT: retq
%2 = load double, double* %0, align 8
%3 = insertelement <4 x double> undef, double %2, i32 0