@@ -3084,6 +3084,385 @@ void AccessAnalysis::processMemAccesses(bool UseDeferred) {
   }
 }
 
+/// \brief Checks memory dependences among accesses to the same underlying
+/// object to determine whether vectorization is legal or not (and at
+/// which vectorization factor).
+///
+/// This class works under the assumption that we already checked that memory
+/// locations with different underlying pointers are "must-not alias".
+/// We use the ScalarEvolution framework to symbolically evaluate pairs of
+/// access functions. Since we currently don't restructure the loop we can
+/// rely on the program order of memory accesses to determine their safety.
+/// At the moment we will only deem accesses as safe for:
+///  * A negative constant distance assuming program order.
+///
+///      Safe: tmp = a[i + 1];      OR     a[i + 1] = x;
+///            a[i] = tmp;                 y = a[i];
+///
+/// The latter case is safe because later checks guarantee that there can't
+/// be a cycle through a phi node (that is, we check that "x" and "y" are not
+/// the same variable: a header phi can only be an induction or a reduction, a
+/// reduction can't have a memory sink, an induction can't have a memory
+/// source). This is important and must not be violated (or we have to
+/// resort to checking for cycles through memory).
+///
+///  * A positive constant distance assuming program order that is bigger
+///    than the biggest memory access.
+///
+///      tmp = a[i]       OR              b[i] = x
+///      a[i+2] = tmp                     y = b[i+2];
+///
+///      Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
+///
+///  * Zero distances and all accesses have the same size.
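+///
+///      For example: a[i] = a[i] + x;
+///
+///      The load and the store cover the same bytes, so vectorizing keeps
+///      the load of each element ahead of the store to it.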
+///
+class MemoryDepChecker {
+public:
+  typedef std::pair<Value*, char> MemAccessInfo;
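+  // The second member is the access kind: addAccess below records a
+  // StoreInst to %p as (%p, true) and a LoadInst from %p as (%p, false).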
+
+  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L) :
+    SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0) {}
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a write access.
+  void addAccess(StoreInst *SI) {
+    Value *Ptr = SI->getPointerOperand();
+    Accesses[std::make_pair(Ptr, true)].push_back(AccessIdx);
+    InstMap.push_back(SI);
+    ++AccessIdx;
+  }
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a read access.
+  void addAccess(LoadInst *LI) {
+    Value *Ptr = LI->getPointerOperand();
+    Accesses[std::make_pair(Ptr, false)].push_back(AccessIdx);
+    InstMap.push_back(LI);
+    ++AccessIdx;
+  }
+
+  /// \brief Check whether the dependencies between the accesses are safe.
+  ///
+  /// Only checks sets with elements in \p CheckDeps.
+  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                   DenseSet<MemAccessInfo> &CheckDeps);
+
+  /// \brief The maximum number of bytes of a vector register we can
+  /// vectorize the accesses safely with.
+  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
+private:
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  const Loop *InnermostLoop;
+
+  /// \brief Maps access locations (ptr, read/write) to program order.
+  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
+
+  /// \brief Memory access instructions in program order.
+  SmallVector<Instruction *, 16> InstMap;
+
+  /// \brief The program order index to be used for the next instruction.
+  unsigned AccessIdx;
+
+  // We can access this many bytes in parallel safely.
+  unsigned MaxSafeDepDistBytes;
+
+  /// \brief Check whether there is a plausible dependence between the two
+  /// accesses.
+  ///
+  /// Access \p A must happen before \p B in program order. \p AIdx and
+  /// \p BIdx are the indices of the accesses in the program order map.
+  ///
+  /// This function checks whether there is a plausible dependence (or the
+  /// absence of such can't be proved) between the two accesses. If there is
+  /// a plausible dependence but the dependence distance is bigger than one
+  /// element access, it records this distance in \p MaxSafeDepDistBytes (if
+  /// this distance is smaller than any other distance encountered so far);
+  /// otherwise, it returns true signaling a possible dependence.
+  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
+                   const MemAccessInfo &B, unsigned BIdx);
+
+  /// \brief Check whether the data dependence could prevent store-load
+  /// forwarding.
+  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+};
+
+static bool isInBoundsGep(Value *Ptr) {
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+    return GEP->isInBounds();
+  return false;
+}
+
+/// \brief Check whether the access through \p Ptr has a constant stride.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+                        const Loop *Lp) {
+  const Type *PtrTy = Ptr->getType();
+  assert(PtrTy->isPointerTy() && "Unexpected non-ptr");
+
+  // Make sure that the pointer does not point to aggregate types.
+  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) {
+    DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type "
+          << *Ptr << "\n");
+    return 0;
+  }
+
+  const SCEV *PtrScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+  if (!AR) {
+    DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
+          << *Ptr << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  // The access function must stride over the innermost loop.
+  if (Lp != AR->getLoop()) {
+    DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop "
+          << *Ptr << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  // The address calculation must not wrap. Otherwise, a dependence could be
+  // inverted. An inbounds getelementptr that is an AddRec with a unit stride
+  // cannot wrap by definition. The unit stride requirement is checked later.
+  bool IsInBoundsGEP = isInBoundsGep(Ptr);
+  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
+  if (!IsNoWrapAddRec && !IsInBoundsGEP) {
+    DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
+          << *Ptr << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  // Check that the step is constant.
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C) {
+    DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr
+          << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  int64_t Size = DL->getTypeAllocSize(PtrTy->getPointerElementType());
+  const APInt &APStepVal = C->getValue()->getValue();
+
+  // Huge step value - give up.
+  if (APStepVal.getBitWidth() > 64)
+    return 0;
+
+  int64_t StepVal = APStepVal.getSExtValue();
+
+  // Strided access.
+  int64_t Stride = StepVal / Size;
+  int64_t Rem = StepVal % Size;
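+  // For instance, with i32 elements (Size == 4): a step of 4 bytes is the
+  // unit stride 1, a step of 8 is stride 2, and a step of 6 leaves a
+  // remainder and is rejected below.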
+  if (Rem)
+    return 0;
+
+  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
+  // know we can't "wrap around the address space".
+  if (!IsNoWrapAddRec && IsInBoundsGEP && Stride != 1 && Stride != -1)
+    return 0;
+
+  return Stride;
+}
+
+bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
+                                                    unsigned TypeByteSize) {
+  // If loads occur at a distance that is not a multiple of a feasible vector
+  // factor, store-load forwarding does not take place.
+  // Positive dependences might cause troubles because vectorizing them might
+  // prevent store-load forwarding making vectorized code run a lot slower.
+  //   a[i] = a[i-3] ^ a[i-8];
+  //   The stores to a[i:i+1] don't align with the loads from a[i-3:i-2] and
+  //   hence on your typical architecture store-load forwarding does not take
+  //   place. Vectorizing in such cases does not make sense.
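+  //   Worked example, assuming 4-byte elements (TypeByteSize == 4): the
+  //   a[i-3] dependence above gives Distance == 12. The loop below tries
+  //   vf == 8 first: 12 % 8 != 0 and 12 / 8 == 1 < 32, so the safe VF is
+  //   halved to 4 bytes, which is below the 2*TypeByteSize minimum and we
+  //   report a potential conflict.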
+  // Store-load forwarding distance.
+  const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
+  // Maximum vector factor.
+  unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
+  if (MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
+    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
+
+  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
+       vf *= 2) {
+    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
+      MaxVFWithoutSLForwardIssues = (vf >>= 1);
+      break;
+    }
+  }
+
+  if (MaxVFWithoutSLForwardIssues < 2*TypeByteSize) {
+    DEBUG(dbgs() << "LV: Distance " << Distance <<
+          " that could cause a store-load forwarding conflict\n");
+    return true;
+  }
+
+  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
+      MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
+    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
+  return false;
+}
+
+bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+                                   const MemAccessInfo &B, unsigned BIdx) {
+  assert(AIdx < BIdx && "Must pass arguments in program order");
+
+  Value *APtr = A.first;
+  Value *BPtr = B.first;
+  bool AIsWrite = A.second;
+  bool BIsWrite = B.second;
+
+  // Two reads are independent.
+  if (!AIsWrite && !BIsWrite)
+    return false;
+
+  const SCEV *AScev = SE->getSCEV(APtr);
+  const SCEV *BScev = SE->getSCEV(BPtr);
+
+  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
+  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);
+
+  const SCEV *Src = AScev;
+  const SCEV *Sink = BScev;
+
+  // If the induction step is negative we have to invert source and sink of
+  // the dependence.
+  if (StrideAPtr < 0) {
+    std::swap(APtr, BPtr);
+    std::swap(Src, Sink);
+    std::swap(AIsWrite, BIsWrite);
+    std::swap(AIdx, BIdx);
+    std::swap(StrideAPtr, StrideBPtr);
+  }
+
+  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
+
+  DEBUG(dbgs() << "LV: Src Scev: " << *Src << " Sink Scev: " << *Sink
+        << " (Induction step: " << StrideAPtr << ")\n");
+  DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
+        << *InstMap[BIdx] << ": " << *Dist << "\n");
+
+  // We need consecutive accesses. We don't want to vectorize
+  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap
+  // in the address space.
+  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
+    DEBUG(dbgs() << "LV: Non-consecutive pointer access\n");
+    return true;
+  }
+
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+  if (!C) {
+    DEBUG(dbgs() << "LV: Dependence because of non-constant distance\n");
+    return true;
+  }
+
+  Type *ATy = APtr->getType()->getPointerElementType();
+  Type *BTy = BPtr->getType()->getPointerElementType();
+  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
+
+  // Negative distances are not plausible dependencies.
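+  // For example, "tmp = a[i + 1]; a[i] = tmp;" with i32 elements yields a
+  // distance of -4 bytes from the load to the later store, which is safe
+  // given program order (see the class comment above).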
+  const APInt &Val = C->getValue()->getValue();
+  if (Val.isNegative()) {
+    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
+    if (IsTrueDataDependence &&
+        (couldPreventStoreLoadForward(Val.abs().getZExtValue(),
+                                      TypeByteSize) ||
+         ATy != BTy))
+      return true;
+
+    DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
+    return false;
+  }
+
+  // Write to the same location with the same size.
+  // Could be improved to assert type sizes are the same (i32 == float, etc).
+  if (Val == 0) {
+    if (ATy == BTy)
+      return false;
+    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
+    return true;
+  }
+
+  assert(Val.isStrictlyPositive() && "Expect a positive value");
+
+  // Positive distance bigger than max vectorization factor.
+  if (ATy != BTy) {
+    DEBUG(dbgs() <<
+          "LV: ReadWrite-Write positive dependency with different types\n");
+    return false;
+  }
+
+  unsigned Distance = (unsigned) Val.getZExtValue();
+
+  // Bail out early if passed-in parameters make vectorization not feasible.
+  unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
+  unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
+
+  // The distance must be bigger than the size needed for a vectorized
+  // version of the operation and the size of the vectorized operation must
+  // not be bigger than the current maximum size.
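+  // For instance (i32 elements, no forced factor or unroll): "a[i+2] = a[i];"
+  // has Distance == 8, which passes the checks below and permits a VF of two
+  // 4-byte elements; Distance == 4 would fail the 2*TypeByteSize test.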
+  if (Distance < 2*TypeByteSize ||
+      2*TypeByteSize > MaxSafeDepDistBytes ||
+      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
+    DEBUG(dbgs() << "LV: Failure because of Positive distance "
+          << Val.getSExtValue() << "\n");
+    return true;
+  }
+
+  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
+                        Distance : MaxSafeDepDistBytes;
+
+  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
+  if (IsTrueDataDependence &&
+      couldPreventStoreLoadForward(Distance, TypeByteSize))
+    return true;
+
+  DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
+        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << "\n");
+
+  return false;
+}
+
+bool
+MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                              DenseSet<MemAccessInfo> &CheckDeps) {
+
+  MaxSafeDepDistBytes = -1U;
+  while (!CheckDeps.empty()) {
+    MemAccessInfo CurAccess = *CheckDeps.begin();
+
+    // Get the relevant memory access set.
+    EquivalenceClasses<MemAccessInfo>::iterator I =
+      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
+
+    // Check accesses within this set.
+    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
+    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
+
+    // Check every access pair.
+    while (AI != AE) {
+      CheckDeps.erase(*AI);
+      EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
+      while (OI != AE) {
+        // Check every accessing instruction pair in program order.
+        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
+             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
+          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
+               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
+            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2))
+              return false;
+            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1))
+              return false;
+          }
+        ++OI;
+      }
+      ++AI;
+    }
+  }
+  return true;
+}
+
 AliasAnalysis::Location
 LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
   if (StoreInst *Store = dyn_cast<StoreInst>(Inst))