Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion clang/include/clang/Basic/LangOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,6 @@ LANGOPT(OpenMPTargetBigJumpLoop , 1, 1, NotCompatible, "Use big jump loop code g
LANGOPT(OpenMPTargetNoLoop , 1, 1, NotCompatible, "Use no-loop code generation technique.")
LANGOPT(OpenMPTargetXteamReduction , 1, 1, NotCompatible, "Use cross-team code generation technique.")
LANGOPT(OpenMPTargetFastReduction , 1, 0, NotCompatible, "Use fast reduction code generation technique.")
LANGOPT(OpenMPTargetMultiDevice , 1, 0, NotCompatible, "Offload the iteration space of a single target region across multiple GPU devices.")

// The flag '-fopenmp-target-xteam-scan' triggers the 'Segmented Cross Team Scan' variant by default. To use the no-loop variant, please use the flag '-fopenmp-target-no-loop-scan' instead.
LANGOPT(OpenMPTargetXteamScan , 1, 0, NotCompatible, "Use the cross-team specialized kernel code generation for 'scan' directive.")
Expand Down
8 changes: 0 additions & 8 deletions clang/include/clang/Options/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -4170,14 +4170,6 @@ def fno_openmp_target_xteam_no_loop_scan : Flag<["-"], "fno-openmp-target-xteam-
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Do not use the no-loop variant of the cross-team specialized kernel code generation for 'scan' directive.">,
MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamNoLoopScan">>;
def fopenmp_target_multi_device : Flag<["-"], "fopenmp-target-multi-device">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option, FlangOption]>,
HelpText<"Enable code generation to emit support for multi device target region execution">,
MarshallingInfoFlag<LangOpts<"OpenMPTargetMultiDevice">>;
def fno_openmp_target_multi_device : Flag<["-"], "fno-openmp-target-multi-device">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option,FlangOption]>,
HelpText<"Do not use code generation to emit support for multi target offloading">,
MarshallingInfoFlag<LangOpts<"OpenMPTargetMultiDevice">>;

//===----------------------------------------------------------------------===//
// Shared cc1 + fc1 OpenMP Target Options
Expand Down
138 changes: 31 additions & 107 deletions clang/lib/CodeGen/CGOpenMPRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1073,8 +1073,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)

// The user forces the compiler to behave as if omp requires
// unified_shared_memory was given.
if (CGM.getLangOpts().OpenMPForceUSM ||
CGM.getLangOpts().OpenMPTargetMultiDevice) {
if (CGM.getLangOpts().OpenMPForceUSM) {
HasRequiresUnifiedSharedMemory = true;
OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true);
}
Expand Down Expand Up @@ -1238,8 +1237,7 @@ struct PushAndPopStackRAII {
static llvm::Function *emitParallelOrTeamsOutlinedFunction(
CodeGenModule &CGM, const OMPExecutableDirective &D, const CapturedStmt *CS,
const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
const StringRef OutlinedHelperName, const RegionCodeGenTy &CodeGen,
bool EmittingOutlinedTeams) {
const StringRef OutlinedHelperName, const RegionCodeGenTy &CodeGen) {
assert(ThreadIDVar->getType()->isPointerType() &&
"thread id variable must be of type kmp_int32 *");
CodeGenFunction CGF(CGM, true);
Expand Down Expand Up @@ -1270,8 +1268,7 @@ static llvm::Function *emitParallelOrTeamsOutlinedFunction(
CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
HasCancel, OutlinedHelperName);
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D,
EmittingOutlinedTeams, false);
return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D, D.getBeginLoc());
}

std::string CGOpenMPRuntime::getOutlinedHelperName(StringRef Name) const {
Expand All @@ -1295,7 +1292,7 @@ llvm::Function *CGOpenMPRuntime::emitParallelOutlinedFunction(
const CapturedStmt *CS = D.getCapturedStmt(OMPD_parallel);
return emitParallelOrTeamsOutlinedFunction(
CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(CGF),
CodeGen, /*EmittingOutlinedTeams*/ false);
CodeGen);
}

llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction(
Expand All @@ -1305,7 +1302,7 @@ llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction(
const CapturedStmt *CS = D.getCapturedStmt(OMPD_teams);
return emitParallelOrTeamsOutlinedFunction(
CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(CGF),
CodeGen, /*EmittingOutlinedTeams*/ true);
CodeGen);
}

llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction(
Expand Down Expand Up @@ -2735,37 +2732,19 @@ static void emitForStaticInitCall(
"expected static chunked schedule");
}

if (Values.IsMultiDevice) {
llvm::Value *Args[] = {
UpdateLocation,
ThreadId,
CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1,
M2)), // Schedule type
Values.IL.emitRawPointer(CGF), // &isLastIter
Values.MultiDeviceLB.emitRawPointer(CGF), // &MultiDeviceLB
Values.MultiDeviceUB.emitRawPointer(CGF), // &MultiDeviceUB
Values.LB.emitRawPointer(CGF), // &LB
Values.UB.emitRawPointer(CGF), // &UB
Values.ST.emitRawPointer(CGF), // &Stride
CGF.Builder.getIntN(Values.IVSize, 1), // Incr
Chunk // Chunk
};
CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
} else {
llvm::Value *Args[] = {
UpdateLocation,
ThreadId,
CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1,
M2)), // Schedule type
Values.IL.emitRawPointer(CGF), // &isLastIter
Values.LB.emitRawPointer(CGF), // &LB
Values.UB.emitRawPointer(CGF), // &UB
Values.ST.emitRawPointer(CGF), // &Stride
CGF.Builder.getIntN(Values.IVSize, 1), // Incr
Chunk // Chunk
};
CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
}
llvm::Value *Args[] = {
UpdateLocation,
ThreadId,
CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1,
M2)), // Schedule type
Values.IL.emitRawPointer(CGF), // &isLastIter
Values.LB.emitRawPointer(CGF), // &LB
Values.UB.emitRawPointer(CGF), // &UB
Values.ST.emitRawPointer(CGF), // &Stride
CGF.Builder.getIntN(Values.IVSize, 1), // Incr
Chunk // Chunk
};
CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
}

void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
Expand Down Expand Up @@ -2793,7 +2772,7 @@ void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
void CGOpenMPRuntime::emitDistributeStaticInit(
CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDistScheduleClauseKind SchedKind,
const CGOpenMPRuntime::StaticRTInput &Values, bool IsMultiDeviceKernel) {
const CGOpenMPRuntime::StaticRTInput &Values) {
OpenMPSchedType ScheduleNum =
getRuntimeSchedule(SchedKind, Values.Chunk != nullptr);
llvm::Value *UpdatedLocation =
Expand All @@ -2802,13 +2781,8 @@ void CGOpenMPRuntime::emitDistributeStaticInit(
llvm::FunctionCallee StaticInitFunction;
bool isGPUDistribute =
CGM.getLangOpts().OpenMPIsTargetDevice && CGM.getTriple().isGPU();
if (IsMultiDeviceKernel && isGPUDistribute) {
StaticInitFunction = OMPBuilder.createMDDistributeForStaticInitFunction(
Values.IVSize, Values.IVSigned);
} else {
StaticInitFunction = OMPBuilder.createForStaticInitFunction(
Values.IVSize, Values.IVSigned, isGPUDistribute);
}
StaticInitFunction = OMPBuilder.createForStaticInitFunction(
Values.IVSize, Values.IVSigned, isGPUDistribute);
emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
ScheduleNum, OMPC_SCHEDULE_MODIFIER_unknown,
OMPC_SCHEDULE_MODIFIER_unknown, Values);
Expand Down Expand Up @@ -6425,10 +6399,7 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
if (CGM.getLangOpts().OpenMPIsTargetDevice && !isGPU())
return CGF.GenerateOpenMPCapturedStmtFunctionAggregate(CS, D);
return CGF.GenerateOpenMPCapturedStmtFunction(
CS, D,
/*CanHaveMultiDeviceArgs*/ true,
/*IsTopKernel*/ true);
return CGF.GenerateOpenMPCapturedStmtFunction(CS, D, D.getBeginLoc());
};

cantFail(OMPBuilder.emitTargetRegionFunction(
Expand Down Expand Up @@ -10785,47 +10756,12 @@ emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
static void genMapInfoForCaptures(
MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
const CapturedStmt &CS, llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
llvm::SmallVectorImpl<llvm::Value *> &MultiTargetVars,
llvm::OpenMPIRBuilder &OMPBuilder,
llvm::DenseSet<CanonicalDeclPtr<const Decl>> &MappedVarSet,
uint32_t &CapturedCount,
MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
llvm::DenseMap<llvm::Value *, llvm::Value *> LambdaPointers;

// If a for statement is present and the compiler flag for multi-device
// targets is enabled then it means we have 2 variables at the start which
// represent the lower and upper bounds of the loop:
// TODO: add compiler flag condition
for (auto *MTV = MultiTargetVars.begin(); MTV != MultiTargetVars.end();
++MTV) {
// This should always be null because any used variable (if one exists)
// will be included when capturing the actual variables (not the
// multi-target ones).
MappedVarSet.insert(nullptr);

MappableExprsHandler::MapCombinedInfoTy CurInfo;
CurInfo.Exprs.push_back(nullptr);
CurInfo.BasePointers.push_back(*MTV);
CurInfo.Pointers.push_back(*MTV);
CurInfo.Sizes.push_back(llvm::ConstantInt::get(CGF.Int64Ty, 4));

// Copy to the device as an argument. No need to retrieve it.
CurInfo.Types.push_back(OpenMPOffloadMappingFlags::OMP_MAP_LITERAL |
OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM |
OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT);
CurInfo.Mappers.push_back(nullptr);

assert(CurInfo.BasePointers.size() == CurInfo.Pointers.size() &&
CurInfo.BasePointers.size() == CurInfo.Sizes.size() &&
CurInfo.BasePointers.size() == CurInfo.Types.size() &&
CurInfo.BasePointers.size() == CurInfo.Mappers.size() &&
"Inconsistent map information sizes!");

// We need to append the results of this capture to what we already
// have.
CombinedInfo.append(CurInfo);
}

auto RI = CS.getCapturedRecordDecl()->field_begin();
auto *CV = CapturedVars.begin();
CapturedCount = 0;
Expand Down Expand Up @@ -10952,15 +10888,14 @@ genMapInfo(MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
static void genMapInfo(const OMPExecutableDirective &D, CodeGenFunction &CGF,
const CapturedStmt &CS,
llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
llvm::SmallVectorImpl<llvm::Value *> &MultiTargetVars,
llvm::OpenMPIRBuilder &OMPBuilder,
uint32_t &CapturedCount,
MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
// Get mappable expression information.
MappableExprsHandler MEHandler(D, CGF);
llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;

genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars, MultiTargetVars,
genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars,
OMPBuilder, MappedVarSet, CapturedCount, CombinedInfo);
genMapInfo(MEHandler, CGF, CombinedInfo, OMPBuilder, MappedVarSet);
}
Expand All @@ -10985,7 +10920,6 @@ static void emitTargetCallKernelLaunch(
CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
const OMPExecutableDirective &D,
llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
llvm::SmallVectorImpl<llvm::Value *> &MultiTargetVars,
bool RequiresOuterTask, const CapturedStmt &CS, bool OffloadingMandatory,
llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo,
Expand All @@ -10999,7 +10933,7 @@ static void emitTargetCallKernelLaunch(
// Fill up the arrays with all the captured variables.
MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
uint32_t CapturedCount;
genMapInfo(D, CGF, CS, CapturedVars, MultiTargetVars, OMPBuilder,
genMapInfo(D, CGF, CS, CapturedVars, OMPBuilder,
CapturedCount, CombinedInfo);

// Array to hold the allocated XTeam reduction variables:
Expand Down Expand Up @@ -11038,8 +10972,6 @@ static void emitTargetCallKernelLaunch(
llvm::Value *XteamRedNumTeamsFromOccupancy = nullptr;
bool IsXteamRedFast = CGF.CGM.isXteamRedFast(FStmt);
// We don't need to allocate/initialize metadata in the fast version.
// TODO: This will not work for multi-target if we need to allocate
// data for each used device. Ensure conditions guard against that.
if (!IsXteamRedFast) {
// TODO Use device id from device clause, if any.
DevIdVal = CGF.EmitRuntimeCall(
Expand Down Expand Up @@ -11330,9 +11262,6 @@ static void emitTargetCallKernelLaunch(
bool IsReverseOffloading = Device.getInt() == OMPC_DEVICE_ancestor;

if (IsReverseOffloading) {
assert(
!CGF.CGM.getLangOpts().OpenMPTargetMultiDevice &&
"Cannot enable multi-device targets when doing reverse offloading");
// Reverse offloading is not supported, so just execute on the host.
// FIXME: This fallback solution is incorrect since it ignores the
// OMP_TARGET_OFFLOAD environment variable. Instead it would be better to
Expand Down Expand Up @@ -11397,12 +11326,9 @@ static void emitTargetCallKernelLaunch(
CGF.Builder.restoreIP(AfterIP);
};

if (RequiresOuterTask) {
assert(!CGM.getLangOpts().OpenMPTargetMultiDevice &&
"Cannot yet enable multi-device targets for situations in which an "
"outer task is required");
if (RequiresOuterTask)
CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
} else
else
OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);

if (HasXTeamReduction) {
Expand Down Expand Up @@ -11466,11 +11392,10 @@ void CGOpenMPRuntime::emitTargetCall(
needsTaskBasedThreadLimit(D.getDirectiveKind()) &&
D.hasClausesOfKind<OMPThreadLimitClause>());
llvm::SmallVector<llvm::Value *, 16> CapturedVars;
llvm::SmallVector<llvm::Value *, 4> MultiTargetVars;
const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target);
auto &&ArgsCodegen = [&CS, &D, &CapturedVars, &MultiTargetVars](
auto &&ArgsCodegen = [&CS, &D, &CapturedVars](
CodeGenFunction &CGF, PrePostActionTy &) {
CGF.GenerateOpenMPCapturedVarsDevice(CS, CapturedVars, MultiTargetVars,
CGF.GenerateOpenMPCapturedVars(CS, CapturedVars,
CGF.CGM.getOptKernelKey(D));
};
emitInlinedDirective(CGF, OMPD_unknown, ArgsCodegen);
Expand All @@ -11479,13 +11404,13 @@ void CGOpenMPRuntime::emitTargetCall(
llvm::Value *MapTypesArray = nullptr;
llvm::Value *MapNamesArray = nullptr;

auto &&TargetThenGen = [this, OutlinedFn, &D, &CapturedVars, &MultiTargetVars,
auto &&TargetThenGen = [this, OutlinedFn, &D, &CapturedVars,
RequiresOuterTask, &CS, OffloadingMandatory, Device,
OutlinedFnID, &InputInfo, &MapTypesArray,
&MapNamesArray, SizeEmitter](CodeGenFunction &CGF,
PrePostActionTy &) {
emitTargetCallKernelLaunch(
this, OutlinedFn, D, CapturedVars, MultiTargetVars, RequiresOuterTask,
this, OutlinedFn, D, CapturedVars, RequiresOuterTask,
CS, OffloadingMandatory, Device, OutlinedFnID, InputInfo, MapTypesArray,
MapNamesArray, SizeEmitter, CGF, CGM);
};
Expand Down Expand Up @@ -13602,8 +13527,7 @@ void CGOpenMPSIMDRuntime::emitForStaticInit(

void CGOpenMPSIMDRuntime::emitDistributeStaticInit(
CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDistScheduleClauseKind SchedKind, const StaticRTInput &Values,
bool IsMultiDeviceKernel) {
OpenMPDistScheduleClauseKind SchedKind, const StaticRTInput &Values) {
llvm_unreachable("Not supported in SIMD-only mode");
}

Expand Down
19 changes: 2 additions & 17 deletions clang/lib/CodeGen/CGOpenMPRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -959,17 +959,9 @@ class CGOpenMPRuntime {
bool IVSigned = false;
/// true if loop is ordered, false otherwise.
bool Ordered = false;
/// true if kernel is multi-device
bool IsMultiDevice = false;
Address IL = Address::invalid();
/// Address of the output variable in which the lower iteration number is
/// returned.
Address MultiDeviceLB = Address::invalid();
/// Address of the output variable in which the upper iteration number is
/// returned.
Address MultiDeviceUB = Address::invalid();
/// Address of the output variable in which the lower iteration number is
/// returned.
Address LB = Address::invalid();
/// Address of the output variable in which the upper iteration number is
/// returned.
Expand All @@ -985,11 +977,6 @@ class CGOpenMPRuntime {
llvm::Value *Chunk = nullptr)
: IVSize(IVSize), IVSigned(IVSigned), Ordered(Ordered), IL(IL), LB(LB),
UB(UB), ST(ST), Chunk(Chunk) {}
void setMultiDeviceLBUB(Address LB, Address UB) {
MultiDeviceLB = LB;
MultiDeviceUB = UB;
IsMultiDevice = true;
}
};
/// Call the appropriate runtime routine to initialize it before start
/// of loop.
Expand Down Expand Up @@ -1020,8 +1007,7 @@ class CGOpenMPRuntime {
virtual void emitDistributeStaticInit(CodeGenFunction &CGF,
SourceLocation Loc,
OpenMPDistScheduleClauseKind SchedKind,
const StaticRTInput &Values,
bool IsMultiDeviceKernel);
const StaticRTInput &Values);

/// Call the appropriate runtime routine to notify that we finished
/// iteration of the ordered loop with the dynamic scheduling.
Expand Down Expand Up @@ -1960,8 +1946,7 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
///
void emitDistributeStaticInit(CodeGenFunction &CGF, SourceLocation Loc,
OpenMPDistScheduleClauseKind SchedKind,
const StaticRTInput &Values,
bool IsMultiDeviceKernel) override;
const StaticRTInput &Values) override;

/// Call the appropriate runtime routine to notify that we finished
/// iteration of the ordered loop with the dynamic scheduling.
Expand Down
Loading
Loading