diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index f87f3fb203916..b717c3f53c0b5 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -241,7 +241,6 @@ LANGOPT(OpenMPTargetBigJumpLoop , 1, 1, NotCompatible, "Use big jump loop code g LANGOPT(OpenMPTargetNoLoop , 1, 1, NotCompatible, "Use no-loop code generation technique.") LANGOPT(OpenMPTargetXteamReduction , 1, 1, NotCompatible, "Use cross-team code generation technique.") LANGOPT(OpenMPTargetFastReduction , 1, 0, NotCompatible, "Use fast reduction code generation technique.") -LANGOPT(OpenMPTargetMultiDevice , 1, 0, NotCompatible, "Offload the iteration space of a single target region across multiple GPU devices.") // The flag '-fopenmp-target-xteam-scan' triggers the 'Segmented Cross Team Scan' variant by default. To use the no-loop variant, please use the flag '-fopenmp-target-no-loop-scan' instead. LANGOPT(OpenMPTargetXteamScan , 1, 0, NotCompatible, "Use the cross-team specialized kernel code generation for 'scan' directive.") diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 792a3ca9d7806..e6ac0dc25e8dc 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -4170,14 +4170,6 @@ def fno_openmp_target_xteam_no_loop_scan : Flag<["-"], "fno-openmp-target-xteam- Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Do not use the no-loop variant of the cross-team specialized kernel code generation for 'scan' directive.">, MarshallingInfoFlag>; -def fopenmp_target_multi_device : Flag<["-"], "fopenmp-target-multi-device">, Group, - Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option, FlangOption]>, - HelpText<"Enable code generation to emit support for multi device target region execution">, - MarshallingInfoFlag>; -def fno_openmp_target_multi_device : Flag<["-"], "fno-openmp-target-multi-device">, Group, - Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option,FlangOption]>, - HelpText<"Do not use code generation to emit support for multi target offloading">, - MarshallingInfoFlag>; //===----------------------------------------------------------------------===// // Shared cc1 + fc1 OpenMP Target Options diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 003bc11c467de..266dda575ee9b 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1073,8 +1073,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) // The user forces the compiler to behave as if omp requires // unified_shared_memory was given. - if (CGM.getLangOpts().OpenMPForceUSM || - CGM.getLangOpts().OpenMPTargetMultiDevice) { + if (CGM.getLangOpts().OpenMPForceUSM) { HasRequiresUnifiedSharedMemory = true; OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true); } @@ -1238,8 +1237,7 @@ struct PushAndPopStackRAII { static llvm::Function *emitParallelOrTeamsOutlinedFunction( CodeGenModule &CGM, const OMPExecutableDirective &D, const CapturedStmt *CS, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, - const StringRef OutlinedHelperName, const RegionCodeGenTy &CodeGen, - bool EmittingOutlinedTeams) { + const StringRef OutlinedHelperName, const RegionCodeGenTy &CodeGen) { assert(ThreadIDVar->getType()->isPointerType() && "thread id variable must be of type kmp_int32 *"); CodeGenFunction CGF(CGM, true); @@ -1270,8 +1268,7 @@ static llvm::Function *emitParallelOrTeamsOutlinedFunction( CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind, HasCancel, OutlinedHelperName); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); - return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D, - EmittingOutlinedTeams, false); + return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D, D.getBeginLoc()); } std::string CGOpenMPRuntime::getOutlinedHelperName(StringRef Name) const { @@ -1295,7 +1292,7 @@ llvm::Function *CGOpenMPRuntime::emitParallelOutlinedFunction( const CapturedStmt *CS = D.getCapturedStmt(OMPD_parallel); return emitParallelOrTeamsOutlinedFunction( CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(CGF), - CodeGen, /*EmittingOutlinedTeams*/ false); + CodeGen); } llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction( @@ -1305,7 +1302,7 @@ llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction( const CapturedStmt *CS = D.getCapturedStmt(OMPD_teams); return emitParallelOrTeamsOutlinedFunction( CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(CGF), - CodeGen, /*EmittingOutlinedTeams*/ true); + CodeGen); } llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction( @@ -2735,37 +2732,19 @@ static void emitForStaticInitCall( "expected static chunked schedule"); } - if (Values.IsMultiDevice) { - llvm::Value *Args[] = { - UpdateLocation, - ThreadId, - CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1, - M2)), // Schedule type - Values.IL.emitRawPointer(CGF), // &isLastIter - Values.MultiDeviceLB.emitRawPointer(CGF), // &MultiDeviceLB - Values.MultiDeviceUB.emitRawPointer(CGF), // &MultiDeviceUB - Values.LB.emitRawPointer(CGF), // &LB - Values.UB.emitRawPointer(CGF), // &UB - Values.ST.emitRawPointer(CGF), // &Stride - CGF.Builder.getIntN(Values.IVSize, 1), // Incr - Chunk // Chunk - }; - CGF.EmitRuntimeCall(ForStaticInitFunction, Args); - } else { - llvm::Value *Args[] = { - UpdateLocation, - ThreadId, - CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1, - M2)), // Schedule type - Values.IL.emitRawPointer(CGF), // &isLastIter - Values.LB.emitRawPointer(CGF), // &LB - Values.UB.emitRawPointer(CGF), // &UB - Values.ST.emitRawPointer(CGF), // &Stride - CGF.Builder.getIntN(Values.IVSize, 1), // Incr - Chunk // Chunk - }; - CGF.EmitRuntimeCall(ForStaticInitFunction, Args); - } + llvm::Value *Args[] = { + UpdateLocation, + ThreadId, + CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1, + M2)), // Schedule type + Values.IL.emitRawPointer(CGF), // &isLastIter + Values.LB.emitRawPointer(CGF), // &LB + Values.UB.emitRawPointer(CGF), // &UB + Values.ST.emitRawPointer(CGF), // &Stride + CGF.Builder.getIntN(Values.IVSize, 1), // Incr + Chunk // Chunk + }; + CGF.EmitRuntimeCall(ForStaticInitFunction, Args); } void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF, @@ -2793,7 +2772,7 @@ void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF, void CGOpenMPRuntime::emitDistributeStaticInit( CodeGenFunction &CGF, SourceLocation Loc, OpenMPDistScheduleClauseKind SchedKind, - const CGOpenMPRuntime::StaticRTInput &Values, bool IsMultiDeviceKernel) { + const CGOpenMPRuntime::StaticRTInput &Values) { OpenMPSchedType ScheduleNum = getRuntimeSchedule(SchedKind, Values.Chunk != nullptr); llvm::Value *UpdatedLocation = @@ -2802,13 +2781,8 @@ void CGOpenMPRuntime::emitDistributeStaticInit( llvm::FunctionCallee StaticInitFunction; bool isGPUDistribute = CGM.getLangOpts().OpenMPIsTargetDevice && CGM.getTriple().isGPU(); - if (IsMultiDeviceKernel && isGPUDistribute) { - StaticInitFunction = OMPBuilder.createMDDistributeForStaticInitFunction( - Values.IVSize, Values.IVSigned); - } else { - StaticInitFunction = OMPBuilder.createForStaticInitFunction( - Values.IVSize, Values.IVSigned, isGPUDistribute); - } + StaticInitFunction = OMPBuilder.createForStaticInitFunction( + Values.IVSize, Values.IVSigned, isGPUDistribute); emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction, ScheduleNum, OMPC_SCHEDULE_MODIFIER_unknown, OMPC_SCHEDULE_MODIFIER_unknown, Values); @@ -6425,10 +6399,7 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo); if (CGM.getLangOpts().OpenMPIsTargetDevice && !isGPU()) return CGF.GenerateOpenMPCapturedStmtFunctionAggregate(CS, D); - return CGF.GenerateOpenMPCapturedStmtFunction( - CS, D, - /*CanHaveMultiDeviceArgs*/ true, - /*IsTopKernel*/ true); + return CGF.GenerateOpenMPCapturedStmtFunction(CS, D, D.getBeginLoc()); }; cantFail(OMPBuilder.emitTargetRegionFunction( @@ -10785,47 +10756,12 @@ emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) { static void genMapInfoForCaptures( MappableExprsHandler &MEHandler, CodeGenFunction &CGF, const CapturedStmt &CS, llvm::SmallVectorImpl &CapturedVars, - llvm::SmallVectorImpl &MultiTargetVars, llvm::OpenMPIRBuilder &OMPBuilder, llvm::DenseSet> &MappedVarSet, uint32_t &CapturedCount, MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) { llvm::DenseMap LambdaPointers; - // If a for statement is present and the compiler flag for multi-device - // targets is enabled then it means we have 2 variables at the start which - // represent the lower and upper bounds of the loop: - // TODO: add compiler flag condition - for (auto *MTV = MultiTargetVars.begin(); MTV != MultiTargetVars.end(); - ++MTV) { - // This should always be null because the any used variable (if one exists) - // will be included when capturing the actual variables (not the - // multi-target ones). - MappedVarSet.insert(nullptr); - - MappableExprsHandler::MapCombinedInfoTy CurInfo; - CurInfo.Exprs.push_back(nullptr); - CurInfo.BasePointers.push_back(*MTV); - CurInfo.Pointers.push_back(*MTV); - CurInfo.Sizes.push_back(llvm::ConstantInt::get(CGF.Int64Ty, 4)); - - // Copy to the device as an argument. No need to retrieve it. - CurInfo.Types.push_back(OpenMPOffloadMappingFlags::OMP_MAP_LITERAL | - OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM | - OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT); - CurInfo.Mappers.push_back(nullptr); - - assert(CurInfo.BasePointers.size() == CurInfo.Pointers.size() && - CurInfo.BasePointers.size() == CurInfo.Sizes.size() && - CurInfo.BasePointers.size() == CurInfo.Types.size() && - CurInfo.BasePointers.size() == CurInfo.Mappers.size() && - "Inconsistent map information sizes!"); - - // We need to append the results of this capture to what we already - // have. - CombinedInfo.append(CurInfo); - } - auto RI = CS.getCapturedRecordDecl()->field_begin(); auto *CV = CapturedVars.begin(); CapturedCount = 0; @@ -10952,7 +10888,6 @@ genMapInfo(MappableExprsHandler &MEHandler, CodeGenFunction &CGF, static void genMapInfo(const OMPExecutableDirective &D, CodeGenFunction &CGF, const CapturedStmt &CS, llvm::SmallVectorImpl &CapturedVars, - llvm::SmallVectorImpl &MultiTargetVars, llvm::OpenMPIRBuilder &OMPBuilder, uint32_t &CapturedCount, MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) { @@ -10960,7 +10895,7 @@ static void genMapInfo(const OMPExecutableDirective &D, CodeGenFunction &CGF, MappableExprsHandler MEHandler(D, CGF); llvm::DenseSet> MappedVarSet; - genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars, MultiTargetVars, + genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars, OMPBuilder, MappedVarSet, CapturedCount, CombinedInfo); genMapInfo(MEHandler, CGF, CombinedInfo, OMPBuilder, MappedVarSet); } @@ -10985,7 +10920,6 @@ static void emitTargetCallKernelLaunch( CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn, const OMPExecutableDirective &D, llvm::SmallVectorImpl &CapturedVars, - llvm::SmallVectorImpl &MultiTargetVars, bool RequiresOuterTask, const CapturedStmt &CS, bool OffloadingMandatory, llvm::PointerIntPair Device, llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo, @@ -10999,7 +10933,7 @@ static void emitTargetCallKernelLaunch( // Fill up the arrays with all the captured variables. MappableExprsHandler::MapCombinedInfoTy CombinedInfo; uint32_t CapturedCount; - genMapInfo(D, CGF, CS, CapturedVars, MultiTargetVars, OMPBuilder, + genMapInfo(D, CGF, CS, CapturedVars, OMPBuilder, CapturedCount, CombinedInfo); // Array to hold to allocated XTeam reduction variables: @@ -11038,8 +10972,6 @@ static void emitTargetCallKernelLaunch( llvm::Value *XteamRedNumTeamsFromOccupancy = nullptr; bool IsXteamRedFast = CGF.CGM.isXteamRedFast(FStmt); // We don't need to allocate/initialize metadata in the fast version. - // TODO: This will not work for multi-target if we need to allocate - // data for each used device. Ensure conditions guard against that. if (!IsXteamRedFast) { // TODO Use device id from device clause, if any. DevIdVal = CGF.EmitRuntimeCall( @@ -11330,9 +11262,6 @@ static void emitTargetCallKernelLaunch( bool IsReverseOffloading = Device.getInt() == OMPC_DEVICE_ancestor; if (IsReverseOffloading) { - assert( - !CGF.CGM.getLangOpts().OpenMPTargetMultiDevice && - "Cannot enable multi-device targets when doing reverse offloading"); // Reverse offloading is not supported, so just execute on the host. // FIXME: This fallback solution is incorrect since it ignores the // OMP_TARGET_OFFLOAD environment variable. Instead it would be better to @@ -11397,12 +11326,9 @@ static void emitTargetCallKernelLaunch( CGF.Builder.restoreIP(AfterIP); }; - if (RequiresOuterTask) { - assert(!CGM.getLangOpts().OpenMPTargetMultiDevice && - "Cannot yet enable multi-device targets for situations in which an " - "outer task is required"); + if (RequiresOuterTask) CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo); - } else + else OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen); if (HasXTeamReduction) { @@ -11466,11 +11392,10 @@ void CGOpenMPRuntime::emitTargetCall( needsTaskBasedThreadLimit(D.getDirectiveKind()) && D.hasClausesOfKind()); llvm::SmallVector CapturedVars; - llvm::SmallVector MultiTargetVars; const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target); - auto &&ArgsCodegen = [&CS, &D, &CapturedVars, &MultiTargetVars]( + auto &&ArgsCodegen = [&CS, &D, &CapturedVars]( CodeGenFunction &CGF, PrePostActionTy &) { - CGF.GenerateOpenMPCapturedVarsDevice(CS, CapturedVars, MultiTargetVars, + CGF.GenerateOpenMPCapturedVars(CS, CapturedVars, CGF.CGM.getOptKernelKey(D)); }; emitInlinedDirective(CGF, OMPD_unknown, ArgsCodegen); @@ -11479,13 +11404,13 @@ void CGOpenMPRuntime::emitTargetCall( llvm::Value *MapTypesArray = nullptr; llvm::Value *MapNamesArray = nullptr; - auto &&TargetThenGen = [this, OutlinedFn, &D, &CapturedVars, &MultiTargetVars, + auto &&TargetThenGen = [this, OutlinedFn, &D, &CapturedVars, RequiresOuterTask, &CS, OffloadingMandatory, Device, OutlinedFnID, &InputInfo, &MapTypesArray, &MapNamesArray, SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) { emitTargetCallKernelLaunch( - this, OutlinedFn, D, CapturedVars, MultiTargetVars, RequiresOuterTask, + this, OutlinedFn, D, CapturedVars, RequiresOuterTask, CS, OffloadingMandatory, Device, OutlinedFnID, InputInfo, MapTypesArray, MapNamesArray, SizeEmitter, CGF, CGM); }; @@ -13602,8 +13527,7 @@ void CGOpenMPSIMDRuntime::emitForStaticInit( void CGOpenMPSIMDRuntime::emitDistributeStaticInit( CodeGenFunction &CGF, SourceLocation Loc, - OpenMPDistScheduleClauseKind SchedKind, const StaticRTInput &Values, - bool IsMultiDeviceKernel) { + OpenMPDistScheduleClauseKind SchedKind, const StaticRTInput &Values) { llvm_unreachable("Not supported in SIMD-only mode"); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 272cc636f98f4..431cbc1ed72e1 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -959,17 +959,9 @@ class CGOpenMPRuntime { bool IVSigned = false; /// true if loop is ordered, false otherwise. bool Ordered = false; - /// true if kernel is multi-device - bool IsMultiDevice = false; Address IL = Address::invalid(); /// Address of the output variable in which the lower iteration number is /// returned. - Address MultiDeviceLB = Address::invalid(); - /// Address of the output variable in which the upper iteration number is - /// returned. - Address MultiDeviceUB = Address::invalid(); - /// Address of the output variable in which the lower iteration number is - /// returned. Address LB = Address::invalid(); /// Address of the output variable in which the upper iteration number is /// returned. @@ -985,11 +977,6 @@ class CGOpenMPRuntime { llvm::Value *Chunk = nullptr) : IVSize(IVSize), IVSigned(IVSigned), Ordered(Ordered), IL(IL), LB(LB), UB(UB), ST(ST), Chunk(Chunk) {} - void setMultiDeviceLBUB(Address LB, Address UB) { - MultiDeviceLB = LB; - MultiDeviceUB = UB; - IsMultiDevice = true; - } }; /// Call the appropriate runtime routine to initialize it before start /// of loop. @@ -1020,8 +1007,7 @@ class CGOpenMPRuntime { virtual void emitDistributeStaticInit(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDistScheduleClauseKind SchedKind, - const StaticRTInput &Values, - bool IsMultiDeviceKernel); + const StaticRTInput &Values); /// Call the appropriate runtime routine to notify that we finished /// iteration of the ordered loop with the dynamic scheduling. @@ -1960,8 +1946,7 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime { /// void emitDistributeStaticInit(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDistScheduleClauseKind SchedKind, - const StaticRTInput &Values, - bool IsMultiDeviceKernel) override; + const StaticRTInput &Values) override; /// Call the appropriate runtime routine to notify that we finished /// iteration of the ordered loop with the dynamic scheduling. diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 0cf2083c38c06..4aa1d613ea377 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -718,18 +718,6 @@ static void setPropertyWorkGroupSize(CodeGenModule &CGM, StringRef Name, CGM.addCompilerUsedGlobal(GVMode); } -// Create a unique global variable to indicate if the kernel is multi-device. -static void setMultiDeviceStatus(CodeGenModule &CGM, StringRef Name, - int IsMultiDevice) { - auto *GVMode = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, - /*isConstant=*/true, llvm::GlobalValue::WeakAnyLinkage, - llvm::ConstantInt::get(CGM.Int8Ty, IsMultiDevice), - Twine(Name, "_multi_device")); - - CGM.addCompilerUsedGlobal(GVMode); -} - // Compute the correct number of threads in a team // to accommodate for a master thread. // Keep aligned with amdgpu plugin code located in function getLaunchVals @@ -812,9 +800,6 @@ void CGOpenMPRuntimeGPU::GenerateMetaData(CodeGenModule &CGM, } // Emit a kernel descriptor for runtime. setPropertyWorkGroupSize(CGM, OutlinedFn->getName(), FlatAttr); - - // Emit multi-device flag for this kernel. - setMultiDeviceStatus(CGM, OutlinedFn->getName(), CGM.isMultiDeviceKernel(D)); } void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D, @@ -1431,20 +1416,6 @@ void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF, else OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF)); OutlinedFnArgs.push_back(ZeroAddr.getPointer()); - - // If this is a kernel we can run on multiple devices then we need to add - // the arguments for multi-device targets. This is needed for the case when - // we emit an outlined teams function which needs to be passed the multi - // device LB and UB. - if (CGM.isMultiDeviceKernel(D)) { - Address LBAddr = - CGF.GetAddrOfLocalVar(CGM.getMultiDeviceLBArg(D, CGF.CurFn)); - OutlinedFnArgs.push_back(CGF.Builder.CreateLoad(LBAddr)); - Address UBAddr = - CGF.GetAddrOfLocalVar(CGM.getMultiDeviceUBArg(D, CGF.CurFn)); - OutlinedFnArgs.push_back(CGF.Builder.CreateLoad(UBAddr)); - } - OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); } @@ -2944,11 +2915,8 @@ llvm::Value *CGOpenMPRuntimeGPU::getXteamRedOperation( SentinelVal, ThreadStartIndex, NumTeams, - CGF.CGM.getLangOpts().OpenMPTargetMultiDevice - ? llvm::ConstantInt::get(CGF.CGM.Int32Ty, - 0) /* __MEMORY_SCOPE_SYSTEM */ - : llvm::ConstantInt::get(CGF.CGM.Int32Ty, - 1) /* __MEMORY_SCOPE_DEVICE */}; + llvm::ConstantInt::get(CGF.CGM.Int32Ty,1) + /* __MEMORY_SCOPE_DEVICE */}; unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size; assert(WarpSize == 32 || WarpSize == 64); diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 096955bd13c3b..f5c80acbe0b5e 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -80,7 +80,7 @@ CodeGenFunction::EmitBigJumpLoopStartingIndex(const ForStmt &FStmt, assert(Directives.size() > 0 && isa(Directives.back()) && "Appropriate directive not found"); const OMPLoopDirective &LD = *(cast(Directives.back())); - std::pair IVPair = EmitNoLoopIV(LD, Args); + std::pair IVPair = EmitNoLoopIV(LD); const VarDecl *LoopVD = IVPair.first; Address IvAddr = IVPair.second; @@ -110,15 +110,7 @@ CodeGenFunction::EmitBigJumpLoopStartingIndex(const ForStmt &FStmt, llvm::Value *Gtid = Builder.CreateIntCast(GlobalGpuThreadId, IvAddr.getElementType(), false); - llvm::Value *Iv = nullptr; - if (CGM.isMultiDeviceKernel(&FStmt)) { - Iv = Builder.CreateAdd( - Gtid, - Builder.CreateIntCast(Builder.CreateLoad(GetAddrOfLocalVar((*Args)[0])), - IvAddr.getElementType(), false)); - } else { - Iv = Builder.CreateAdd(Gtid, Builder.CreateLoad(IvAddr)); - } + llvm::Value *Iv = Builder.CreateAdd(Gtid, Builder.CreateLoad(IvAddr)); if (CGM.isXteamRedKernel(&FStmt)) { // Cache the thread specific initial loop iteration value and the number of @@ -179,8 +171,7 @@ void CodeGenFunction::EmitBigJumpLoopInc(const ForStmt &FStmt, } std::pair -CodeGenFunction::EmitNoLoopIV(const OMPLoopDirective &LD, - const FunctionArgList *Args) { +CodeGenFunction::EmitNoLoopIV(const OMPLoopDirective &LD) { // Emit the original loop indices for (const Expr *CE : LD.counters()) { const auto *CEDecl = cast(cast(CE)->getDecl()); @@ -228,21 +219,6 @@ CodeGenFunction::EmitNoLoopIV(const OMPLoopDirective &LD, // Emit init of the iteration variable EmitIgnoredExpr(LD.getInit()); - // If multi-device targets are enabled, overwrite the LB and UB - // initialization with the values passed in as arguments in positions 0 and 1 - // respectively: - if (CGM.isMultiDeviceKernel(LD)) { - llvm::Value *LBMultiTarget = Builder.CreateIntCast( - Builder.CreateLoad(GetAddrOfLocalVar((*Args)[0])), - GetAddrOfLocalVar(IVDecl).getElementType(), false); - Builder.CreateStore(LBMultiTarget, GetAddrOfLocalVar(LBDecl)); - Builder.CreateStore(LBMultiTarget, GetAddrOfLocalVar(IVDecl)); - llvm::Value *UBMultiTarget = Builder.CreateIntCast( - Builder.CreateLoad(GetAddrOfLocalVar((*Args)[1])), - GetAddrOfLocalVar(IVDecl).getElementType(), false); - Builder.CreateStore(UBMultiTarget, GetAddrOfLocalVar(UBDecl)); - } - return std::make_pair(IVDecl, GetAddrOfLocalVar(IVDecl)); } @@ -332,18 +308,17 @@ void CodeGenFunction::EmitOptKernelCode( llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_XTEAM_RED); if (OptKernelMode == llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP) - EmitNoLoopCode(D, CapturedForStmt, Loc, Args); + EmitNoLoopCode(D, CapturedForStmt, Loc); else if (OptKernelMode == llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD_BIG_JUMP_LOOP) - EmitBigJumpLoopCode(D, CapturedForStmt, Loc, Args); + EmitBigJumpLoopCode(D, CapturedForStmt, Loc); else EmitXteamRedCode(D, CapturedForStmt, Loc, Args); } void CodeGenFunction::EmitNoLoopCode(const OMPExecutableDirective &D, const ForStmt *CapturedForStmt, - SourceLocation Loc, - const FunctionArgList *Args) { + SourceLocation Loc) { assert(isa(D) && "Unexpected directive"); const OMPLoopDirective &LD = cast(D); @@ -352,7 +327,7 @@ void CodeGenFunction::EmitNoLoopCode(const OMPExecutableDirective &D, // Initialize a specialized kernel. RT.initSpecializedKernel(*this); - auto IVPair = EmitNoLoopIV(LD, Args); + auto IVPair = EmitNoLoopIV(LD); const VarDecl *IVDecl = IVPair.first; Address IvAddr = IVPair.second; @@ -380,16 +355,9 @@ void CodeGenFunction::EmitNoLoopCode(const OMPExecutableDirective &D, // initialized llvm::Value *Gtid = Builder.CreateIntCast(GlobalGpuThreadId, IvAddr.getElementType(), false); - if (CGM.isMultiDeviceKernel(D)) { - llvm::Value *Iv = Builder.CreateAdd( - Gtid, - Builder.CreateIntCast(Builder.CreateLoad(GetAddrOfLocalVar((*Args)[0])), - IvAddr.getElementType(), false)); - Builder.CreateStore(Iv, IvAddr); - } else { - llvm::Value *Iv = Builder.CreateAdd(Gtid, Builder.CreateLoad(IvAddr)); - Builder.CreateStore(Iv, IvAddr); - } + + llvm::Value *Iv = Builder.CreateAdd(Gtid, Builder.CreateLoad(IvAddr)); + Builder.CreateStore(Iv, IvAddr); // Emit updates of the original loop indices for (const Expr *UE : LD.updates()) @@ -440,7 +408,7 @@ void CodeGenFunction::EmitNoLoopXteamScanInit(const OMPLoopDirective &LD, llvm::Value *&GlobalGpuThreadId, llvm::Value *&WorkGroupId, llvm::Value *&TotalNumThreads) { - auto IVPair = EmitNoLoopIV(LD, Args); + auto IVPair = EmitNoLoopIV(LD); Address OMPIterationVarAddr = IVPair.second; // Generate: @@ -590,44 +558,11 @@ void CodeGenFunction::EmitNoLoopXteamScanPhaseTwoCode( void CodeGenFunction::EmitBigJumpLoopCode(const OMPExecutableDirective &D, const ForStmt *CapturedForStmt, - SourceLocation Loc, - const FunctionArgList *Args) { + SourceLocation Loc) { auto &RT = static_cast(CGM.getOpenMPRuntime()); // Initialize a specialized kernel. RT.initSpecializedKernel(*this); - - // Add pre-processing code from start of EmitStmt function so that the - // code path is identical. - assert(CapturedForStmt && "Null statement?"); - PGO->setCurrentStmt(CapturedForStmt); - - // These statements have their own debug info handling. - if (EmitSimpleStmt(CapturedForStmt, nullptr)) - return; - - // Check if we are generating unreachable code. - if (!HaveInsertPoint()) { - if (!ContainsLabel(CapturedForStmt)) - return; - - // Otherwise, make a new block to hold the code. - EnsureInsertPoint(); - } - - // Generate a stoppoint if we are emitting debug info. - EmitStopPoint(CapturedForStmt); - - // Ignore all OpenMP directives except for simd if OpenMP with Simd is - // enabled. - if (getLangOpts().OpenMP && getLangOpts().OpenMPSimd) { - if (const auto *D = dyn_cast(CapturedForStmt)) { - EmitSimpleOMPExecutableDirective(*D); - return; - } - } - - // Call variant with Args: - EmitForStmtWithArgs(cast(*CapturedForStmt), Args); + EmitStmt(CapturedForStmt); } void CodeGenFunction::EmitXteamRedCode(const OMPExecutableDirective &D, diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index ffd361f5683d7..251176f1f5b79 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -534,40 +534,6 @@ void CodeGenFunction::GenerateOpenMPCapturedVars( } } -// This function should be called on the host when preparing to emit the -// code that launches the kernel on the device. -void CodeGenFunction::GenerateOpenMPCapturedVarsDevice( - const CapturedStmt &S, SmallVectorImpl &CapturedVars, - SmallVectorImpl &MultiTargetVars, - const Stmt *XteamRedNestKey) { - ASTContext &Ctx = getContext(); - - // If a for loop exists then it means we can use multi-target split on - // this target region. - if (CGM.getLangOpts().OpenMPTargetMultiDevice) { - assert(!CGM.getLangOpts().OpenMPIsTargetDevice && - "This should only happen on host CG"); - - // Add LB placeholder: - Address CastedLBMultiAddr = - CreateMemTemp(Ctx.getUIntPtrType(), "LB.multi.addr"); - LValue CastedLBMultiLV = - MakeAddrLValue(CastedLBMultiAddr, Ctx.getUIntPtrType()); - llvm::Value *LBValue = EmitLoadOfScalar(CastedLBMultiLV, S.getBeginLoc()); - MultiTargetVars.push_back(LBValue); - - // Add UB placeholder: - Address CastedUBMultiAddr = - CreateMemTemp(Ctx.getUIntPtrType(), "UB.multi.addr"); - LValue CastedUBMultiLV = - MakeAddrLValue(CastedUBMultiAddr, Ctx.getUIntPtrType()); - llvm::Value *UBValue = EmitLoadOfScalar(CastedUBMultiLV, S.getBeginLoc()); - MultiTargetVars.push_back(UBValue); - } - - GenerateOpenMPCapturedVars(S, CapturedVars, XteamRedNestKey); -} - static Address castValueFromUintptr(CodeGenFunction &CGF, SourceLocation Loc, QualType DstType, StringRef Name, LValue AddrLV) { @@ -640,8 +606,7 @@ static llvm::Function *emitOutlinedFunctionPrologue( llvm::DenseMap> &VLASizes, llvm::Value *&CXXThisValue, const FunctionOptions &FO, - bool argsNeedAddrSpace, bool isXteamKernel, bool AddMultiDeviceArgs, - bool AddArgsToTopKernelOnly) { + bool argsNeedAddrSpace, bool isXteamKernel) { const CapturedDecl *CD = FO.S->getCapturedDecl(); const RecordDecl *RD = FO.S->getCapturedRecordDecl(); assert(CD->hasBody() && "missing CapturedDecl body"); @@ -657,46 +622,6 @@ static llvm::Function *emitOutlinedFunctionPrologue( CD->param_begin(), std::next(CD->param_begin(), CD->getContextParamPosition())); - // Add arguments for multi-device targets if enabled and if there is a an - // iteration space associated with the directive containing the target - // directive. - unsigned ContextArgsMultiDeviceOffset = 0; - VarDecl *LBDeclVD = nullptr; - VarDecl *UBDeclVD = nullptr; - - // Determine if two extra arguments should be added. The args should always - // be added to the top kernel when in multi-device mode and on the device. - bool AddedExtraMDArgs = false; - if (AddArgsToTopKernelOnly) { - AddedExtraMDArgs = true; - } else if (AddMultiDeviceArgs) { - assert(CGM.getOptKernelKey(D) && - "Mapping key for Xteam reduction statement not found"); - const ForStmt *FStmt = CGM.getSingleForStmt(CGM.getOptKernelKey(D)); - assert(FStmt && "For statement for directive not found"); - - // If we have a valid for statement for this target region then we can - // emit a multi-device target for it. Add the two arguments that hold the - // lower and upper bound for the loop: - if (FStmt) { - AddedExtraMDArgs = true; - } - } - - if (AddedExtraMDArgs) { - QualType Int64Ty = - Ctx.getIntTypeForBitwidth(/*DestWidth=*/64, /*Signed=*/1); - LBDeclVD = ImplicitParamDecl::Create(Ctx, Int64Ty, - ImplicitParamKind::CapturedContext); - Args.emplace_back(LBDeclVD); - TargetArgs.emplace_back(LBDeclVD); - UBDeclVD = ImplicitParamDecl::Create(Ctx, Int64Ty, - ImplicitParamKind::CapturedContext); - Args.emplace_back(UBDeclVD); - TargetArgs.emplace_back(UBDeclVD); - ContextArgsMultiDeviceOffset = 2; - } - auto I = FO.S->captures().begin(); FunctionDecl *DebugFunctionDecl = nullptr; if (!FO.UIntPtrCastRequired) { @@ -877,12 +802,8 @@ static llvm::Function *emitOutlinedFunctionPrologue( FO.UIntPtrCastRequired ? FO.Loc : FO.S->getBeginLoc(), FO.UIntPtrCastRequired ? FO.Loc : CD->getBody()->getBeginLoc()); - - // When multi-device targets are enabled and applicable to this kernel then - // we need to add an offset of 2 to the regular offset since now the - // context variables start in position 3 instead of 1. The loop below will - // iterate over any variables captured from the user context. - unsigned Cnt = ContextArgsMultiDeviceOffset + CD->getContextParamPosition(); + + unsigned Cnt = CD->getContextParamPosition(); I = FO.S->captures().begin(); for (const FieldDecl *FD : RD->fields()) { // Do not map arguments if we emit function with non-original types. @@ -951,15 +872,6 @@ static llvm::Function *emitOutlinedFunctionPrologue( ++I; } - if (AddMultiDeviceArgs) { - const ForStmt *FStmt = CGM.getSingleForStmt(CGM.getOptKernelKey(D)); - if (FStmt) { - // Save these emitted arguments to use them later on if we need to emit an - // outlined function in the generic case. - CGM.saveMultiDeviceArgs(D, F, LBDeclVD, UBDeclVD); - } - } - return F; } @@ -1070,9 +982,7 @@ static llvm::Function *emitOutlinedFunctionPrologueAggregate( } llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunction( - const CapturedStmt &S, const OMPExecutableDirective &D, - bool CanHaveMultiDeviceArgs, bool IsTopKernel) { - SourceLocation Loc = D.getBeginLoc(); + const CapturedStmt &S, const OMPExecutableDirective &D, SourceLocation Loc) { assert( CapturedStmtInfo && "CapturedStmtInfo should be set when generating the captured function"); @@ -1127,11 +1037,6 @@ llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunction( !CGM.getTriple().isAMDGCN() && !isXteamKernel && (getDebugInfo() && CGM.getCodeGenOpts().hasReducedDebugInfo()); - // Determine if the kernel is multi-device. The check and set function will - // verify if the value has been set before, if it has been set then return it. - bool IsMultiDeviceKernel = - CGM.checkAndSetMultiDeviceKernel(D, CanHaveMultiDeviceArgs); - OpenMPDirectiveKind EKind = getEffectiveDirectiveKind(D); bool IsDeviceKernel = CGM.getOpenMPRuntime().isGPU() && isOpenMPTargetExecutionDirective(EKind) && @@ -1147,31 +1052,17 @@ llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunction( CapturedStmtInfo->getHelperName(), Loc, IsDeviceKernel); WrapperCGF.CapturedStmtInfo = CapturedStmtInfo; - // TODO: Determine if the wrapper function needs to pass in multi-device - // args in the meantime it is always false. WrapperF = emitOutlinedFunctionPrologue( WrapperCGF, D, Args, LocalAddrs, VLASizes, WrapperCGF.CXXThisValue, - WrapperFO, isKernel, isXteamKernel, /*AddMultiDeviceArgs*/ false, - /*AddArgsToTopKernelOnly*/ false); + WrapperFO, isKernel, isXteamKernel); Out << "_debug__"; } FunctionOptions FO(&S, !NeedWrapperFunction, /*RegisterCastedArgsOnly=*/false, Out.str(), Loc, !NeedWrapperFunction && IsDeviceKernel); - - // Add multi-device args only if this is the team level or higher. For - // outlined parallel level we should never emit multi device arguments even if - // this is deemed to be a multi device kernel. The team level, when outlined, - // will correctly pass the LB and UB values to the outlined parallel region as - // prev.UB and prev.LB arguments. - bool ShouldEmitMultiDevicePrologue = - IsMultiDeviceKernel && CanHaveMultiDeviceArgs; - bool AddArgsToTopKernelOnly = IsTopKernel && !ShouldEmitMultiDevicePrologue && - getLangOpts().OpenMPTargetMultiDevice && - getLangOpts().OpenMPIsTargetDevice; + llvm::Function *F = emitOutlinedFunctionPrologue( *this, D, WrapperArgs, WrapperLocalAddrs, WrapperVLASizes, CXXThisValue, - FO, isKernel, isXteamKernel, ShouldEmitMultiDevicePrologue, - AddArgsToTopKernelOnly); + FO, isKernel, isXteamKernel); CodeGenFunction::OMPPrivateScope LocalScope(*this); for (const auto &LocalAddrPair : WrapperLocalAddrs) { if (LocalAddrPair.second.first) { @@ -1192,18 +1083,17 @@ llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunction( EmitOptKernel( D, FStmt, llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP, Loc, - &WrapperArgs); + /*WrapperArgs=*/nullptr); else EmitOptKernel( D, FStmt, llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD_BIG_JUMP_LOOP, - Loc, &WrapperArgs); + Loc, /*WrapperArgs=*/nullptr); } else if (CGM.getLangOpts().OpenMPIsTargetDevice && isXteamKernel) { EmitOptKernel(D, FStmt, llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_XTEAM_RED, Loc, &WrapperArgs); } else { - // TODO: for multi-device targets handle this case if (!(CGM.isXteamScanKernel() && !CGM.isXteamScanPhaseOne)) // This condition prevents any codegen for the host fallback function of // the PhaseTwo kernel of Xteam Scan. @@ -1300,8 +1190,7 @@ llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunctionAggregate( /*IsDeviceKernel=*/false); F = emitOutlinedFunctionPrologue( *this, D, Args, LocalAddrs, VLASizes, CXXThisValue, FO, - /*argsNeedAddrSpace=*/false, /*isXteamKernel=*/false, - /*AddMultiDeviceArgs=*/false, /*AddArgsToTopKernelOnly=*/false); + /*argsNeedAddrSpace=*/false, /*isXteamKernel=*/false); } else { llvm::Value *ContextV = nullptr; F = emitOutlinedFunctionPrologueAggregate(*this, Args, LocalAddrs, VLASizes, @@ -2918,78 +2807,6 @@ void CodeGenFunction::EmitOMPInnerLoop( EmitBlock(LoopExit.getBlock()); } -void CodeGenFunction::EmitOMPMultiDeviceInnerLoop( - const OMPExecutableDirective &S, bool RequiresCleanup, const Expr *LoopCond, - const Expr *IncExpr, const VarDecl *IVDecl, - const llvm::function_ref BodyGen, - const llvm::function_ref PostIncGen) { - // If this is not a multi-device kernel, call the previous method. - if (!CGM.isMultiDeviceKernel(S)) - return EmitOMPInnerLoop(S, RequiresCleanup, LoopCond, IncExpr, BodyGen, - PostIncGen); - - auto LoopExit = getJumpDestInCurrentScope("omp.inner.for.end"); - - // Start the loop with a block that tests the condition. - auto CondBlock = createBasicBlock("omp.inner.for.cond"); - EmitBlock(CondBlock); - const SourceRange R = S.getSourceRange(); - - // If attributes are attached, push to the basic block with them. - const auto &OMPED = cast(S); - const CapturedStmt *ICS = OMPED.getInnermostCapturedStmt(); - const Stmt *SS = ICS->getCapturedStmt(); - const AttributedStmt *AS = dyn_cast_or_null(SS); - OMPLoopNestStack.clear(); - if (AS) - LoopStack.push(CondBlock, CGM.getContext(), CGM.getCodeGenOpts(), - AS->getAttrs(), SourceLocToDebugLoc(R.getBegin()), - SourceLocToDebugLoc(R.getEnd())); - else - LoopStack.push(CondBlock, SourceLocToDebugLoc(R.getBegin()), - SourceLocToDebugLoc(R.getEnd())); - - // If there are any cleanups between here and the loop-exit scope, - // create a block to stage a loop exit along. - llvm::BasicBlock *ExitBlock = LoopExit.getBlock(); - if (RequiresCleanup) - ExitBlock = createBasicBlock("omp.inner.for.cond.cleanup"); - - llvm::BasicBlock *LoopBody = createBasicBlock("omp.inner.for.body"); - // Emit condition bearing in mind that the condition should be compared - // against MultiDeviceUB not the original loop UB. - llvm::Value *IV = Builder.CreateLoad(GetAddrOfLocalVar(IVDecl)); - llvm::Value *IVCast = Builder.CreateIntCast(IV, Int64Ty, /*isSigned=*/true); - Address MultiDeviceUBAddr = - GetAddrOfLocalVar(CGM.getMultiDeviceUBArg(S, CurFn)); - llvm::Value *MultiDeviceUB = Builder.CreateLoad(MultiDeviceUBAddr); - llvm::Value *CmpI = Builder.CreateICmpSLE(IVCast, MultiDeviceUB); - Builder.CreateCondBr(CmpI, LoopBody, ExitBlock); - if (ExitBlock != LoopExit.getBlock()) { - EmitBlock(ExitBlock); - EmitBranchThroughCleanup(LoopExit); - } - - EmitBlock(LoopBody); - incrementProfileCounter(&S); - - // Create a block for the increment. - JumpDest Continue = getJumpDestInCurrentScope("omp.inner.for.inc"); - BreakContinueStack.push_back(BreakContinue(*SS, LoopExit, Continue)); - - BodyGen(*this); - - // Emit "IV = IV + 1" and a back-edge to the condition block. - EmitBlock(Continue.getBlock()); - EmitIgnoredExpr(IncExpr); - PostIncGen(*this); - BreakContinueStack.pop_back(); - EmitBranch(CondBlock); - LoopStack.pop(); - // Emit the fall-through block. - EmitBlock(LoopExit.getBlock()); -} - bool CodeGenFunction::EmitOMPLinearClauseInit(const OMPLoopDirective &D) { if (!HaveInsertPoint()) return false; @@ -4027,13 +3844,7 @@ void CodeGenFunction::EmitOMPDistributeOuterLoop( CGOpenMPRuntime::StaticRTInput StaticInit( IVSize, IVSigned, /* Ordered = */ false, LoopArgs.IL, LoopArgs.LB, LoopArgs.UB, LoopArgs.ST, LoopArgs.Chunk); - bool IsMultiDeviceKernel = CGM.isMultiDeviceKernel(S); - if (IsMultiDeviceKernel) - StaticInit.setMultiDeviceLBUB( - GetAddrOfLocalVar(CGM.getMultiDeviceLBArg(S, CurFn)), - GetAddrOfLocalVar(CGM.getMultiDeviceUBArg(S, CurFn))); - RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, StaticInit, - IsMultiDeviceKernel); + RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, StaticInit); // for combined 'distribute' and 'for' the increment expression of distribute // is stored in DistInc. For 'distribute' alone, it is in Inc. @@ -6839,7 +6650,6 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, bool StaticChunked = RT.isStaticChunked(ScheduleKind, /* Chunked */ Chunk != nullptr) && isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()); - bool IsMultiDeviceKernel = CGM.isMultiDeviceKernel(S); if (RT.isStaticNonchunked(ScheduleKind, /* Chunked */ Chunk != nullptr) || StaticChunked) { @@ -6847,60 +6657,15 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, IVSize, IVSigned, /* Ordered = */ false, IL.getAddress(), LB.getAddress(), UB.getAddress(), ST.getAddress(), StaticChunked ? Chunk : nullptr); - // If the current emission is part of multi-device kernel then we need - // to invoke a special method. - if (IsMultiDeviceKernel) - StaticInit.setMultiDeviceLBUB( - GetAddrOfLocalVar(CGM.getMultiDeviceLBArg(S, CurFn)), - GetAddrOfLocalVar(CGM.getMultiDeviceUBArg(S, CurFn))); RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, - StaticInit, IsMultiDeviceKernel); + StaticInit); JumpDest LoopExit = getJumpDestInCurrentScope(createBasicBlock("omp.loop.exit")); - - // For multi device kernels we have to compare against the MultiDeviceUB - // instead of the GlobalUB. - if (CGM.isMultiDeviceKernel(S)) { - // UB = min(UB, MultiDeviceUB); - // Step 1: load UB variable which was just passed and modified by the - // distribute static init runtime function. - llvm::Value *UBVal = Builder.CreateLoad(UB.getAddress()); - - // Step 2: Get the address of the Multi Device UB and load it: - Address MultiDeviceUBAddr = - GetAddrOfLocalVar(CGM.getMultiDeviceUBArg(S, CurFn)); - llvm::Value *MultiDeviceUB = Builder.CreateLoad(MultiDeviceUBAddr); - - // Step 3: Make sure the compared values have the same type: - llvm::Value *UBValCasted = - Builder.CreateIntCast(UBVal, Int64Ty, /*isSigned=*/true); - - // Step 4: Compare the values: if current UB is > MultiDeviceUB then - // ensure that we do not go beyond the MultiDeviceUB. - llvm::Value *CmpI = Builder.CreateICmpSGT(UBValCasted, MultiDeviceUB); - auto MDCheckTrue = createBasicBlock("omp.md.check.true"); - auto MDCheckEnd = createBasicBlock("omp.md.check.end"); - - // Step 5: Emit the comparison: - Builder.CreateCondBr(CmpI, MDCheckTrue, MDCheckEnd); - - // Step 6: Emit the true block which will store the upper bound. - EmitBlock(MDCheckTrue); - llvm::Value *MultiDeviceUBCasted = Builder.CreateIntCast( - MultiDeviceUB, UBVal->getType(), /*isSigned=*/true); - Builder.CreateStore(MultiDeviceUBCasted, UB.getAddress()); - EmitBranch(MDCheckEnd); - - // Step 7: emit condition end block - EmitBlock(MDCheckEnd); - } else { - // UB = min(UB, GlobalUB); - EmitIgnoredExpr( - isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) + // UB = min(UB, GlobalUB); + EmitIgnoredExpr(isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) ? S.getCombinedEnsureUpperBound() : S.getEnsureUpperBound()); - } - + // IV = LB; EmitIgnoredExpr(isOpenMPLoopBoundSharingDirective(S.getDirectiveKind()) ? S.getCombinedInit() @@ -6944,67 +6709,18 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, if (isOpenMPSimdDirective(S.getDirectiveKind())) CGF.EmitOMPSimdInit(S); }, - [&S, &LoopScope, Cond, IncExpr, IVDecl, LoopExit, &CodeGenLoop, - StaticChunked, UB](CodeGenFunction &CGF, PrePostActionTy &) { - CGF.EmitOMPMultiDeviceInnerLoop( - S, LoopScope.requiresCleanups(), Cond, IncExpr, IVDecl, + [&S, &LoopScope, Cond, IncExpr, LoopExit, &CodeGenLoop, + StaticChunked](CodeGenFunction &CGF, PrePostActionTy &) { + CGF.EmitOMPInnerLoop( + S, LoopScope.requiresCleanups(), Cond, IncExpr, [&S, LoopExit, &CodeGenLoop](CodeGenFunction &CGF) { CodeGenLoop(CGF, S, LoopExit); }, - [&S, StaticChunked, UB](CodeGenFunction &CGF) { + [&S, StaticChunked](CodeGenFunction &CGF) { if (StaticChunked) { CGF.EmitIgnoredExpr(S.getCombinedNextLowerBound()); CGF.EmitIgnoredExpr(S.getCombinedNextUpperBound()); - // TODO: emit UB = min(UB, MutliDeviceUB) - if (CGF.CGM.isMultiDeviceKernel(S)) { - // UB = min(UB, MultiDeviceUB); - // Step 1: load UB variable which was just passed and - // modified by the distribute static init runtime - // function. - llvm::Value *UBVal = - CGF.Builder.CreateLoad(UB.getAddress()); - - // Step 2: Get the address of the Multi Device UB and - // load it: - Address MultiDeviceUBAddr = CGF.GetAddrOfLocalVar( - CGF.CGM.getMultiDeviceUBArg(S, CGF.CurFn)); - llvm::Value *MultiDeviceUB = - CGF.Builder.CreateLoad(MultiDeviceUBAddr); - - // Step 3: Make sure the compared values have the same - // type: - llvm::Value *UBValCasted = CGF.Builder.CreateIntCast( - UBVal, CGF.Int64Ty, /*isSigned=*/true); - - // Step 4: Compare the values: if current UB is > - // MultiDeviceUB then ensure that we do not go beyond - // the MultiDeviceUB. - llvm::Value *CmpI = CGF.Builder.CreateICmpSGT( - UBValCasted, MultiDeviceUB); - auto MDCheckTrue = - CGF.createBasicBlock("omp.md.check.true"); - auto MDCheckEnd = - CGF.createBasicBlock("omp.md.check.end"); - - // Step 5: Emit the comparison: - CGF.Builder.CreateCondBr(CmpI, MDCheckTrue, MDCheckEnd); - - // Step 6: Emit the true block which will store the - // upper bound. - CGF.EmitBlock(MDCheckTrue); - llvm::Value *MultiDeviceUBCasted = - CGF.Builder.CreateIntCast(MultiDeviceUB, - UBVal->getType(), - /*isSigned=*/true); - CGF.Builder.CreateStore(MultiDeviceUBCasted, - UB.getAddress()); - CGF.EmitBranch(MDCheckEnd); - - // Step 7: emit condition end block - CGF.EmitBlock(MDCheckEnd); - } else { - CGF.EmitIgnoredExpr(S.getCombinedEnsureUpperBound()); - } + CGF.EmitIgnoredExpr(S.getCombinedEnsureUpperBound()); CGF.EmitIgnoredExpr(S.getCombinedInit()); } }); @@ -7015,7 +6731,6 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, } else { // Emit the outer loop, which requests its work chunk [LB..UB] from // runtime and runs the inner loop to process it. - // TODO: handle this case for Multi-Device Kernels. const OMPLoopArguments LoopArguments = { LB.getAddress(), UB.getAddress(), ST.getAddress(), IL.getAddress(), Chunk}; @@ -7079,9 +6794,7 @@ emitOutlinedOrderedFunction(CodeGenModule &CGM, const CapturedStmt *S, CodeGenFunction::CGCapturedStmtInfo CapStmtInfo; CGF.CapturedStmtInfo = &CapStmtInfo; llvm::Function *Fn = - CGF.GenerateOpenMPCapturedStmtFunction(*S, D, - /*CanHaveMultiDeviceArgs*/ false, - /*IsTopKernel*/ false); + CGF.GenerateOpenMPCapturedStmtFunction(*S, D, D.getBeginLoc()); Fn->setDoesNotRecurse(); return Fn; } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 0ba9a4e09850b..9947c2bf3dd47 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3662,12 +3662,10 @@ class CodeGenFunction : public CodeGenTypeCache { SourceLocation Loc, const FunctionArgList *Args); void EmitNoLoopCode(const OMPExecutableDirective &D, - const ForStmt *CapturedForStmt, SourceLocation Loc, - const FunctionArgList *Args); + const ForStmt *CapturedForStmt, SourceLocation Loc); void EmitBigJumpLoopCode(const OMPExecutableDirective &D, - const ForStmt *CapturedForStmt, SourceLocation Loc, - const FunctionArgList *Args); + const ForStmt *CapturedForStmt, SourceLocation Loc); void EmitXteamRedCode(const OMPExecutableDirective &D, const ForStmt *CapturedForStmt, SourceLocation Loc, @@ -3693,8 +3691,7 @@ class CodeGenFunction : public CodeGenTypeCache { /// Used in No-Loop and Xteam codegen to emit the loop iteration and the /// associated variables. Returns the loop iteration variable and its address. - std::pair EmitNoLoopIV(const OMPLoopDirective &LD, - const FunctionArgList *Args); + std::pair EmitNoLoopIV(const OMPLoopDirective &LD); /// Emit updates of the original loop indices. Used by both /// BigJumpLoop and Xteam reduction kernel codegen. @@ -3829,18 +3826,13 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S); Address GenerateCapturedStmtArgument(const CapturedStmt &S); llvm::Function *GenerateOpenMPCapturedStmtFunction( - const CapturedStmt &S, const OMPExecutableDirective &D, - bool TopLevel, bool IsTopKernel); + const CapturedStmt &S, const OMPExecutableDirective &D, SourceLocation Loc); llvm::Function * GenerateOpenMPCapturedStmtFunctionAggregate(const CapturedStmt &S, const OMPExecutableDirective &D); void GenerateOpenMPCapturedVars(const CapturedStmt &S, SmallVectorImpl &CapturedVars, const Stmt *XteamRedNestKey); - void GenerateOpenMPCapturedVarsDevice( - const CapturedStmt &S, SmallVectorImpl &CapturedVars, - SmallVectorImpl &MultiTargetVars, - const Stmt *XteamRedNestKey); void InitializeXteamRedCapturedVars(SmallVectorImpl &CapturedVars, QualType RedVarQualType); @@ -4182,22 +4174,6 @@ class CodeGenFunction : public CodeGenTypeCache { const llvm::function_ref BodyGen, const llvm::function_ref PostIncGen); - /// Emit inner loop of the worksharing/simd construct. - /// - /// \param S Directive, for which the inner loop must be emitted. - /// \param RequiresCleanup true, if directive has some associated private - /// variables. - /// \param LoopCond Bollean condition for loop continuation. - /// \param IncExpr Increment expression for loop control variable. - /// \param BodyGen Generator for the inner body of the inner loop. - /// \param PostIncGen Genrator for post-increment code (required for ordered - /// loop directvies). - void EmitOMPMultiDeviceInnerLoop( - const OMPExecutableDirective &S, bool RequiresCleanup, - const Expr *LoopCond, const Expr *IncExpr, const VarDecl *IVDecl, - const llvm::function_ref BodyGen, - const llvm::function_ref PostIncGen); - JumpDest getOMPCancelDestination(OpenMPDirectiveKind Kind); /// Emit initial code for loop counters of loop-based directives. void EmitOMPPrivateLoopCounters(const OMPLoopDirective &S, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 24148996aa636..471b95a52f091 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -9407,17 +9407,6 @@ CodeGenModule::getNoLoopForStmtStatus(const OMPExecutableDirective &D, return std::make_pair(NxSuccess, HasNestedGenericCall); } -CodeGenModule::NoLoopXteamErr -CodeGenModule::getMultiDeviceForStmtStatus(const OMPExecutableDirective &D, - const Stmt *OMPStmt) { - const ForStmt *FStmt = getSingleForStmt(OMPStmt); - if (FStmt == nullptr) - return NxNoSingleForStmt; - - assert(isa(D) && "Expected a loop directive"); - return NxSuccess; -} - int64_t CodeGenModule::getXteamRedNumTeamsFromClause( const OptKernelNestDirectives &NestDirs) { for (const auto &D : NestDirs) { @@ -9657,26 +9646,6 @@ CodeGenModule::NoLoopXteamErr CodeGenModule::getXteamRedStatusForClauses( return getNoLoopCompatibleSchedStatus(LD); } -CodeGenModule::NoLoopXteamErr CodeGenModule::getMultiDeviceStatusForClauses( - const OptKernelNestDirectives &NestDirs) { - for (auto &D : NestDirs) { - if (D->hasClausesOfKind() || - D->hasClausesOfKind() || - D->hasClausesOfKind() || - D->hasClausesOfKind() || - D->hasClausesOfKind() || - D->hasClausesOfKind()) - return NxUnsupportedTargetClause; - } - if (!isa(NestDirs.back())) - return NxNotLoopDirective; - const OMPLoopDirective &LD = cast(*NestDirs.back()); - NoLoopXteamErr NxStatus = NxSuccess; - if ((NxStatus = getNoLoopCompatibleOrderStatus(LD))) - return NxStatus; - return getNoLoopCompatibleSchedStatus(LD); -} - /// Given a directive, collect metadata for the reduction variables for Xteam /// reduction, if applicable std::pair @@ -9742,7 +9711,6 @@ CodeGenModule::collectXteamRedVars(const OptKernelNestDirectives &NestDirs) { // Either we emit Xteam code for all reduction variables or none at all. // Track whether the kernel has any min/max reduction variable. - bool isMultiDeviceCompile = getLangOpts().OpenMPTargetMultiDevice; bool isFastReductionEnabled = getLangOpts().OpenMPTargetFastReduction; for (auto &D : NestDirs) { for (const auto *C : D->getClausesOfKind()) { @@ -9813,14 +9781,6 @@ CodeGenModule::collectXteamRedVars(const OptKernelNestDirectives &NestDirs) { BinExprRhs, Ref->getType()->isUnsignedIntegerType()); OpKindsFound |= MinMaxOp; - // Multi-device compilation is not compatible with Xteam min/max, - // so disable Xteam codegen. - if (MinMaxOp != XR_OP_unknown && isMultiDeviceCompile) { - return std::make_pair( - NxMultiDeviceMinMaxNotSupported, - XteamRedCollectionInfo(VarMap, VarVec, OpKindsFound)); - } - // Fast reduction is not compatible with Xteam min/max, so // disable Xteam codegen. if (MinMaxOp != XR_OP_unknown && isFastReductionEnabled) { @@ -10158,49 +10118,6 @@ CodeGenModule::checkAndSetXteamRedKernel(const OMPExecutableDirective &D) { return NxOptionDisabledOrHasCall; } -bool CodeGenModule::checkAndSetMultiDeviceKernel( - const OMPExecutableDirective &D, bool CanBeMultiDevice) { - bool IsMultiDeviceKernel = false; - - if (!getLangOpts().OpenMPTargetMultiDevice || - !getLangOpts().OpenMPIsTargetDevice) - return IsMultiDeviceKernel; - - OptKernelNestDirectives NestDirs; - if (checkNest(D, &NestDirs) == NxSuccess && - getMultiDeviceStatusForClauses(NestDirs) == NxSuccess && - D.hasAssociatedStmt()) { - const OMPExecutableDirective &InnermostDir = *NestDirs.back(); - if (InnermostDir.hasAssociatedStmt() && - getMultiDeviceForStmtStatus( - InnermostDir, InnermostDir.getAssociatedStmt()) == NxSuccess) { - // The metadata map for all optimized kernels will have the ForStmt - // as the key. - const ForStmt *FStmt = getSingleForStmt(InnermostDir.getAssociatedStmt()); - - // Check that we are on the device and that multi device has been enabled. - if (FStmt) { - // Set the entry only if we have not set it before otherwise just return - // the outcome of the isMultiDeviceKernel check. If this is the first - // time the function is called the code below will add an entry to the - // struct to keep track of the multi kernel metadata. - if (!multiDeviceFStmtEntryExists(FStmt)) { - // Now that a multi-device kernel will be generated, set the nest map - addOptKernelNestMap(NestDirs); - - MultiDeviceFunctionBoundsMap FunctionBoundsMap; - MultiDeviceKernels.insert(std::make_pair( - FStmt, MultiDeviceKernelInfo(NestDirs, FunctionBoundsMap, - CanBeMultiDevice))); - } - IsMultiDeviceKernel = isMultiDeviceKernel(FStmt); - } - } - } - - return IsMultiDeviceKernel; -} - bool CodeGenModule::isXteamRedKernel(const OMPExecutableDirective &D) { if (!D.hasAssociatedStmt()) return false; @@ -10228,15 +10145,6 @@ bool CodeGenModule::isNoLoopKernel(const OMPExecutableDirective &D) { return isNoLoopKernel(FStmt); } -bool CodeGenModule::isMultiDeviceKernel(const OMPExecutableDirective &D) { - if (!D.hasAssociatedStmt()) - return false; - const ForStmt *FStmt = getSingleForStmt(getOptKernelKey(D)); - if (FStmt == nullptr) - return false; - return isMultiDeviceKernel(FStmt); -} - void CodeGenModule::addOptKernelNestMap( const OptKernelNestDirectives &NestDirs) { const OMPExecutableDirective &InnermostDir = *NestDirs.back(); diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index 0fe1e0ddadb4d..c44ebc370983c 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -484,32 +484,6 @@ class CodeGenModule : public CodeGenTypeCache { uint8_t OpKindsFound; }; - /// Metadata for multi-device kernel codegen - struct MultiDeviceBoundsInfo { - MultiDeviceBoundsInfo(VarDecl *LBArg, VarDecl *UBArg) - : LBArg{LBArg}, UBArg{UBArg} {} - VarDecl *LBArg; - VarDecl *UBArg; - }; - using MultiDeviceFunctionBoundsMap = - llvm::DenseMap; - - struct MultiDeviceKernelInfo { - MultiDeviceKernelInfo(OptKernelNestDirectives Dirs, - MultiDeviceFunctionBoundsMap FBM, - bool CanBeMultiDevice) - : MultiDeviceNestDirs{Dirs}, FunctionBoundsMap{FBM}, - CanBeMultiDevice{CanBeMultiDevice} {} - - OptKernelNestDirectives MultiDeviceNestDirs; - MultiDeviceFunctionBoundsMap FunctionBoundsMap; - bool CanBeMultiDevice; - bool NewBoundsHaveBeenUsed = false; - }; - /// Map construct statement to corresponding metadata for a NoLoop kernel. - using MultiDeviceKernelMap = - llvm::DenseMap; - private: ASTContext &Context; const LangOptions &LangOpts; @@ -565,7 +539,6 @@ class CodeGenModule : public CodeGenTypeCache { NoLoopKernelMap NoLoopKernels; NoLoopKernelMap BigJumpLoopKernels; XteamRedKernelMap XteamRedKernels; - MultiDeviceKernelMap MultiDeviceKernels; // A set of references that have only been seen via a weakref so far. This is // used to remove the weak of the reference if we ever see a direct reference @@ -2113,12 +2086,6 @@ class CodeGenModule : public CodeGenTypeCache { /// reduction variables are created for subsequent codegen phases to work on. NoLoopXteamErr checkAndSetXteamRedKernel(const OMPExecutableDirective &D); - /// If we are able to generate a multi-device kernel for this directive, - /// return true, otherwise return false. If successful, metadata for the - /// argument variables is created for subsequent codegen phases to work on. - bool checkAndSetMultiDeviceKernel(const OMPExecutableDirective &D, - bool CanBeMultiDevice); - /// Compute the block size to be used for a kernel. int getWorkGroupSizeSPMDHelper(const OMPExecutableDirective &D); /// Used in optimized kernel codegen, compute the block size from the nested @@ -2332,57 +2299,6 @@ class CodeGenModule : public CodeGenTypeCache { std::pair getStatusXteamSupportedPseudoObject(const PseudoObjectExpr *PO); - /// Are we generating multi-device kernel for the statement - bool multiDeviceFStmtEntryExists(const Stmt *S) { - return MultiDeviceKernels.find(S) != MultiDeviceKernels.end(); - } - bool isMultiDeviceKernel(const Stmt *S) { - if (MultiDeviceKernels.find(S) == MultiDeviceKernels.end()) - return false; - MultiDeviceKernelInfo MDInfo = MultiDeviceKernels.find(S)->second; - return MDInfo.CanBeMultiDevice; - } - bool isMultiDeviceKernel(const OMPExecutableDirective &D); - - /// Given a ForStmt for which Multi Device codegen will be done, save the - /// metadata for the LB and UB args. - void saveMultiDeviceArgs(const OMPExecutableDirective &D, - const llvm::Function *F, VarDecl *LBDecl, - VarDecl *UBDecl) { - assert(isMultiDeviceKernel(getSingleForStmt(getOptKernelKey(D))) && - "Must be a multi-device kernel"); - const ForStmt *FStmt = getSingleForStmt(getOptKernelKey(D)); - assert((MultiDeviceKernels.find(FStmt) != MultiDeviceKernels.end()) && - "FStmt not found"); - MultiDeviceKernelInfo &MDInfo = MultiDeviceKernels.find(FStmt)->second; - MDInfo.FunctionBoundsMap.insert( - std::make_pair(F, MultiDeviceBoundsInfo(LBDecl, UBDecl))); - } - - /// Retrieve the metadata for the LB arg. - MultiDeviceBoundsInfo getMultiDeviceBounds(const OMPExecutableDirective &D, - const llvm::Function *F) { - const ForStmt *FStmt = getSingleForStmt(getOptKernelKey(D)); - assert((MultiDeviceKernels.find(FStmt) != MultiDeviceKernels.end()) && - "FStmt not found"); - MultiDeviceKernelInfo MDInfo = MultiDeviceKernels.find(FStmt)->second; - assert(MDInfo.FunctionBoundsMap.find(F) != MDInfo.FunctionBoundsMap.end() && - "Function must exist"); - return MDInfo.FunctionBoundsMap.find(F)->second; - } - - /// Retrieve the metadata for the LB arg. - VarDecl *getMultiDeviceLBArg(const OMPExecutableDirective &D, - const llvm::Function *F) { - return getMultiDeviceBounds(D, F).LBArg; - } - - /// Retrieve the metadata for the LB arg. - VarDecl *getMultiDeviceUBArg(const OMPExecutableDirective &D, - const llvm::Function *F) { - return getMultiDeviceBounds(D, F).UBArg; - } - /// Move some lazily-emitted states to the NewBuilder. This is especially /// essential for the incremental parsing environment like Clang Interpreter, /// because we'll lose all important information after each repl. @@ -2755,15 +2671,6 @@ class CodeGenModule : public CodeGenTypeCache { std::pair collectXteamRedVars(const OptKernelNestDirectives &NestDirs); - /// Top level checker for multi device of the loop - NoLoopXteamErr getMultiDeviceForStmtStatus(const OMPExecutableDirective &, - const Stmt *); - - /// Are clauses on a combined OpenMP construct compatible with multi-device - /// codegen? - NoLoopXteamErr - getMultiDeviceStatusForClauses(const OptKernelNestDirectives &NestDirs); - /// Emit deactivation symbols for any PFP fields whose offset is taken with /// offsetof. void emitPFPFieldsWithEvaluatedOffset(); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index ae50512aa0c5a..ed334930901b8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6966,12 +6966,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, else CmdArgs.push_back("-fno-openmp-target-fast-reduction"); - if (Args.hasFlag(options::OPT_fopenmp_target_multi_device, - options::OPT_fno_openmp_target_multi_device, false)) - CmdArgs.push_back("-fopenmp-target-multi-device"); - else - CmdArgs.push_back("-fno-openmp-target-multi-device"); - if (Args.hasFlag(options::OPT_fopenmp_target_xteam_scan, options::OPT_fno_openmp_target_xteam_scan, false)) CmdArgs.push_back("-fopenmp-target-xteam-scan"); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 1d9fb7d100b72..abe00f742fcae 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3902,11 +3902,6 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, else GenerateArg(Consumer, OPT_fno_openmp_target_fast_reduction); - if (Opts.OpenMPTargetMultiDevice) - GenerateArg(Consumer, OPT_fopenmp_target_multi_device); - else - GenerateArg(Consumer, OPT_fno_openmp_target_multi_device); - if (Opts.OpenMPTargetXteamScan) GenerateArg(Consumer, OPT_fopenmp_target_xteam_scan); else @@ -4434,14 +4429,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Args.hasFlag(options::OPT_fopenmp_target_fast_reduction, options::OPT_fno_openmp_target_fast_reduction, false); - Opts.OpenMPTargetMultiDevice = - Args.hasFlag(options::OPT_fopenmp_target_multi_device, - options::OPT_fno_openmp_target_multi_device, false); - - // Multi-device kernels always run in fast xteam reduction mode: - if (Opts.OpenMPTargetMultiDevice) - Opts.OpenMPTargetFastReduction = true; - Opts.OpenMPTargetXteamScan = Args.hasFlag(options::OPT_fopenmp_target_xteam_scan, options::OPT_fno_openmp_target_xteam_scan, false); diff --git a/clang/test/OpenMP/multi_device_codegen.cpp b/clang/test/OpenMP/multi_device_codegen.cpp deleted file mode 100644 index 5f1e2d82c5c98..0000000000000 --- a/clang/test/OpenMP/multi_device_codegen.cpp +++ /dev/null @@ -1,1974 +0,0 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-multi-device -fopenmp-target-fast-reduction -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-target-multi-device -fopenmp-target-fast-reduction -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -// expected-no-diagnostics - -#include - -int main() -{ - int N = 100; - - double a[N], b[N]; - int bint[N]; - unsigned cint[N]; - - int8_t int8_sum = 0; - int16_t int16_sum = 0; - int32_t int32_sum = 0; - uint32_t uint32_sum = 0; - int64_t int64_sum = 0; - uint64_t uint64_sum = 0; - - for (int i=0; i T getInfo(DeviceInfo Info) const { InfoTreeNode DevInfo = RTL->obtain_device_info(RTLDeviceID); diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp index 3518420a4512c..9ec75c274b75f 100644 --- a/offload/libomptarget/PluginManager.cpp +++ b/offload/libomptarget/PluginManager.cpp @@ -382,20 +382,12 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { (getRequirements() & OMPX_REQ_EAGER_ZERO_COPY_MAPS)); } - // Add the flag for multi-device. - if (ExclusiveDevicesAccessor->size() > 0) { - auto &Device = *(*ExclusiveDevicesAccessor)[0]; - if (Device.getNumMultiDevices() > 0) - addRequirements(OMPX_REQ_MULTI_DEVICE_ENABLED); - } - ODBG(ODT_Init) << "Done registering entries!"; } // Temporary forward declaration, old style CTor/DTor handling is going away. int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, - KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo, - bool InMultiDeviceMode, bool &IsMultiDeviceKernel); + KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo); void PluginManager::unregisterLib(__tgt_bin_desc *Desc) { ODBG(ODT_Deinit) << "Unloading target library!"; diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index 1da79283d87d2..3dc89e1a3c8d7 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -493,15 +493,6 @@ void DeviceTy::zeroCopySanityChecksAndDiag(bool isUnifiedSharedMemory, isAutoZeroCopy, isEagerMaps); } -uint32_t DeviceTy::getNumMultiDevices() const { - return RTL->get_num_multi_devices(RTLDeviceID); -} - -// Check if kernel is a multi device kernel -bool DeviceTy::isMultiDeviceKernel(void *TgtEntryPtr) { - return RTL->kernel_is_multi_device(RTLDeviceID, TgtEntryPtr); -} - bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) { return RTL->is_accessible_ptr(RTLDeviceID, Ptr, Size); } diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp index 63103bb267a98..b550301cbe61f 100644 --- a/offload/libomptarget/interface.cpp +++ b/offload/libomptarget/interface.cpp @@ -393,19 +393,6 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, static_assert(std::is_convertible_v, "Target AsyncInfoTy must be convertible to AsyncInfoTy."); - // Target multiple devices if the user requests more than 1 device. The - // variable below tracks the number of EXTRA devices that are going to be - // used other than the first device. - int32_t NumMultiDevices = 0; - char *SplitFactor = getenv("LIBOMPTARGET_NUM_MULTI_DEVICES"); - if (SplitFactor) { - NumMultiDevices = atoi(SplitFactor) - 1; - - // In multi-device mode the default device is always 0. - if (DeviceId == -1) - DeviceId = 0; - } - ODBG(ODT_Interface) << "Entering target region for device " << DeviceId << " with entry point " << HostPtr; @@ -472,86 +459,14 @@ static inline int targetKernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams, /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);) int Rc = OFFLOAD_SUCCESS; - bool IsMultiDeviceKernel = false; - Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo, - /*InMultiDeviceMode*/ NumMultiDevices > 0, IsMultiDeviceKernel); - - // Check if this is a multi-device kernel. - SmallVector TargetAsyncInfos; - if (IsMultiDeviceKernel) { - // Check whether we have enough iterations for multiple devices, if we do - // not then we execute on one device. If the kernel does not have at least - // two arguments it means the loop bounds have not been passed in so we - // cannot execute on multiple devices. - if (NumMultiDevices > 0 && (KernelArgs->Tripcount < (NumMultiDevices + 1) || - KernelArgs->NumArgs < 2)) - NumMultiDevices = 0; - - // The first device used by the multi-device infrastructure: - int32_t FirstDeviceId = DeviceId + 1; - - // Launch kernel on one or across multiple devices. - for (int64_t DeviceIndex = FirstDeviceId; - DeviceIndex < FirstDeviceId + NumMultiDevices; DeviceIndex++) { - ODBG(ODT_Kernel) << "Entering target region for device " - << DeviceIndex << " with entry point " - << HostPtr; - - if (checkDevice(DeviceIndex, Loc)) { - ODBG(ODT_Kernel) << "Not offloading to device " << DeviceIndex; - return OMP_TGT_FAIL; - } - - if (getInfoLevel() & OMP_INFOTYPE_KERNEL_ARGS) - printKernelArguments(Loc, DeviceIndex, KernelArgs->NumArgs, - KernelArgs->ArgSizes, KernelArgs->ArgTypes, - KernelArgs->ArgNames, "Entering OpenMP kernel"); -#ifdef OMPTARGET_DEBUG - for (int I = 0; I < KernelArgs->NumArgs; ++I) { - ODBG(ODT_Device) - << "Entry " << I - << " Base=" << KernelArgs->ArgBasePtrs[I] - << " Begin=" << KernelArgs->ArgPtrs[I] - << " Size=" << KernelArgs->ArgSizes[I] - << " Type=0x%" << KernelArgs->ArgTypes[I] - << " Name=" << KernelArgs->ArgNames; - } -#endif - - auto DeviceOrErr = PM->getDevice(DeviceIndex); - if (!DeviceOrErr) - FATAL_MESSAGE(DeviceIndex, "%s", - toString(DeviceOrErr.takeError()).c_str()); - - TargetAsyncInfoTy *LocalTAI = new TargetAsyncInfoTy(*DeviceOrErr); - AsyncInfoTy &AsyncInfoMD = *LocalTAI; - TargetAsyncInfos.emplace_back(LocalTAI); - - // No need to check the global multi device value for this kernel. - if (target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfoMD, false, - IsMultiDeviceKernel) != OFFLOAD_SUCCESS) - Rc = OFFLOAD_FAIL; - } - } - - int PostSyncRc = Rc; - if (Rc == OFFLOAD_SUCCESS) { - PostSyncRc = AsyncInfo.synchronize(); - for (TargetAsyncInfoTy *LocalTAI : TargetAsyncInfos) { - AsyncInfoTy &AsyncInfo = *LocalTAI; - if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) - PostSyncRc = OFFLOAD_FAIL; - } - } - - // Deallocate the multi-device async infos if any were allocated. - for (TargetAsyncInfoTy *LocalTAI : TargetAsyncInfos) - delete LocalTAI; + Rc = target(Loc, *DeviceOrErr, HostPtr, *KernelArgs, AsyncInfo); + + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); - handleTargetOutcome(PostSyncRc == OFFLOAD_SUCCESS, Loc); - assert(PostSyncRc == OFFLOAD_SUCCESS && "offload failed"); - assert(PostSyncRc == OFFLOAD_SUCCESS && - "__tgt_target_kernel unexpected failure!"); + handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); + assert(Rc == OFFLOAD_SUCCESS && "offload failed"); + assert(Rc == OFFLOAD_SUCCESS && "__tgt_target_kernel unexpected failure!"); return OMP_TGT_SUCCESS; } diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 41784874266dd..1098f3e20c529 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -2267,8 +2267,7 @@ static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, /// returns 0 if it was able to transfer the execution to a target and an /// integer different from zero otherwise. int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, - KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo, - bool InMultiDeviceMode, bool &IsMultiDeviceKernel) { + KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) { int32_t DeviceId = Device.DeviceID; TableMap *TM = getTableMap(HostPtr); // No map for this host pointer found! @@ -2360,17 +2359,9 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, /*EventType=*/ompt_callback_target_submit, DeviceId, NumTeams); #endif Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(), - KernelArgs, nullptr, AsyncInfo); - - // If we are in multidevice mode the check the value of the global variable - // for this kernel to see if the kernel is indeed a multi device kernel. - if (InMultiDeviceMode) - IsMultiDeviceKernel = Device.isMultiDeviceKernel(TgtEntryPtr); + KernelArgs, AsyncInfo); } - // Reset number of arguments just in case the kernel launch changed it. - KernelArgs.NumArgs = NumClangLaunchArgs; - if (Ret != OFFLOAD_SUCCESS) { REPORT() << "Executing target region abort target."; return OFFLOAD_FAIL; diff --git a/offload/libomptarget/private.h b/offload/libomptarget/private.h index 76b98450f87e4..e16178f9bb621 100644 --- a/offload/libomptarget/private.h +++ b/offload/libomptarget/private.h @@ -24,8 +24,7 @@ #include extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, - KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo, - bool InMultiDeviceMode, bool &IsMultiDeviceKernel); + KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo); extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *ReqAddr, bool IsRecord, bool SaveOutput, diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index bdf5ec4359bee..05e687b95c416 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -818,14 +818,11 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// Print more elaborate kernel launch info for AMDGPU Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], - uint32_t NumBlocks[3], int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const override; + uint32_t NumBlocks[3]) const override; /// Print the "old" AMD KernelTrace single-line format void printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, - uint32_t NumThreads[3], uint32_t NumBlocks[3], - int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const; + uint32_t NumThreads[3], uint32_t NumBlocks[3]) const; /// Get group and private segment kernel size. uint32_t getGroupSize() const { return GroupSize; } @@ -3559,10 +3556,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; ODBG(ODT_Tool) << "The number of XGMI Engines: " << NumXGmiEngines; - // Detect if we are in Multi-Device mode - if (OMPX_NumMultiDevices > 0) - IsMultiDeviceEnabled = true; - // Detect if XNACK is enabled SmallVector> Targets; if (auto Err = hsa_utils::getTargetTripleAndFeatures(Agent, Targets)) @@ -5369,9 +5362,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // If set, TARGET_ALLOC_SHARED is allocated on coarse grain memory on MI200 bool EnableGFX90ACoarseGrainSharedAlloc = false; - /// True if in multi-device mode. - bool IsMultiDeviceEnabled = false; - /// Arguments for device memory initialization. void *DMHeapPtr = nullptr; void *DMSlabPtr = nullptr; @@ -6200,9 +6190,7 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], - uint32_t NumBlocks[3], - int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const { + uint32_t NumBlocks[3]) const { auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; auto SGPRCount = (*KernelInfo).SGPRCount; auto VGPRCount = (*KernelInfo).VGPRCount; @@ -6223,14 +6211,13 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice, "reqd:(%4dX%4d) lds_usage:%uB scratch:%uB sgpr_count:%u vgpr_count:%u " "agpr_count:%u " "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d " - "md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: " + "Max Occupancy: %u Achieved Occupancy: " "%d%% n:%s\n", GenericDevice.getDeviceId(), LaunchId, getExecutionModeFlags(), ConstWGSize, KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0, GroupSegmentSize, getPrivateSize(), SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount, VGPRSpillCount, KernelArgs.Tripcount, HasRPC, - isMultiDeviceKernel(), MultiDeviceLB, MultiDeviceUB, MaxOccupancy, - AchievedOccupancy, getName()); + MaxOccupancy, AchievedOccupancy, getName()); } else { // This line should print exactly as the one in the old plugin. @@ -6240,13 +6227,12 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice, "reqd:(%4dX%4d) lds_usage:%uB scratch:%uB sgpr_count:%u vgpr_count:%u " "agpr_count:%u " "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d " - "md:%d md_LB:%ld md_UB:%ld Max Occupancy: %u Achieved Occupancy: " + "Max Occupancy: %u Achieved Occupancy: " "%d%% n:%s\n", GenericDevice.getDeviceId(), getExecutionModeFlags(), ConstWGSize, KernelArgs.NumArgs, NumBlocks[0], NumThreads[0], 0, 0, GroupSegmentSize, getPrivateSize(), SGPRCount, VGPRCount, AGPRCount, SGPRSpillCount, - VGPRSpillCount, KernelArgs.Tripcount, HasRPC, isMultiDeviceKernel(), - MultiDeviceLB, MultiDeviceUB, MaxOccupancy, AchievedOccupancy, + VGPRSpillCount, KernelArgs.Tripcount, HasRPC, MaxOccupancy, AchievedOccupancy, getName()); } } @@ -6254,16 +6240,13 @@ void AMDGPUKernelTy::printAMDOneLineKernelTrace(GenericDeviceTy &GenericDevice, Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], - uint32_t NumBlocks[3], - int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const { + uint32_t NumBlocks[3]) const { // When LIBOMPTARGET_KERNEL_TRACE is set, print the single-line kernel trace // info present in the old ASO plugin, and continue with the upstream 2-line // info, should LIBOMPTARGET_INFO be a meaningful value, otherwise return. if ((getInfoLevel() & OMP_INFOTYPE_AMD_KERNEL_TRACE) || GenericDevice.enableKernelDurationTracing()) - printAMDOneLineKernelTrace(GenericDevice, KernelArgs, NumThreads, NumBlocks, - MultiDeviceLB, MultiDeviceUB); + printAMDOneLineKernelTrace(GenericDevice, KernelArgs, NumThreads, NumBlocks); // Only do all this when the output is requested if (!(getInfoLevel() & OMP_INFOTYPE_PLUGIN_KERNEL)) diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 6b45cecc1e93b..eea14552eb6ed 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -481,9 +481,6 @@ struct GenericKernelTy { return false; } - /// Check if kernel is a multi-device kernel. - bool isMultiDeviceKernel() const { return IsMultiDeviceKernel; } - /// Compute kernel occupancy /// This function computes the max(upperbound) occupancy for a lanuched kernel /// based on the given hardware resources e.g. the number of registers, size @@ -556,17 +553,14 @@ struct GenericKernelTy { /// Prints generic kernel launch information. Error printLaunchInfo(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], - uint32_t NumBlocks[3], int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const; + uint32_t NumBlocks[3]) const; /// Prints plugin-specific kernel launch information after generic kernel /// launch information virtual Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], - uint32_t NumBlocks[3], - int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const; + uint32_t NumBlocks[3]) const; private: /// Prepare the block memory buffer requested for the kernel and execute the @@ -611,9 +605,6 @@ struct GenericKernelTy { /// The execution flags of the kernel. OMPTgtExecModeFlags ExecutionMode; - /// The multi-device kernel flag. - bool IsMultiDeviceKernel; - /// The image that contains this kernel. DeviceImageTy *ImagePtr = nullptr; @@ -1336,12 +1327,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { return Error::success(); } - uint32_t getNumMultiDevices() const { return OMPX_NumMultiDevices; } - bool enableRuntimeAutotuning() const { return OMPX_EnableRuntimeAutotuning; } - bool getMultiDeviceKernelValue(void *EntryPtr); - KernelRunRecordTy *getKernelRunRecords() const { return KernelRunRecords; } /// Return true if a descriptor of size 'Size' should be allocated using @@ -1529,9 +1516,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { UInt32Envar OMPX_InitialNumStreams; UInt32Envar OMPX_InitialNumEvents; - /// Specify the number of devices used by multi-device kernels. - UInt32Envar OMPX_NumMultiDevices; - /// Envar to enable runtime tuning. BoolEnvar OMPX_EnableRuntimeAutotuning; @@ -2074,12 +2058,6 @@ struct GenericPluginTy { bool isAutoZeroCopy, bool isEagerMaps); - /// Return number of devices used by multi-device kernels. - int32_t get_num_multi_devices(int32_t DeviceId); - - /// Check if kernel is multi-device. - bool kernel_is_multi_device(int32_t DeviceId, void *TgtEntryPtr); - /// Return true if a descriptor of size 'Size' should be allocated using /// shared memory. bool use_shared_mem_for_descriptor(int32_t DeviceId, int64_t Size); diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 6badd80819a22..799ddd45f6cf4 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -127,19 +127,6 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice, ExecutionMode = ExecModeGlobal.getValue(); } - // Create a metadata object for the multi-device global (auto-generated). - StaticGlobalTy MultiDeviceGlobal(getName(), "_multi_device"); - if (auto Err = GHandler.readGlobalFromImage(GenericDevice, Image, - MultiDeviceGlobal)) { - ODBG(ODT_Init) << "Missing symbol " - << MultiDeviceGlobal.getName().data() - << " continue execution anyway."; - consumeError(std::move(Err)); - IsMultiDeviceKernel = false; - } else { - IsMultiDeviceKernel = MultiDeviceGlobal.getValue(); - } - // Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max; MaxNumThreads = KernelEnvironment.Configuration.MaxThreads > 0 ? std::min(KernelEnvironment.Configuration.MaxThreads, @@ -249,25 +236,20 @@ GenericKernelTy::getKernelLaunchEnvironment( Error GenericKernelTy::printLaunchInfo(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], - uint32_t NumBlocks[3], - int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const { + uint32_t NumBlocks[3]) const { INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), "Launching kernel %s with [%u,%u,%u] blocks and [%u,%u,%u] threads in " "%s mode\n", getName(), NumBlocks[0], NumBlocks[1], NumBlocks[2], NumThreads[0], - NumThreads[1], NumThreads[2], getExecutionModeName(), - isMultiDeviceKernel() ? " in multi-device mode" : ""); + NumThreads[1], NumThreads[2], getExecutionModeName()); return printLaunchInfoDetails(GenericDevice, KernelArgs, NumThreads, - NumBlocks, MultiDeviceLB, MultiDeviceUB); + NumBlocks); } Error GenericKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, KernelArgsTy &KernelArgs, uint32_t NumThreads[3], - uint32_t NumBlocks[3], - int64_t MultiDeviceLB, - int64_t MultiDeviceUB) const { + uint32_t NumBlocks[3]) const { return Plugin::success(); } @@ -345,45 +327,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, if (!KernelLaunchEnvOrErr) return KernelLaunchEnvOrErr.takeError(); - // If the multi-device mode is not enabled for this kernel then there is no - // need to overwrite any arguments. - int32_t NumMultiDevices = GenericDevice.getNumMultiDevices(); - int64_t MultiDeviceLB = -1; - int64_t MultiDeviceUB = -1; - if (isMultiDeviceKernel() && NumMultiDevices > 0) { - // Compute the chunk size based on how many devices we are targeting and - // the length of the loop trip count. - int32_t DeviceId = GenericDevice.getDeviceId(); - if (KernelArgs.Tripcount < NumMultiDevices) { - ArgPtrs[0] = (void *)0; - ArgPtrs[1] = (void *)(KernelArgs.Tripcount - 1); - } else { - int64_t Chunk = (int64_t)KernelArgs.Tripcount / NumMultiDevices; - - // Set the lower bound. Consider the case where the LB of the loop is not - // zero. - ArgPtrs[0] = (void *)(DeviceId * Chunk); - - // Set the upper bound. If this is the last device then leave the upper - // limit unchanged because it is already set to the loop UB. - // TODO: support case where the first device is not device 0. - if (DeviceId < NumMultiDevices - 1) - ArgPtrs[1] = (void *)(((DeviceId + 1) * Chunk) - 1); - else if (DeviceId == NumMultiDevices - 1) - ArgPtrs[1] = (void *)(KernelArgs.Tripcount - 1); - else - assert(false && "Upper bound could not be set"); - } - - MultiDeviceLB = (int64_t)ArgPtrs[0]; - MultiDeviceUB = (int64_t)ArgPtrs[1]; - } - KernelLaunchParamsTy LaunchParams; // Kernel languages don't use indirection. if (KernelArgs.Flags.IsCUDA) { - assert(!isMultiDeviceKernel() && "Multi-device not supported"); LaunchParams = *reinterpret_cast(KernelArgs.ArgPtrs); } else { @@ -440,8 +387,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs, // Get achieved occupancy for this kernel. computeAchievedOccupancy(GenericDevice, NumThreads[0], NumBlocks[0]); - if (auto Err = printLaunchInfo(GenericDevice, KernelArgs, NumThreads, - NumBlocks, MultiDeviceLB, MultiDeviceUB)) + if (auto Err = printLaunchInfo(GenericDevice, KernelArgs, NumThreads, NumBlocks)) return Err; if (GenericDevice.Plugin.getProfiler()) @@ -614,7 +560,6 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, // By default, the initial number of streams and events is 1. OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1), OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1), - OMPX_NumMultiDevices("LIBOMPTARGET_NUM_MULTI_DEVICES", 0), OMPX_EnableRuntimeAutotuning("OMPX_ENABLE_RUNTIME_AUTOTUNING", false), OMPX_KernelDurationTracing("LIBOMPTARGET_KERNEL_EXE_TIME", false), DeviceId(DeviceId), GridValues(OMPGridValues), @@ -1441,13 +1386,6 @@ Error GenericDeviceTy::zeroCopySanityChecksAndDiag(bool isUnifiedSharedMemory, isEagerMaps); } -bool GenericDeviceTy::getMultiDeviceKernelValue(void *EntryPtr) { - GenericKernelTy &GenericKernel = - *reinterpret_cast(EntryPtr); - - return GenericKernel.isMultiDeviceKernel(); -} - bool GenericDeviceTy::useSharedMemForDescriptor(int64_t Size) { return false; } void *GenericDeviceTy::getFree_ArgBuf(size_t sz) { @@ -2434,23 +2372,6 @@ int32_t GenericPluginTy::zero_copy_sanity_checks_and_diag( return R; } -int32_t GenericPluginTy::get_num_multi_devices(int32_t DeviceId) { - auto T = logger::log(__func__); - auto R = [&]() { return getDevice(DeviceId).getNumMultiDevices(); }(); - T.res(R); - return R; -} - -bool GenericPluginTy::kernel_is_multi_device(int32_t DeviceId, - void *TgtEntryPtr) { - auto T = logger::log(__func__, DeviceId, TgtEntryPtr); - auto R = [&]() { - return getDevice(DeviceId).getMultiDeviceKernelValue(TgtEntryPtr); - }(); - T.res(R); - return R; -} - bool GenericPluginTy::use_shared_mem_for_descriptor(int32_t DeviceId, int64_t Size) { auto T = logger::log(__func__, DeviceId); diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg index 5e98e63a58439..c7277eea1bf96 100644 --- a/offload/test/lit.cfg +++ b/offload/test/lit.cfg @@ -190,9 +190,6 @@ elif config.libomptarget_current_target.startswith('amdgcn'): is_mi200 = True if supports_unified_shared_memory: config.available_features.add('unified_shared_memory') - arch_list = config.amdgpu_test_arch.split(";") - if len(arch_list) > 1 and arch_list[0] == arch_list[1]: - config.available_features.add('multi_device') if is_mi200: config.available_features.add('mi200') diff --git a/offload/test/multi_device/collapse-clause.cpp b/offload/test/multi_device/collapse-clause.cpp deleted file mode 100644 index 9882097718bc2..0000000000000 --- a/offload/test/multi_device/collapse-clause.cpp +++ /dev/null @@ -1,66 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 2000 -#define M 5 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N * M + 1)); - - // Init a: - for (int i = 0; i < N * M + 1; i++) { - a[i] = 0.0; - } - -#pragma omp target teams distribute parallel for collapse(2) - for (int i = 0; i < N; i++) { - for (int j = 0; j < M; j++) { - a[i * M + j] += 1; - } - } - - // clang-format off - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:0 md_UB:4999 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:5000 md_UB:9999 - // clang-format on - - // Checking the results are correct: - bool error = false; - for (int i = 0; i < N * M; i++) { - if (!(a[i] == 1)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N * M] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/loop-with-fast-reduction.cpp b/offload/test/multi_device/loop-with-fast-reduction.cpp deleted file mode 100644 index 98d997d994cd1..0000000000000 --- a/offload/test/multi_device/loop-with-fast-reduction.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -fopenmp-target-fast -fopenmp-target-fast-reduction -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - // Using "<" - double sum = 7; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i < N; i++) { - a[i] += 1; - sum += a[i]; - } - - int UB = N; - sum += 7 * 3; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i < UB; i++) { - a[i] += 2; - sum += a[i]; - } - - int LB = 7; - sum += 7 * 6; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = LB; i < UB; i++) { - a[i] += 3; - sum += a[i]; - } - - // Using "<=" - sum += 7 * 7; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i <= N - 1; i++) { - a[i] += 1; - sum += a[i]; - } - - sum += 7 * 9; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i <= UB - 1; i++) { - a[i] += 2; - sum += a[i]; - } - - sum += 7 * 12; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = LB; i <= UB - 1; i++) { - a[i] += 3; - sum += a[i]; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - // clang-format on - - // CHECK: a[40] = 12 - int index = 40; - printf("a[%d] = %f\n", index, a[index]); - - // CHECK: SUM = 380000 - printf("SUM = %f\n", sum); - - // Checking the results are correct: - bool error = false; - for (int i = 7; i < N; i++) { - if (!(a[i] == 12)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - error = false; - for (int i = 0; i < 7; i++) { - if (!(a[i] == 0)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS: first 7 entries - if (!error) - printf("SUCCESS: first 7 entries\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/loop-with-regular-reduction-and-privates.cpp b/offload/test/multi_device/loop-with-regular-reduction-and-privates.cpp deleted file mode 100644 index c971570bd6141..0000000000000 --- a/offload/test/multi_device/loop-with-regular-reduction-and-privates.cpp +++ /dev/null @@ -1,86 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#pragma omp declare target -int foo(int p) { return p * 2; } -#pragma omp end declare target - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - double p = 12.0; - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - p += a[50]; - - // Using "<" - double sum = 0.0; - double sum2 = 0.0; -#pragma omp target teams distribute parallel for reduction(+ : sum, sum2) \ - map(tofrom : sum, sum2) - for (int i = 0; i < N; i++) { - a[i] += p + foo(p); - sum += a[i]; - sum2 += a[i] + 1; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:2 {{.*}} tripcount:10000 rpc:1 md:1 md_LB:0 md_UB:4999 - // CHECK: DEVID: 1 SGN:2 {{.*}} tripcount:10000 rpc:1 md:1 md_LB:5000 md_UB:9999 - // clang-format on - - // CHECK: a[40] = 36 - int index = 40; - printf("a[%d] = %f\n", index, a[index]); - - // CHECK: SUM = 360000 - printf("SUM = %f\n", sum); - - // CHECK: SUM2 = 370000 - printf("SUM2 = %f\n", sum2); - - // Checking the results are correct: - bool error = false; - for (int i = 0; i < N; i++) { - if (!(a[i] == 36)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/loop-with-regular-reduction.cpp b/offload/test/multi_device/loop-with-regular-reduction.cpp deleted file mode 100644 index ecbaaf7096776..0000000000000 --- a/offload/test/multi_device/loop-with-regular-reduction.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - // Using "<" - double sum = 7; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i < N; i++) { - a[i] += 1; - sum += a[i]; - } - - int UB = N; - sum += 7 * 3; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i < UB; i++) { - a[i] += 2; - sum += a[i]; - } - - int LB = 7; - sum += 7 * 6; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = LB; i < UB; i++) { - a[i] += 3; - sum += a[i]; - } - - // Using "<=" - sum += 7 * 7; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i <= N - 1; i++) { - a[i] += 1; - sum += a[i]; - } - - sum += 7 * 9; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 7; i <= UB - 1; i++) { - a[i] += 2; - sum += a[i]; - } - - sum += 7 * 12; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = LB; i <= UB - 1; i++) { - a[i] += 3; - sum += a[i]; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - // clang-format on - - // CHECK: a[40] = 12 - int index = 40; - printf("a[%d] = %f\n", index, a[index]); - - // CHECK: SUM = 380000 - printf("SUM = %f\n", sum); - - // Checking the results are correct: - bool error = false; - for (int i = 7; i < N; i++) { - if (!(a[i] == 12)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - error = false; - for (int i = 0; i < 7; i++) { - if (!(a[i] == 0)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS: first 7 entries - if (!error) - printf("SUCCESS: first 7 entries\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/loop-with-xteam-reduction-and-privates.cpp b/offload/test/multi_device/loop-with-xteam-reduction-and-privates.cpp deleted file mode 100644 index 643cf20cc4b41..0000000000000 --- a/offload/test/multi_device/loop-with-xteam-reduction-and-privates.cpp +++ /dev/null @@ -1,86 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -fopenmp-target-fast -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#pragma omp declare target -int foo(int p) { return p * 2; } -#pragma omp end declare target - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 2)); - double p = 12.0; - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - p += a[50]; - - // Using "<" - double sum = 0.0; - double sum2 = 0.0; -#pragma omp target teams distribute parallel for reduction(+ : sum, sum2) \ - map(tofrom : sum, sum2) - for (int i = 1; i <= N; i++) { - a[i] += p + foo(p); - sum += a[i]; - sum2 += a[i] + 1; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:0 md_UB:4999 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:5000 md_UB:9999 - // clang-format on - - // CHECK: a[40] = 36 - int index = 40; - printf("a[%d] = %f\n", index, a[index]); - - // CHECK: SUM = 360000 - printf("SUM = %f\n", sum); - - // CHECK: SUM2 = 370000 - printf("SUM2 = %f\n", sum2); - - // Checking the results are correct: - bool error = false; - for (int i = 1; i <= N; i++) { - if (!(a[i] == 36)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N + 1] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/mixed-multi-device.cpp b/offload/test/multi_device/mixed-multi-device.cpp deleted file mode 100644 index e59ca276b2e4d..0000000000000 --- a/offload/test/multi_device/mixed-multi-device.cpp +++ /dev/null @@ -1,80 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a [0, 0, 0] - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Should not be executed in multi-device mode: -#pragma omp target - { -#pragma omp parallel for - for (int i = 1; i < N; i++) { - a[i] += 1; - } - } - -#pragma omp target teams distribute parallel for - for (int i = 1; i < N; i++) { - a[i] += 2; - } - -#pragma omp target - { - for (int i = 1; i < N; i++) { - a[i] += 3; - } - } - - // clang-format off - // CHECK: DEVID: 0 SGN:2 {{.*}} tripcount:0 rpc:1 md:0 - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9999 rpc:1 md:1 md_LB:0 md_UB:4998 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9999 rpc:1 md:1 md_LB:4999 md_UB:9998 - // CHECK: DEVID: 0 SGN:1 {{.*}} tripcount:0 rpc:1 md:0 - // clang-format on - - // CHECK: a[0] = 0 - // CHECK: a[1] = 6 - // CHECK: a[9999] = 6 - // CHECK: a[10000] = 0 - printf("a[0] = %f\n", a[0]); - printf("a[1] = %f\n", a[1]); - printf("a[9999] = %f\n", a[9999]); - printf("a[10000] = %f\n", a[10000]); - - bool error = false; - for (int i = 1; i < N; i++) { - if (a[i] != 6) { - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/nested-reductions.cpp b/offload/test/multi_device/nested-reductions.cpp deleted file mode 100644 index f507798c20aee..0000000000000 --- a/offload/test/multi_device/nested-reductions.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -fopenmp-target-fast -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define M 10 -#define N 15000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N * M + 1)); - - // Init a: - for (int i = 0; i < M; i++) { - for (int k = 0; k < N; k++) { - a[k * M + i] = i; - } - } - - double final_sum = 0.0; -#pragma omp target teams distribute reduction(+ : final_sum) - for (int i = 0; i < M; i++) { - double sum_qi = 0.0; -#pragma omp parallel for simd reduction(+ : sum_qi) - for (int k = 0; k < N; k++) - sum_qi = sum_qi + a[k * M + i]; - final_sum += sum_qi; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:3 {{.*}} tripcount:10 rpc:0 md:0 - // clang-format on - - // CHECK: final_sum = 675000 - printf("final_sum = %f\n", final_sum); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/no-copy-globals-no-force-usm.cpp b/offload/test/multi_device/no-copy-globals-no-force-usm.cpp deleted file mode 100644 index a88e678e055b0..0000000000000 --- a/offload/test/multi_device/no-copy-globals-no-force-usm.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -#pragma omp declare target -double rho[N]; -#pragma omp end declare target - -int main() { - // Init RHO: - for (int i = 0; i < N; i++) - rho[i] = 1.0; - -// clang-format off -// CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:0 md_UB:4999 -// CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:5000 md_UB:9999 -// clang-format on -#pragma omp target teams distribute parallel for - for (int i = 0; i < N; i++) { - rho[i] += 2.0; - } - - // CHECK: rho[10] = 3.000000 rho[9000] = 3.000000 - printf("rho[10] = %f rho[9000] = %f\n", rho[10], rho[9000]); - - bool error = false; - for (int i = 0; i < N; i++) { - if (rho[i] != 3) { - printf("ERROR: rho[%d] = %f\n", i, rho[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS! - if (!error) - printf("SUCCESS!\n"); - - return 0; -} diff --git a/offload/test/multi_device/no-copy-globals-target-fast-no-force-usm.cpp b/offload/test/multi_device/no-copy-globals-target-fast-no-force-usm.cpp deleted file mode 100644 index 6be8c14070497..0000000000000 --- a/offload/test/multi_device/no-copy-globals-target-fast-no-force-usm.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -fopenmp-target-fast -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -#pragma omp declare target -double rho[N]; -#pragma omp end declare target - -int main() { - // Init RHO: - for (int i = 0; i < N; i++) - rho[i] = 1.0; - -// clang-format off -// CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:0 md_UB:4999 -// CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:5000 md_UB:9999 -// clang-format on -#pragma omp target teams distribute parallel for - for (int i = 0; i < N; i++) { - rho[i] += 2.0; - } - - // CHECK: rho[10] = 3.000000 rho[9000] = 3.000000 - printf("rho[10] = %f rho[9000] = %f\n", rho[10], rho[9000]); - - bool error = false; - for (int i = 0; i < N; i++) { - if (rho[i] != 3) { - printf("ERROR: rho[%d] = %f\n", i, rho[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS! - if (!error) - printf("SUCCESS!\n"); - - return 0; -} diff --git a/offload/test/multi_device/no-copy-globals-with-target-fast.cpp b/offload/test/multi_device/no-copy-globals-with-target-fast.cpp deleted file mode 100644 index 5897a75095f03..0000000000000 --- a/offload/test/multi_device/no-copy-globals-with-target-fast.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -fopenmp-target-fast -fopenmp-force-usm -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -#pragma omp declare target -double rho[N]; -#pragma omp end declare target - -int main() { - // Init RHO: - for (int i = 0; i < N; i++) - rho[i] = 1.0; - - // clang-format off -// CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:0 md_UB:4999 -// CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:5000 md_UB:9999 - // clang-format on - -#pragma omp target teams distribute parallel for - for (int i = 0; i < N; i++) { - rho[i] += 2.0; - } - - // CHECK: rho[10] = 3.000000 rho[9000] = 3.000000 - printf("rho[10] = %f rho[9000] = %f\n", rho[10], rho[9000]); - - bool error = false; - for (int i = 0; i < N; i++) { - if (rho[i] != 3) { - printf("ERROR: rho[%d] = %f\n", i, rho[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS! - if (!error) - printf("SUCCESS!\n"); - - return 0; -} diff --git a/offload/test/multi_device/no-copy-globals.cpp b/offload/test/multi_device/no-copy-globals.cpp deleted file mode 100644 index 96b15888b25c1..0000000000000 --- a/offload/test/multi_device/no-copy-globals.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -fopenmp-force-usm -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -#pragma omp declare target -double rho[N]; -#pragma omp end declare target - -int main() { - // Init RHO: - for (int i = 0; i < N; i++) - rho[i] = 1.0; - - // clang-format off -// CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:0 md_UB:4999 -// CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:10000 rpc:0 md:1 md_LB:5000 md_UB:9999 - // clang-format on - -#pragma omp target teams distribute parallel for - for (int i = 0; i < N; i++) { - rho[i] += 2.0; - } - - // CHECK: rho[10] = 3.000000 rho[9000] = 3.000000 - printf("rho[10] = %f rho[9000] = %f\n", rho[10], rho[9000]); - - bool error = false; - for (int i = 0; i < N; i++) { - if (rho[i] != 3) { - printf("ERROR: rho[%d] = %f\n", i, rho[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS! - if (!error) - printf("SUCCESS!\n"); - - return 0; -} diff --git a/offload/test/multi_device/non-unit-stride.cpp b/offload/test/multi_device/non-unit-stride.cpp deleted file mode 100644 index a60e9954507dd..0000000000000 --- a/offload/test/multi_device/non-unit-stride.cpp +++ /dev/null @@ -1,77 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - int stride = 17; -#pragma omp target teams distribute parallel for - for (int i = 7; i < N; i += stride) { - a[i] += 1; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:588 rpc:0 md:1 md_LB:0 md_UB:293 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:588 rpc:0 md:1 md_LB:294 md_UB:587 - // clang-format on - - // Checking the results are correct: - bool error = false; - for (int i = 7; i < N; i += stride) { - if (!(a[i] == 1)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - error = false; - for (int i = 0; i < 7; i++) { - if (!(a[i] == 0)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS: first 7 entries - if (!error) - printf("SUCCESS: first 7 entries\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/not-multi-device-small-tripcount.cpp b/offload/test/multi_device/not-multi-device-small-tripcount.cpp deleted file mode 100644 index aa0cef8b2f70b..0000000000000 --- a/offload/test/multi_device/not-multi-device-small-tripcount.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 2 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a [0, 0, 0] - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Should not be executed in multi-device mode: -#pragma omp target - { -#pragma omp parallel for - for (int i = 1; i < N; i++) { - a[i] += 1; - } - } - - // clang-format off - // CHECK: DEVID: 0 SGN:2 {{.*}} tripcount:0 rpc:1 md:0 - // clang-format on - - // CHECK: a[0] = 0 - // CHECK: a[1] = 1 - // CHECK: a[2] = 0 - printf("a[0] = %f\n", a[0]); - printf("a[1] = %f\n", a[1]); - printf("a[2] = %f\n", a[2]); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/not-multi-device.cpp b/offload/test/multi_device/not-multi-device.cpp deleted file mode 100644 index 7b12f9430ee1d..0000000000000 --- a/offload/test/multi_device/not-multi-device.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a [0, 0, 0] - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Should not be executed in multi-device mode: -#pragma omp target - { -#pragma omp parallel for - for (int i = 1; i < N; i++) { - a[i] += 1; - } - } - - // clang-format off - // CHECK: DEVID: 0 SGN:2 {{.*}} tripcount:0 rpc:1 md:0 - // clang-format on - - // CHECK: a[0] = 0 - // CHECK: a[1] = 1 - // CHECK: a[9999] = 1 - // CHECK: a[10000] = 0 - printf("a[0] = %f\n", a[0]); - printf("a[1] = %f\n", a[1]); - printf("a[9999] = %f\n", a[9999]); - printf("a[10000] = %f\n", a[10000]); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/simple-loop-big-jump.cpp b/offload/test/multi_device/simple-loop-big-jump.cpp deleted file mode 100644 index 6402ad05cbe08..0000000000000 --- a/offload/test/multi_device/simple-loop-big-jump.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Using "<" -#pragma omp target teams distribute parallel for - for (int i = 7; i < N; i++) { - a[i] += 1; - } - - int UB = N; -#pragma omp target teams distribute parallel for - for (int i = 7; i < UB; i++) { - a[i] += 2; - } - - int LB = 7; -#pragma omp target teams distribute parallel for - for (int i = LB; i < UB; i++) { - a[i] += 3; - } - -// Using "<=" -#pragma omp target teams distribute parallel for - for (int i = 7; i <= N - 1; i++) { - a[i] += 1; - } - -#pragma omp target teams distribute parallel for - for (int i = 7; i <= UB - 1; i++) { - a[i] += 2; - } - -#pragma omp target teams distribute parallel for - for (int i = LB; i <= UB - 1; i++) { - a[i] += 3; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - // clang-format on - - // CHECK: a[40] = 12 - int index = 40; - printf("a[%d] = %f\n", index, a[index]); - - // Checking the results are correct: - bool error = false; - for (int i = 7; i < N; i++) { - if (!(a[i] == 12)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - error = false; - for (int i = 0; i < 7; i++) { - if (!(a[i] == 0)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS: first 7 entries - if (!error) - printf("SUCCESS: first 7 entries\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/simple-loop-no-loop.cpp b/offload/test/multi_device/simple-loop-no-loop.cpp deleted file mode 100644 index 14ff6b448f1e2..0000000000000 --- a/offload/test/multi_device/simple-loop-no-loop.cpp +++ /dev/null @@ -1,124 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -fopenmp-target-fast -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Using "<" -#pragma omp target teams distribute parallel for - for (int i = 7; i < N; i++) { - a[i] += 1; - } - - int UB = N; -#pragma omp target teams distribute parallel for - for (int i = 7; i < UB; i++) { - a[i] += 2; - } - - int LB = 7; -#pragma omp target teams distribute parallel for - for (int i = LB; i < UB; i++) { - a[i] += 3; - } - -// Using "<=" -#pragma omp target teams distribute parallel for - for (int i = 7; i <= N - 1; i++) { - a[i] += 1; - } - -#pragma omp target teams distribute parallel for - for (int i = 7; i <= UB - 1; i++) { - a[i] += 2; - } - -#pragma omp target teams distribute parallel for - for (int i = LB; i <= UB - 1; i++) { - a[i] += 3; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:6 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - // clang-format on - - // CHECK: a[40] = 12 - int index = 40; - printf("a[%d] = %f\n", index, a[index]); - - // Checking the results are correct: - bool error = false; - for (int i = 7; i < N; i++) { - if (!(a[i] == 12)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - error = false; - for (int i = 0; i < 7; i++) { - if (!(a[i] == 0)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS: first 7 entries - if (!error) - printf("SUCCESS: first 7 entries\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/small-trip-count-threshold-with-reduction-2.cpp b/offload/test/multi_device/small-trip-count-threshold-with-reduction-2.cpp deleted file mode 100644 index 2208d7caecee1..0000000000000 --- a/offload/test/multi_device/small-trip-count-threshold-with-reduction-2.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -O3 -fopenmp-target-multi-device -fopenmp-target-fast -fopenmp-target-xteam-reduction-blocksize=128 -fno-openmp-target-xteam-reduction -fopenmp-offload-mandatory -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_BLOCKS_FOR_LOW_TRIP_COUNT=512 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=16 LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT=15001\ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 15000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - double sum = 0.0; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 0; i < N; i++) { - a[i] += 1; - sum += a[i]; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:2 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:0 md_UB:7499 - // CHECK: DEVID: 1 SGN:2 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:7500 md_UB:14999 - // clang-format on - - // CHECK: sum = 15000 - printf("sum = %f\n", sum); - - // Checking the results are correct: - bool error = false; - for (int i = 0; i < N; i++) { - if (!(a[i] == 1)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/small-trip-count-threshold-with-reduction-3.cpp b/offload/test/multi_device/small-trip-count-threshold-with-reduction-3.cpp deleted file mode 100644 index d835d21d9fa3c..0000000000000 --- a/offload/test/multi_device/small-trip-count-threshold-with-reduction-3.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -O3 -fopenmp-target-multi-device -fopenmp-target-fast -fopenmp-target-xteam-reduction-blocksize=128 -fno-openmp-target-xteam-reduction -fopenmp-offload-mandatory -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_BLOCKS_FOR_LOW_TRIP_COUNT=512 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=16 LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT=15001\ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 15000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - double sum = 0.0; -#pragma omp target teams distribute parallel for simd reduction(+ : sum) - for (int i = 0; i < N; i++) { - a[i] += 1; - sum += a[i]; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:2 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:0 md_UB:7499 - // CHECK: DEVID: 1 SGN:2 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:7500 md_UB:14999 - // clang-format on - - // CHECK: sum = 15000 - printf("sum = %f\n", sum); - - // Checking the results are correct: - bool error = false; - for (int i = 0; i < N; i++) { - if (!(a[i] == 1)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/small-trip-count-threshold-with-reduction.cpp b/offload/test/multi_device/small-trip-count-threshold-with-reduction.cpp deleted file mode 100644 index 87db0a6d7c1fc..0000000000000 --- a/offload/test/multi_device/small-trip-count-threshold-with-reduction.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_BLOCKS_FOR_LOW_TRIP_COUNT=512 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=16 LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT=15001\ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 15000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - - double sum = 0.0; -#pragma omp target teams distribute parallel for reduction(+ : sum) - for (int i = 0; i < N; i++) { - a[i] += 1; - sum += a[i]; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:8 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:0 md_UB:7499 - // CHECK: DEVID: 1 SGN:8 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:7500 md_UB:14999 - // clang-format on - - // CHECK: sum = 15000 - printf("sum = %f\n", sum); - - // Checking the results are correct: - bool error = false; - for (int i = 0; i < N; i++) { - if (!(a[i] == 1)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/small-trip-count-threshold.cpp b/offload/test/multi_device/small-trip-count-threshold.cpp deleted file mode 100644 index 78ecc1311f17e..0000000000000 --- a/offload/test/multi_device/small-trip-count-threshold.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_BLOCKS_FOR_LOW_TRIP_COUNT=512 LIBOMPTARGET_MIN_THREADS_FOR_LOW_TRIP_COUNT=16 LIBOMPTARGET_AMDGPU_LOW_TRIPCOUNT=15001\ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 15000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -#pragma omp target teams distribute parallel for - for (int i = 0; i < N; i++) { - a[i] += 1; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:0 md_UB:7499 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:15000 rpc:0 md:1 md_LB:7500 md_UB:14999 - // clang-format on - - // Checking the results are correct: - bool error = false; - for (int i = 0; i < N; i++) { - if (!(a[i] == 1)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/small-trip-count.cpp b/offload/test/multi_device/small-trip-count.cpp deleted file mode 100644 index d79e1d74a6c54..0000000000000 --- a/offload/test/multi_device/small-trip-count.cpp +++ /dev/null @@ -1,49 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 2 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a [0, 0, 0] - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Loop with tripcount 1, should be less than number of devices used -// so this should actually only use 1 device: -#pragma omp target teams distribute parallel for - for (int i = 1; i < N; i++) { - a[i] += 1; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:1 rpc:0 md:1 md_LB:0 md_UB:0 - // clang-format on - - // CHECK: a[0] = 0 - // CHECK: a[1] = 1 - // CHECK: a[2] = 0 - printf("a[0] = %f\n", a[0]); - printf("a[1] = %f\n", a[1]); - printf("a[2] = %f\n", a[2]); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/static-chunk.cpp b/offload/test/multi_device/static-chunk.cpp deleted file mode 100644 index 2c5e70b4c2590..0000000000000 --- a/offload/test/multi_device/static-chunk.cpp +++ /dev/null @@ -1,67 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 LIBOMPTARGET_KERNEL_TRACE=1 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Runs in multi-device mode -#pragma omp target teams distribute parallel for schedule(static, 1) - for (int i = 1; i < N; i++) { - a[i] += 2; - } - -// Does not run in multi-device node yet -#pragma omp target teams distribute parallel for schedule(static, 2) - for (int i = 1; i < N; i++) { - a[i] += 2; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9999 rpc:1 md:1 md_LB:0 md_UB:4998 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9999 rpc:1 md:1 md_LB:4999 md_UB:9998 - - // CHECK: DEVID: 0 SGN:2 {{.*}} tripcount:9999 rpc:1 md:0 - // clang-format on - - // CHECK: a[0] = 0 - // CHECK: a[10000] = 0 - printf("a[0] = %f\n", a[0]); - printf("a[10000] = %f\n", a[10000]); - - bool error = false; - for (int i = 1; i < N; i++) { - if (a[i] != 4) { - error = true; - printf("ERROR: a[%d] = %f\n", i, a[i]); - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - free(a); - return 0; -} diff --git a/offload/test/multi_device/two-step-loop-big-jump.cpp b/offload/test/multi_device/two-step-loop-big-jump.cpp deleted file mode 100644 index 362837e08f1af..0000000000000 --- a/offload/test/multi_device/two-step-loop-big-jump.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// clang-format off -// RUN: %libomptarget-compile-generic -fopenmp-target-multi-device -// RUN: env HSA_XNACK=1 OMPX_APU_MAPS=1 LIBOMPTARGET_KERNEL_TRACE=1 LIBOMPTARGET_NUM_MULTI_DEVICES=2 \ -// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic -// clang-format on - -// UNSUPPORTED: aarch64-unknown-linux-gnu -// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO -// UNSUPPORTED: nvptx64-nvidia-cuda -// UNSUPPORTED: nvptx64-nvidia-cuda-LTO -// UNSUPPORTED: x86_64-pc-linux-gnu -// UNSUPPORTED: x86_64-pc-linux-gnu-LTO - -// REQUIRES: multi_device - -#include -#include - -#define N 10000 - -int main() { - double *a = (double *)malloc(sizeof(double) * (N + 1)); - - // Init a: - for (int i = 0; i < N + 1; i++) { - a[i] = 0.0; - } - -// Using "<" -#pragma omp target teams -#pragma omp distribute parallel for - for (int i = 7; i < N; i++) { - a[i] += 1; - } - - int UB = N; -#pragma omp target teams -#pragma omp distribute parallel for - for (int i = 7; i < UB; i++) { - a[i] += 2; - } - - int LB = 7; -#pragma omp target teams -#pragma omp distribute parallel for - for (int i = LB; i < UB; i++) { - a[i] += 3; - } - -// Using "<=" -#pragma omp target teams -#pragma omp distribute parallel for - for (int i = 7; i <= N - 1; i++) { - a[i] += 1; - } - -#pragma omp target teams -#pragma omp distribute parallel for - for (int i = 7; i <= UB - 1; i++) { - a[i] += 2; - } - -#pragma omp target teams -#pragma omp distribute parallel for - for (int i = LB; i <= UB - 1; i++) { - a[i] += 3; - } - - // clang-format off - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - - // CHECK: DEVID: 0 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:0 md_UB:4995 - // CHECK: DEVID: 1 SGN:7 {{.*}} tripcount:9993 rpc:0 md:1 md_LB:4996 md_UB:9992 - // clang-format on - - // CHECK: a[40] = 12 - int index = 40; - printf("a[%d] = %f\n", index, a[index]); - - // Checking the results are correct: - bool error = false; - for (int i = 7; i < N; i++) { - if (!(a[i] == 12)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS - if (!error) - printf("SUCCESS\n"); - - error = false; - if (a[N] != 0) - error = true; - - // CHECK: SUCCESS: last entry - if (!error) - printf("SUCCESS: last entry\n"); - - error = false; - for (int i = 0; i < 7; i++) { - if (!(a[i] == 0)) { - printf("ERROR at index = %d, value is a[%d] = %f\n", i, i, a[i]); - error = true; - break; - } - } - - // CHECK: SUCCESS: first 7 entries - if (!error) - printf("SUCCESS: first 7 entries\n"); - - free(a); - return 0; -} diff --git a/openmp/device/src/Workshare.cpp b/openmp/device/src/Workshare.cpp index 5a3d0d0fda660..653104ce883d1 100644 --- a/openmp/device/src/Workshare.cpp +++ b/openmp/device/src/Workshare.cpp @@ -197,24 +197,6 @@ template struct omptarget_nvptx_LoopSupport { *pstride = stride; } - /// static init function that takes into account multi-device execution - static void for_static_init_md(int32_t global_tid, int32_t schedtype, - int32_t *plastiter, T *plower_md, T *pupper_md, - T *plower, T *pupper, ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - T multi_device_lb; - multi_device_lb = *plower_md; - T multi_device_ub; - multi_device_ub = *pupper_md; - - for_static_init(global_tid, schedtype, plastiter, &multi_device_lb, - &multi_device_ub, pstride, chunk, IsSPMDExecutionMode); - - // Perform post static init adjustment of LB and UB - *plower = multi_device_lb; - *pupper = multi_device_ub; - } - //////////////////////////////////////////////////////////////////////////////// // Support for dispatch Init @@ -617,46 +599,6 @@ void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) { // deinit void __kmpc_dispatch_deinit(IdentTy *loc, int32_t tid) { popDST(); } -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) for multi-device -//////////////////////////////////////////////////////////////////////////////// - -void __kmpc_distribute_static_init_multi_device_4( - IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int32_t *plower_md, int32_t *pupper_md, int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport::for_static_init_md( - global_tid, schedtype, plastiter, plower_md, pupper_md, plower, pupper, - pstride, chunk, mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_multi_device_4u( - IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint32_t *plower_md, uint32_t *pupper_md, uint32_t *plower, - uint32_t *pupper, int32_t *pstride, int32_t incr, int32_t chunk) { - omptarget_nvptx_LoopSupport::for_static_init_md( - global_tid, schedtype, plastiter, plower_md, pupper_md, plower, pupper, - pstride, chunk, mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_multi_device_8( - IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int64_t *plower_md, int64_t *pupper_md, int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport::for_static_init_md( - global_tid, schedtype, plastiter, plower_md, pupper_md, plower, pupper, - pstride, chunk, mapping::isSPMDMode()); -} - -void __kmpc_distribute_static_init_multi_device_8u( - IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint64_t *plower_md, uint64_t *pupper_md, uint64_t *plower, - uint64_t *pupper, int64_t *pstride, int64_t incr, int64_t chunk) { - omptarget_nvptx_LoopSupport::for_static_init_md( - global_tid, schedtype, plastiter, plower_md, pupper_md, plower, pupper, - pstride, chunk, mapping::isSPMDMode()); -} - //////////////////////////////////////////////////////////////////////////////// // KMP interface implementation (static loops) ////////////////////////////////////////////////////////////////////////////////