ROCm · nicebert · Mar 20, 2026
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
@@ -241,7 +241,6 @@ LANGOPT(OpenMPTargetBigJumpLoop , 1, 1, NotCompatible, "Use big jump loop code g
 LANGOPT(OpenMPTargetNoLoop , 1, 1, NotCompatible, "Use no-loop code generation technique.")
 LANGOPT(OpenMPTargetXteamReduction , 1, 1, NotCompatible, "Use cross-team code generation technique.")
 LANGOPT(OpenMPTargetFastReduction , 1, 0, NotCompatible, "Use fast reduction code generation technique.")
-LANGOPT(OpenMPTargetMultiDevice , 1, 0, NotCompatible, "Offload the iteration space of a single target region across multiple GPU devices.")
 
 // The flag '-fopenmp-target-xteam-scan' triggers the 'Segmented Cross Team Scan' variant by default. To use the no-loop variant, please use the flag '-fopenmp-target-no-loop-scan' instead. 
 LANGOPT(OpenMPTargetXteamScan , 1, 0, NotCompatible, "Use the cross-team specialized kernel code generation for 'scan' directive.")

diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
@@ -4170,14 +4170,6 @@ def fno_openmp_target_xteam_no_loop_scan : Flag<["-"], "fno-openmp-target-xteam-
   Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
   HelpText<"Do not use the no-loop variant of the cross-team specialized kernel code generation for 'scan' directive.">,
   MarshallingInfoFlag<LangOpts<"OpenMPTargetXteamNoLoopScan">>;
-def fopenmp_target_multi_device : Flag<["-"], "fopenmp-target-multi-device">, Group<f_Group>,
-  Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option, FlangOption]>,
-  HelpText<"Enable code generation to emit support for multi device target region execution">,
-  MarshallingInfoFlag<LangOpts<"OpenMPTargetMultiDevice">>;
-def fno_openmp_target_multi_device : Flag<["-"], "fno-openmp-target-multi-device">, Group<f_Group>,
-  Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option,FlangOption]>,
-  HelpText<"Do not use code generation to emit support for multi target offloading">,
-  MarshallingInfoFlag<LangOpts<"OpenMPTargetMultiDevice">>;
 
 //===----------------------------------------------------------------------===//
 // Shared cc1 + fc1 OpenMP Target Options

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1073,8 +1073,7 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM)
 
   // The user forces the compiler to behave as if omp requires
   // unified_shared_memory was given.
-  if (CGM.getLangOpts().OpenMPForceUSM ||
-      CGM.getLangOpts().OpenMPTargetMultiDevice) {
+  if (CGM.getLangOpts().OpenMPForceUSM) {
     HasRequiresUnifiedSharedMemory = true;
     OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true);
   }
@@ -1238,8 +1237,7 @@ struct PushAndPopStackRAII {
 static llvm::Function *emitParallelOrTeamsOutlinedFunction(
     CodeGenModule &CGM, const OMPExecutableDirective &D, const CapturedStmt *CS,
     const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
-    const StringRef OutlinedHelperName, const RegionCodeGenTy &CodeGen,
-    bool EmittingOutlinedTeams) {
+    const StringRef OutlinedHelperName, const RegionCodeGenTy &CodeGen) {
   assert(ThreadIDVar->getType()->isPointerType() &&
          "thread id variable must be of type kmp_int32 *");
   CodeGenFunction CGF(CGM, true);
@@ -1270,8 +1268,7 @@ static llvm::Function *emitParallelOrTeamsOutlinedFunction(
   CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
                                     HasCancel, OutlinedHelperName);
   CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
-  return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D,
-                                                EmittingOutlinedTeams, false);
+  return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D, D.getBeginLoc());
 }
 
 std::string CGOpenMPRuntime::getOutlinedHelperName(StringRef Name) const {
@@ -1295,7 +1292,7 @@ llvm::Function *CGOpenMPRuntime::emitParallelOutlinedFunction(
   const CapturedStmt *CS = D.getCapturedStmt(OMPD_parallel);
   return emitParallelOrTeamsOutlinedFunction(
       CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(CGF),
-      CodeGen, /*EmittingOutlinedTeams*/ false);
+      CodeGen);
 }
 
 llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction(
@@ -1305,7 +1302,7 @@ llvm::Function *CGOpenMPRuntime::emitTeamsOutlinedFunction(
   const CapturedStmt *CS = D.getCapturedStmt(OMPD_teams);
   return emitParallelOrTeamsOutlinedFunction(
       CGM, D, CS, ThreadIDVar, InnermostKind, getOutlinedHelperName(CGF),
-      CodeGen, /*EmittingOutlinedTeams*/ true);
+      CodeGen);
 }
 
 llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction(
@@ -2735,37 +2732,19 @@ static void emitForStaticInitCall(
            "expected static chunked schedule");
   }
 
-  if (Values.IsMultiDevice) {
-    llvm::Value *Args[] = {
-        UpdateLocation,
-        ThreadId,
-        CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1,
-                                                    M2)), // Schedule type
-        Values.IL.emitRawPointer(CGF),                    // &isLastIter
-        Values.MultiDeviceLB.emitRawPointer(CGF),         // &MultiDeviceLB
-        Values.MultiDeviceUB.emitRawPointer(CGF),         // &MultiDeviceUB
-        Values.LB.emitRawPointer(CGF),                    // &LB
-        Values.UB.emitRawPointer(CGF),                    // &UB
-        Values.ST.emitRawPointer(CGF),                    // &Stride
-        CGF.Builder.getIntN(Values.IVSize, 1),            // Incr
-        Chunk                                             // Chunk
-    };
-    CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
-  } else {
-    llvm::Value *Args[] = {
-        UpdateLocation,
-        ThreadId,
-        CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1,
-                                                    M2)), // Schedule type
-        Values.IL.emitRawPointer(CGF),                    // &isLastIter
-        Values.LB.emitRawPointer(CGF),                    // &LB
-        Values.UB.emitRawPointer(CGF),                    // &UB
-        Values.ST.emitRawPointer(CGF),                    // &Stride
-        CGF.Builder.getIntN(Values.IVSize, 1),            // Incr
-        Chunk                                             // Chunk
-    };
-    CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
-  }
+  llvm::Value *Args[] = {
+      UpdateLocation,
+      ThreadId,
+      CGF.Builder.getInt32(addMonoNonMonoModifier(CGF.CGM, Schedule, M1,
+                                                  M2)), // Schedule type
+      Values.IL.emitRawPointer(CGF),                    // &isLastIter
+      Values.LB.emitRawPointer(CGF),                    // &LB
+      Values.UB.emitRawPointer(CGF),                    // &UB
+      Values.ST.emitRawPointer(CGF),                    // &Stride
+      CGF.Builder.getIntN(Values.IVSize, 1),            // Incr
+      Chunk                                             // Chunk
+  };
+  CGF.EmitRuntimeCall(ForStaticInitFunction, Args);
 }
 
 void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
@@ -2793,7 +2772,7 @@ void CGOpenMPRuntime::emitForStaticInit(CodeGenFunction &CGF,
 void CGOpenMPRuntime::emitDistributeStaticInit(
     CodeGenFunction &CGF, SourceLocation Loc,
     OpenMPDistScheduleClauseKind SchedKind,
-    const CGOpenMPRuntime::StaticRTInput &Values, bool IsMultiDeviceKernel) {
+    const CGOpenMPRuntime::StaticRTInput &Values) {
   OpenMPSchedType ScheduleNum =
       getRuntimeSchedule(SchedKind, Values.Chunk != nullptr);
   llvm::Value *UpdatedLocation =
@@ -2802,13 +2781,8 @@ void CGOpenMPRuntime::emitDistributeStaticInit(
   llvm::FunctionCallee StaticInitFunction;
   bool isGPUDistribute =
       CGM.getLangOpts().OpenMPIsTargetDevice && CGM.getTriple().isGPU();
-  if (IsMultiDeviceKernel && isGPUDistribute) {
-    StaticInitFunction = OMPBuilder.createMDDistributeForStaticInitFunction(
-        Values.IVSize, Values.IVSigned);
-  } else {
-    StaticInitFunction = OMPBuilder.createForStaticInitFunction(
-        Values.IVSize, Values.IVSigned, isGPUDistribute);
-  }
+  StaticInitFunction = OMPBuilder.createForStaticInitFunction(
+      Values.IVSize, Values.IVSigned, isGPUDistribute);
   emitForStaticInitCall(CGF, UpdatedLocation, ThreadId, StaticInitFunction,
                         ScheduleNum, OMPC_SCHEDULE_MODIFIER_unknown,
                         OMPC_SCHEDULE_MODIFIER_unknown, Values);
@@ -6425,10 +6399,7 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
         CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
         if (CGM.getLangOpts().OpenMPIsTargetDevice && !isGPU())
           return CGF.GenerateOpenMPCapturedStmtFunctionAggregate(CS, D);
-        return CGF.GenerateOpenMPCapturedStmtFunction(
-            CS, D,
-            /*CanHaveMultiDeviceArgs*/ true,
-            /*IsTopKernel*/ true);
+        return CGF.GenerateOpenMPCapturedStmtFunction(CS, D, D.getBeginLoc());
       };
 
   cantFail(OMPBuilder.emitTargetRegionFunction(
@@ -10785,47 +10756,12 @@ emitDynCGroupMem(const OMPExecutableDirective &D, CodeGenFunction &CGF) {
 static void genMapInfoForCaptures(
     MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
     const CapturedStmt &CS, llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
-    llvm::SmallVectorImpl<llvm::Value *> &MultiTargetVars,
     llvm::OpenMPIRBuilder &OMPBuilder,
     llvm::DenseSet<CanonicalDeclPtr<const Decl>> &MappedVarSet,
     uint32_t &CapturedCount,
     MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
   llvm::DenseMap<llvm::Value *, llvm::Value *> LambdaPointers;
 
-  // If a for statement is present and the compiler flag for multi-device
-  // targets is enabled then it means we have 2 variables at the start which
-  // represent the lower and upper bounds of the loop:
-  // TODO: add compiler flag condition
-  for (auto *MTV = MultiTargetVars.begin(); MTV != MultiTargetVars.end();
-       ++MTV) {
-    // This should always be null because the any used variable (if one exists)
-    // will be included when capturing the actual variables (not the
-    // multi-target ones).
-    MappedVarSet.insert(nullptr);
-
-    MappableExprsHandler::MapCombinedInfoTy CurInfo;
-    CurInfo.Exprs.push_back(nullptr);
-    CurInfo.BasePointers.push_back(*MTV);
-    CurInfo.Pointers.push_back(*MTV);
-    CurInfo.Sizes.push_back(llvm::ConstantInt::get(CGF.Int64Ty, 4));
-
-    // Copy to the device as an argument. No need to retrieve it.
-    CurInfo.Types.push_back(OpenMPOffloadMappingFlags::OMP_MAP_LITERAL |
-                            OpenMPOffloadMappingFlags::OMP_MAP_TARGET_PARAM |
-                            OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT);
-    CurInfo.Mappers.push_back(nullptr);
-
-    assert(CurInfo.BasePointers.size() == CurInfo.Pointers.size() &&
-           CurInfo.BasePointers.size() == CurInfo.Sizes.size() &&
-           CurInfo.BasePointers.size() == CurInfo.Types.size() &&
-           CurInfo.BasePointers.size() == CurInfo.Mappers.size() &&
-           "Inconsistent map information sizes!");
-
-    // We need to append the results of this capture to what we already
-    // have.
-    CombinedInfo.append(CurInfo);
-  }
-
   auto RI = CS.getCapturedRecordDecl()->field_begin();
   auto *CV = CapturedVars.begin();
   CapturedCount = 0;
@@ -10952,15 +10888,14 @@ genMapInfo(MappableExprsHandler &MEHandler, CodeGenFunction &CGF,
 static void genMapInfo(const OMPExecutableDirective &D, CodeGenFunction &CGF,
                        const CapturedStmt &CS,
                        llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
-                       llvm::SmallVectorImpl<llvm::Value *> &MultiTargetVars,
                        llvm::OpenMPIRBuilder &OMPBuilder,
                        uint32_t &CapturedCount,
                        MappableExprsHandler::MapCombinedInfoTy &CombinedInfo) {
   // Get mappable expression information.
   MappableExprsHandler MEHandler(D, CGF);
   llvm::DenseSet<CanonicalDeclPtr<const Decl>> MappedVarSet;
 
-  genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars, MultiTargetVars,
+  genMapInfoForCaptures(MEHandler, CGF, CS, CapturedVars,
                         OMPBuilder, MappedVarSet, CapturedCount, CombinedInfo);
   genMapInfo(MEHandler, CGF, CombinedInfo, OMPBuilder, MappedVarSet);
 }
@@ -10985,7 +10920,6 @@ static void emitTargetCallKernelLaunch(
     CGOpenMPRuntime *OMPRuntime, llvm::Function *OutlinedFn,
     const OMPExecutableDirective &D,
     llvm::SmallVectorImpl<llvm::Value *> &CapturedVars,
-    llvm::SmallVectorImpl<llvm::Value *> &MultiTargetVars,
     bool RequiresOuterTask, const CapturedStmt &CS, bool OffloadingMandatory,
     llvm::PointerIntPair<const Expr *, 2, OpenMPDeviceClauseModifier> Device,
     llvm::Value *OutlinedFnID, CodeGenFunction::OMPTargetDataInfo &InputInfo,
@@ -10999,7 +10933,7 @@ static void emitTargetCallKernelLaunch(
   // Fill up the arrays with all the captured variables.
   MappableExprsHandler::MapCombinedInfoTy CombinedInfo;
   uint32_t CapturedCount;
-  genMapInfo(D, CGF, CS, CapturedVars, MultiTargetVars, OMPBuilder,
+  genMapInfo(D, CGF, CS, CapturedVars, OMPBuilder,
              CapturedCount, CombinedInfo);
 
   // Array to hold to allocated XTeam reduction variables:
@@ -11038,8 +10972,6 @@ static void emitTargetCallKernelLaunch(
     llvm::Value *XteamRedNumTeamsFromOccupancy = nullptr;
     bool IsXteamRedFast = CGF.CGM.isXteamRedFast(FStmt);
     // We don't need to allocate/initialize metadata in the fast version.
-    // TODO: This will not work for multi-target if we need to allocate
-    // data for each used device. Ensure conditions guard against that.
     if (!IsXteamRedFast) {
       // TODO Use device id from device clause, if any.
       DevIdVal = CGF.EmitRuntimeCall(
@@ -11330,9 +11262,6 @@ static void emitTargetCallKernelLaunch(
     bool IsReverseOffloading = Device.getInt() == OMPC_DEVICE_ancestor;
 
     if (IsReverseOffloading) {
-      assert(
-          !CGF.CGM.getLangOpts().OpenMPTargetMultiDevice &&
-          "Cannot enable multi-device targets when doing reverse offloading");
       // Reverse offloading is not supported, so just execute on the host.
       // FIXME: This fallback solution is incorrect since it ignores the
       // OMP_TARGET_OFFLOAD environment variable. Instead it would be better to
@@ -11397,12 +11326,9 @@ static void emitTargetCallKernelLaunch(
     CGF.Builder.restoreIP(AfterIP);
   };
 
-  if (RequiresOuterTask) {
-    assert(!CGM.getLangOpts().OpenMPTargetMultiDevice &&
-           "Cannot yet enable multi-device targets for situations in which an "
-           "outer task is required");
+  if (RequiresOuterTask)
     CGF.EmitOMPTargetTaskBasedDirective(D, ThenGen, InputInfo);
-  } else
+  else
     OMPRuntime->emitInlinedDirective(CGF, D.getDirectiveKind(), ThenGen);
 
   if (HasXTeamReduction) {
@@ -11466,11 +11392,10 @@ void CGOpenMPRuntime::emitTargetCall(
        needsTaskBasedThreadLimit(D.getDirectiveKind()) &&
        D.hasClausesOfKind<OMPThreadLimitClause>());
   llvm::SmallVector<llvm::Value *, 16> CapturedVars;
-  llvm::SmallVector<llvm::Value *, 4> MultiTargetVars;
   const CapturedStmt &CS = *D.getCapturedStmt(OMPD_target);
-  auto &&ArgsCodegen = [&CS, &D, &CapturedVars, &MultiTargetVars](
+  auto &&ArgsCodegen = [&CS, &D, &CapturedVars](
                            CodeGenFunction &CGF, PrePostActionTy &) {
-    CGF.GenerateOpenMPCapturedVarsDevice(CS, CapturedVars, MultiTargetVars,
+    CGF.GenerateOpenMPCapturedVars(CS, CapturedVars,
                                          CGF.CGM.getOptKernelKey(D));
   };
   emitInlinedDirective(CGF, OMPD_unknown, ArgsCodegen);
@@ -11479,13 +11404,13 @@ void CGOpenMPRuntime::emitTargetCall(
   llvm::Value *MapTypesArray = nullptr;
   llvm::Value *MapNamesArray = nullptr;
 
-  auto &&TargetThenGen = [this, OutlinedFn, &D, &CapturedVars, &MultiTargetVars,
+  auto &&TargetThenGen = [this, OutlinedFn, &D, &CapturedVars,
                           RequiresOuterTask, &CS, OffloadingMandatory, Device,
                           OutlinedFnID, &InputInfo, &MapTypesArray,
                           &MapNamesArray, SizeEmitter](CodeGenFunction &CGF,
                                                        PrePostActionTy &) {
     emitTargetCallKernelLaunch(
-        this, OutlinedFn, D, CapturedVars, MultiTargetVars, RequiresOuterTask,
+        this, OutlinedFn, D, CapturedVars, RequiresOuterTask,
         CS, OffloadingMandatory, Device, OutlinedFnID, InputInfo, MapTypesArray,
         MapNamesArray, SizeEmitter, CGF, CGM);
   };
@@ -13602,8 +13527,7 @@ void CGOpenMPSIMDRuntime::emitForStaticInit(
 
 void CGOpenMPSIMDRuntime::emitDistributeStaticInit(
     CodeGenFunction &CGF, SourceLocation Loc,
-    OpenMPDistScheduleClauseKind SchedKind, const StaticRTInput &Values,
-    bool IsMultiDeviceKernel) {
+    OpenMPDistScheduleClauseKind SchedKind, const StaticRTInput &Values) {
   llvm_unreachable("Not supported in SIMD-only mode");
 }
 

diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -959,17 +959,9 @@ class CGOpenMPRuntime {
     bool IVSigned = false;
     /// true if loop is ordered, false otherwise.
     bool Ordered = false;
-    /// true if kernel is multi-device
-    bool IsMultiDevice = false;
     Address IL = Address::invalid();
     /// Address of the output variable in which the lower iteration number is
     /// returned.
-    Address MultiDeviceLB = Address::invalid();
-    /// Address of the output variable in which the upper iteration number is
-    /// returned.
-    Address MultiDeviceUB = Address::invalid();
-    /// Address of the output variable in which the lower iteration number is
-    /// returned.
     Address LB = Address::invalid();
     /// Address of the output variable in which the upper iteration number is
     /// returned.
@@ -985,11 +977,6 @@ class CGOpenMPRuntime {
                   llvm::Value *Chunk = nullptr)
         : IVSize(IVSize), IVSigned(IVSigned), Ordered(Ordered), IL(IL), LB(LB),
           UB(UB), ST(ST), Chunk(Chunk) {}
-    void setMultiDeviceLBUB(Address LB, Address UB) {
-      MultiDeviceLB = LB;
-      MultiDeviceUB = UB;
-      IsMultiDevice = true;
-    }
   };
   /// Call the appropriate runtime routine to initialize it before start
   /// of loop.
@@ -1020,8 +1007,7 @@ class CGOpenMPRuntime {
   virtual void emitDistributeStaticInit(CodeGenFunction &CGF,
                                         SourceLocation Loc,
                                         OpenMPDistScheduleClauseKind SchedKind,
-                                        const StaticRTInput &Values,
-                                        bool IsMultiDeviceKernel);
+                                        const StaticRTInput &Values);
 
   /// Call the appropriate runtime routine to notify that we finished
   /// iteration of the ordered loop with the dynamic scheduling.
@@ -1960,8 +1946,7 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime {
   ///
   void emitDistributeStaticInit(CodeGenFunction &CGF, SourceLocation Loc,
                                 OpenMPDistScheduleClauseKind SchedKind,
-                                const StaticRTInput &Values,
-                                bool IsMultiDeviceKernel) override;
+                                const StaticRTInput &Values) override;
 
   /// Call the appropriate runtime routine to notify that we finished
   /// iteration of the ordered loop with the dynamic scheduling.