madgraph5 · Copilot · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 14, 2026
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only

diff --git a/...DEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/...DEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,271 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled in every C++ (non-GPU) build, i.e. whenever MGONGPUCPP_GPUIMPL is not defined.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#ifndef MGONGPUCPP_GPUIMPL
+
+// Forward declarations for per-SIMD namespaces (NS is the namespace in which the entry points are declared).
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h; the MGONGPU_CPPFAT_*_PARAMS
+// placeholders are only expanded where this macro is invoked, so they can be #defined further down.
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Extra parameters present only in multichannel builds (each expansion ends with a comma; empty otherwise)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+// Declare the same four entry points once in each per-SIMD namespace
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+// The helper macros are only needed for the declarations above
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // chosen once here (honours MGONGPU_SIMD_LEVEL) and logged
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // Sanity checks only: throw std::runtime_error on device buffers, mismatched event counts or nevt not a multiple of neppM
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: it used to say "device array"
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // no explicit cleanup: members are destroyed automatically
+
+  //--------------------------------------------------------------------------
+
+  // Detect the SIMD level to use on the current CPU at runtime (INFO messages are printed only if verbose).
+  // If the environment variable MGONGPU_SIMD_LEVEL is set to one of the recognised level names
+  // (avx512z, avx512y, avx2, sse4, none), that level is returned *provided* the hardware supports it.
+  // An unsupported or unrecognised value triggers a warning on stderr (even if verbose is false) and
+  // falls back to auto-detection. This allows benchmarking a lower SIMD tier on a machine that
+  // supports a higher one, e.g.:
+  //   MGONGPU_SIMD_LEVEL=avx2 ./check_cpp.exe   # force AVX2 on AVX512 HW
+  // Both AVX512 levels require AVX512VL: avx512y runs AVX512 on 256-bit vectors, which AVX512F alone cannot do.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+    // --- optional user override via MGONGPU_SIMD_LEVEL ---
+    const char* simdEnv = getenv( "MGONGPU_SIMD_LEVEL" );
+    if( simdEnv != nullptr )
+    {
+      const std::string requested( simdEnv );
+      SimdLevel req = SimdLevel::none; // initialised to keep compiler happy
+      bool knownLevel = true;
+      if( requested == "avx512z" )
+        req = SimdLevel::avx512z;
+      else if( requested == "avx512y" )
+        req = SimdLevel::avx512y;
+      else if( requested == "avx2" )
+        req = SimdLevel::avx2;
+      else if( requested == "sse4" )
+        req = SimdLevel::sse4;
+      else if( requested == "none" )
+        req = SimdLevel::none;
+      else
+      {
+        std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
+                  << "' is not recognised (valid values: avx512z avx512y avx2 sse4 none)."
+                  << " Falling back to auto-detection." << std::endl;
+        knownLevel = false;
+      }
+      if( knownLevel )
+      {
+        // Safety check: refuse to use a level the hardware cannot execute.
+        bool hwOk = false;
+#if defined( __x86_64__ ) || defined( __i386__ )
+        switch( req )
+        {
+          case SimdLevel::avx512z: hwOk = __builtin_cpu_supports( "avx512vl" ); break;
+          case SimdLevel::avx512y: hwOk = __builtin_cpu_supports( "avx512vl" ); break; // 256-bit AVX512 needs VL, AVX512F is not enough
+          case SimdLevel::avx2:    hwOk = __builtin_cpu_supports( "avx2" );     break;
+          case SimdLevel::sse4:    hwOk = __builtin_cpu_supports( "sse4.2" );   break;
+          case SimdLevel::none:    hwOk = true;                                 break;
+        }
+#else
+        // Non-x86: only sse4 (NEON/VSX) and none are meaningful overrides.
+        hwOk = ( req == SimdLevel::sse4 || req == SimdLevel::none );
+#endif
+        if( hwOk )
+        {
+          if( verbose )
+            std::cout << "INFO: Fat binary: MGONGPU_SIMD_LEVEL override: selected SIMD level "
+                      << requested << std::endl;
+          return req;
+        }
+        else
+        {
+          std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
+                    << "' is not supported by this CPU."
+                    << " Falling back to auto-detection." << std::endl;
+        }
+      }
+    }
+    // --- auto-detection ---
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: avx512y is deliberately not auto-selected. It runs AVX512 on 256-bit vectors, which
+    // requires AVX512VL, so every CPU able to execute it has already matched avx512z above.
+    // A CPU with AVX512F but without AVX512VL cannot execute the avx512y build (illegal
+    // instruction) and therefore falls through to avx2 or lower. To use avx512y on AVX512VL
+    // hardware, request it explicitly via MGONGPU_SIMD_LEVEL=avx512y.
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a switch that invokes CALL in the per-SIMD namespace matching m_selectedSimd (SimdLevel::none and any other value use mg5amcCpu_none)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    // One flag per helicity combination (ncomb in total), passed as the isGoodHel output of sigmaKin_getGoodHel
+    HostBufferHelicityMask goodHelFlags( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), goodHelFlags.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), goodHelFlags.data(), nevt() ) )
+#endif
+    // Hand the flags to the selected SIMD build and return the count it reports (every case returns)
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( goodHelFlags.data() );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( goodHelFlags.data() );
+      case SimdLevel::avx2:    return mg5amcCpu_avx2::sigmaKin_setGoodHel( goodHelFlags.data() );
+      case SimdLevel::sse4:    return mg5amcCpu_sse4::sigmaKin_setGoodHel( goodHelFlags.data() );
+      default:                 return mg5amcCpu_none::sigmaKin_setGoodHel( goodHelFlags.data() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // all work is delegated to the SIMD build chosen at construction
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // m_gs is the input, m_couplings the output
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr when the caller does not want channel ids used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // builds without multichannel support have no channel-id argument to pass
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG // NOTE(review): pChannelIds exists only in multichannel builds - confirm this macro implies MGONGPU_SUPPORTS_MULTICHANNEL
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/...ODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/...ODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace, see below)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class selects a SIMD
+  // level via detectBestSimd and then delegates all ME computations to that version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if a sanity check fails)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without multichannel support)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the SIMD level to use on the current CPU (honours MGONGPU_SIMD_LEVEL; prints INFO messages if verbose)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depend on alphaS (QCD)
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents