From bfb5b021f6133ec92eba8c762934bb1d6894ee94 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Apr 2026 19:04:10 +0000
Subject: [PATCH 1/4] Add fat binary support: MGONGPU_SIMD_NAMESPACE,
 MatrixElementKernelHostFat, cppfat backend, build rules

Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/db0d537e-e75b-4d0b-ba4c-b7f11fb41df6

Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com>
---
 .../iolibs/template_files/gpu/Bridge.h        |   9 +-
 .../gpu/MatrixElementKernels.cc               | 206 +++++++++++++++++-
 .../template_files/gpu/MatrixElementKernels.h |  60 +++++
 .../iolibs/template_files/gpu/cudacpp.mk      |  79 +++++++
 .../template_files/gpu/cudacpp_config.mk      |   2 +-
 .../iolibs/template_files/gpu/mgOnGpuConfig.h |  19 ++
 .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../ee_mumu.mad/SubProcesses/cudacpp.mk       |  79 +++++++
 .../cudacpp/ee_mumu.mad/src/cudacpp_config.mk |   2 +-
 .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h   |  10 +
 .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../ee_mumu.sa/SubProcesses/cudacpp.mk        |  79 +++++++
 .../cudacpp/ee_mumu.sa/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h |  10 +
 .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk |  79 +++++++
 .../cudacpp/gg_tt.mad/src/cudacpp_config.mk   |   2 +-
 epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h  |  10 +
 epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk  |  79 +++++++
 epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk |   2 +-
 epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h   |  10 +
 .../gg_tt01g.mad/SubProcesses/Bridge.h        |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../gg_tt01g.mad/SubProcesses/cudacpp.mk      |  79 +++++++
 .../gg_tt01g.mad/src/cudacpp_config.mk        |   2 +-
 .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h  |  10 +
 .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../gg_ttg.mad/SubProcesses/cudacpp.mk        |  79 +++++++
 .../cudacpp/gg_ttg.mad/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h |  10 +
 .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk |  79 +++++++
 .../cudacpp/gg_ttg.sa/src/cudacpp_config.mk   |   2 +-
 epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h  |  10 +
 .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../gg_ttgg.mad/SubProcesses/cudacpp.mk       |  79 +++++++
 .../cudacpp/gg_ttgg.mad/src/cudacpp_config.mk |   2 +-
 .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h   |  10 +
 .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../gg_ttgg.sa/SubProcesses/cudacpp.mk        |  79 +++++++
 .../cudacpp/gg_ttgg.sa/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h |  10 +
 .../gg_ttggg.mad/SubProcesses/Bridge.h        |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../gg_ttggg.mad/SubProcesses/cudacpp.mk      |  79 +++++++
 .../gg_ttggg.mad/src/cudacpp_config.mk        |   2 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h  |  10 +
 .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../gg_ttggg.sa/SubProcesses/cudacpp.mk       |  79 +++++++
 .../cudacpp/gg_ttggg.sa/src/cudacpp_config.mk |   2 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h   |  10 +
 .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../gq_ttq.mad/SubProcesses/cudacpp.mk        |  79 +++++++
 .../cudacpp/gq_ttq.mad/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h |  10 +
 .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk |  79 +++++++
 .../cudacpp/gq_ttq.sa/src/cudacpp_config.mk   |   2 +-
 epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h  |  10 +
 .../heft_gg_bb.mad/SubProcesses/Bridge.h      |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../heft_gg_bb.mad/SubProcesses/cudacpp.mk    |  79 +++++++
 .../heft_gg_bb.mad/src/cudacpp_config.mk      |   2 +-
 .../heft_gg_bb.mad/src/mgOnGpuConfig.h        |  10 +
 .../heft_gg_bb.sa/SubProcesses/Bridge.h       |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../heft_gg_bb.sa/SubProcesses/cudacpp.mk     |  79 +++++++
 .../heft_gg_bb.sa/src/cudacpp_config.mk       |   2 +-
 .../cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h |  10 +
 .../nobm_pp_ttW.mad/SubProcesses/Bridge.h     |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../nobm_pp_ttW.mad/SubProcesses/cudacpp.mk   |  79 +++++++
 .../nobm_pp_ttW.mad/src/cudacpp_config.mk     |   2 +-
 .../nobm_pp_ttW.mad/src/mgOnGpuConfig.h       |  10 +
 .../pp_tt012j.mad/SubProcesses/Bridge.h       |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../pp_tt012j.mad/SubProcesses/cudacpp.mk     |  79 +++++++
 .../pp_tt012j.mad/src/cudacpp_config.mk       |   2 +-
 .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h |  10 +
 .../smeft_gg_tttt.mad/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../smeft_gg_tttt.mad/SubProcesses/cudacpp.mk |  79 +++++++
 .../smeft_gg_tttt.mad/src/cudacpp_config.mk   |   2 +-
 .../smeft_gg_tttt.mad/src/mgOnGpuConfig.h     |  10 +
 .../smeft_gg_tttt.sa/SubProcesses/Bridge.h    |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../smeft_gg_tttt.sa/SubProcesses/cudacpp.mk  |  79 +++++++
 .../smeft_gg_tttt.sa/src/cudacpp_config.mk    |   2 +-
 .../smeft_gg_tttt.sa/src/mgOnGpuConfig.h      |  10 +
 .../susy_gg_t1t1.mad/SubProcesses/Bridge.h    |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../susy_gg_t1t1.mad/SubProcesses/cudacpp.mk  |  79 +++++++
 .../susy_gg_t1t1.mad/src/cudacpp_config.mk    |   2 +-
 .../susy_gg_t1t1.mad/src/mgOnGpuConfig.h      |  10 +
 .../susy_gg_t1t1.sa/SubProcesses/Bridge.h     |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../susy_gg_t1t1.sa/SubProcesses/cudacpp.mk   |  79 +++++++
 .../susy_gg_t1t1.sa/src/cudacpp_config.mk     |   2 +-
 .../susy_gg_t1t1.sa/src/mgOnGpuConfig.h       |  10 +
 .../susy_gg_tt.mad/SubProcesses/Bridge.h      |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../susy_gg_tt.mad/SubProcesses/cudacpp.mk    |  79 +++++++
 .../susy_gg_tt.mad/src/cudacpp_config.mk      |   2 +-
 .../susy_gg_tt.mad/src/mgOnGpuConfig.h        |  10 +
 .../susy_gg_tt.sa/SubProcesses/Bridge.h       |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +++++++++++++++++-
 .../SubProcesses/MatrixElementKernels.h       |  60 +++++
 .../susy_gg_tt.sa/SubProcesses/cudacpp.mk     |  79 +++++++
 .../susy_gg_tt.sa/src/cudacpp_config.mk       |   2 +-
 .../cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h |  10 +
 144 files changed, 8721 insertions(+), 72 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a MatrixElementKernelHost otherwise;
+    // the common base class pointer supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds: it sits inside the !MGONGPUCPP_GPUIMPL guard below.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined( MGONGPUCPP_GPUIMPL ) && defined( MGONGPU_CPPFAT ) // fat builds only: the per-SIMD symbols referenced below are built only for BACKEND=cppfat
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameter lists (defined before the first expansion of MG5AMC_CPPFAT_FORWARD_DECLARE below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                                          const BufferGs& gs,                   // input: gs for alphaS
+                                                          const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                                          const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                                          const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                                          BufferMatrixElements& matrixElements, // output: matrix elements
+                                                          BufferSelectedHelicity& selhel,       // output: helicity selection
+                                                          BufferSelectedColor& selcol,          // output: color selection
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once (and print the choice); later calls dispatch on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); // host kernel: reject device buffers
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); // buffers must agree with nevt
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // throws std::runtime_error unless nevt is a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level the current CPU can execute (probed from widest to narrowest); if verbose, print the choice to stdout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both AVX512 objects (512y and 512z) are built with -march=skylake-avx512, so both need F+CD+BW+DQ+VL.
+    // A CPU with AVX512F but without AVX512VL (e.g. Knights Landing) cannot run either of them and must fall through to AVX2.
+    // As 512y needs exactly the same features as 512z, it is never auto-selected once 512z is preferred.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) &&
+      __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no CPU feature probing is done here, the sse4 (128-bit) level is always returned (meant to cover ARM NEON/Power VSX)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: a switch on m_selectedSimd that evaluates CALL in the matching per-SIMD namespace (default: mg5amcCpu_none)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns nGoodHel as reported by the selected sigmaKin_setGoodHel
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // scratch mask sized by CPPProcess::ncomb
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // writes m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // spelled out by hand: MG5AMC_CPPFAT_DISPATCH discards the return value of CALL
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // useChannelIds must be false in builds without multichannel support
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // writes m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed to sigmaKin when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is declared only in the MGONGPU_SUPPORTS_MULTICHANNEL branch above - confirm this macro is never set without it
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (each level maps to one per-SIMD namespace of the dispatcher)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error on device buffers or inconsistent nevt)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false unless MGONGPU_SUPPORTS_MULTICHANNEL is defined)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the selected level to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index 583f3df0c9..b17c12bff9 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -602,6 +602,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -609,6 +611,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -863,6 +870,60 @@ $(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 in one pass, then add the per-SIMD flags (wildcard doubled as in this template's pattern rules)
+CXXFLAGS_NOFAT := $(filter-out -march%% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -877,7 +938,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1132,6 +1207,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
index b57e56d182..b920463d6c 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
index 81e1e24e69..b3792ad8ec 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds of the cppfat backend (-DMGONGPU_CPPFAT):
+// other backends do not build the per-SIMD objects, so its references would stay unresolved at link time.
+// It forward-declares the per-SIMD namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z) and implements MatrixElementKernelHostFat, which detects the
+// host CPU at construction time and delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                                          const BufferGs& gs,                   // input: gs for alphaS
+                                                          const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                                          const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                                          const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                                          BufferMatrixElements& matrixElements, // output: matrix elements
+                                                          BufferSelectedHelicity& selhel,       // output: helicity selection
+                                                          BufferSelectedColor& selcol,          // output: color selection
+                                                          const size_t nevt )                   // input: number of events
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once: all later calls dispatch on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // the check rejects device buffers
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout (NOTE(review): neppM as seen by this baseline-flags TU; confirm the per-SIMD objects assume the same momenta layout)
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // a partially filled AOSOA page is rejected
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level whose per-SIMD objects can run on the current CPU.
+  // Returns the selected level; if verbose is true, the choice is also printed to std::cout.
+  // NB: SimdLevel::avx512y is never auto-selected: its objects are built with -march=skylake-avx512
+  // exactly like those of avx512z (see cudacpp.mk), so any CPU able to run them is assigned avx512z.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // AVX512F alone is not enough: objects built for skylake-avx512 may also use AVX512VL encodings and
+    // would raise illegal instructions on a CPU lacking them, which instead falls through to AVX2 below.
+    if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask goodHelMask( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), goodHelMask.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), goodHelMask.data(), nevt() ) )
+#endif
+    // Pass the helicity mask to the selected per-SIMD implementation and return the count it reports
+    // (the dispatch macro cannot be used here because the call's return value is needed)
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx2:    return mg5amcCpu_avx2::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::sse4:    return mg5amcCpu_sse4::sigmaKin_setGoodHel( goodHelMask.data() );
+      default:                 return mg5amcCpu_none::sigmaKin_setGoodHel( goodHelMask.data() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // dispatches every call to the SIMD level stored in m_selectedSimd
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fills m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed on when channel ids are not requested
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids can only be requested in builds with multichannel support
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the multichannel branch above
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // The SIMD levels that a fat binary can dispatch to at runtime (one per versioned mg5amcCpu_<level> namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if they are not consistent host buffers)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds selects whether the channel ids buffer is passed to the ME calculation)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, also print the selected level)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in a single pass; each level then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds of the cppfat backend (MGONGPU_CPPFAT defined).
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces, resolved at link time to the object files compiled with the
+// appropriate -march flags (no other build defines these symbols, hence the MGONGPU_CPPFAT requirement above).
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds (these helper macros are only expanded where the macro above is invoked)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // the SIMD level is chosen once here (and printed to stdout)
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); // this is a host kernel: reject device buffers
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: it used to ask for a device array
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); // all buffers must hold the same number of events
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // only whole groups of neppM events are accepted
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // nothing to release explicitly: the members clean up after themselves
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can execute (if verbose is true, print the choice to stdout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both AVX512 variants are built with -march=skylake-avx512, i.e. they may use AVX512 F, CD, BW, DQ and VL:
+    // a CPU with only a subset of these (e.g. avx512f without avx512vl) must fall back to avx2 to avoid illegal instructions.
+    // As avx512y and avx512z have the same CPU requirements, only the preferred one (avx512z) is ever selected here.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime detection, the sse4 variant is always selected (NOTE(review): presumably the ARM NEON/Power VSX build - confirm)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a switch on m_selectedSimd that invokes CALL in the matching per-SIMD namespace (mg5amcCpu_none if no other case matches)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    // Scratch host mask with CPPProcess::ncomb entries, filled by sigmaKin_getGoodHel and then passed to sigmaKin_setGoodHel
+    HostBufferHelicityMask goodHelMask( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), goodHelMask.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), goodHelMask.data(), nevt() ) )
+#endif
+    // The number of good helicities is a return value: switch explicitly here, as MG5AMC_CPPFAT_DISPATCH discards results
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx2: return mg5amcCpu_avx2::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::sse4: return mg5amcCpu_sse4::sigmaKin_setGoodHel( goodHelMask.data() );
+      default: return mg5amcCpu_none::sigmaKin_setGoodHel( goodHelMask.data() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs using the selected SIMD variant
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // a null pointer is passed when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids cannot be used without multichannel support
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the MGONGPU_SUPPORTS_MULTICHANNEL branch above
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per versioned namespace listed below)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the selected version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (the buffers are taken by reference, nevt is the number of events)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds selects whether the channel ids buffer is passed to the calculation)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose is true, the choice is also printed)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Define MGONGPU_CPPFAT when building the fat binary backend (BACKEND=cppfat): Bridge.h tests this macro to instantiate MatrixElementKernelHostFat
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from the common flags in one pass; each variant adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled once per SIMD level, each time in a versioned
+# namespace; the other library objects are compiled once with the baseline flags (-march=x86-64) and the dispatcher (MatrixElementKernels.cc) calls the variants.
+# NOTE(review): no object in the cppfat list is built from CPPProcess.cc without MGONGPU_SIMD_NAMESPACE - confirm that nothing linked here still needs unversioned mg5amcCpu symbols.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
+cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat: # re-run this makefile for the fat binary backend (BACKEND=cppfat) with USEBUILDDIR=1
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// Fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined on the command line (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// the preprocessor replaces every later occurrence of the token mg5amcCpu by that versioned namespace name.
+// SIMD-sensitive translation units (CPPProcess, color_sum) can then be compiled several times with different -march flags,
+// each build placing its symbols in a distinct versioned namespace, so that a fat binary can dispatch between them at runtime.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled without
+// MGONGPU_SIMD_NAMESPACE and keep the standard mg5amcCpu namespace. The renaming is skipped if MGONGPUCPP_GPUIMPL is defined.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif // MGONGPU_SIMD_NAMESPACE && !MGONGPUCPP_GPUIMPL
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Owning pointer to the host kernel: a MatrixElementKernelHostFat if MGONGPU_CPPFAT is defined, a MatrixElementKernelHost
+    // otherwise; it is declared through the common base class so that the same member can hold either type.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ builds of the cppfat backend (MGONGPU_CPPFAT set, MGONGPUCPP_GPUIMPL unset).
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT // per-SIMD objects are only linked for BACKEND=cppfat: elsewhere these references would be unresolved
+
+// Forward declarations of the four entry points called by the dispatcher, one set per SIMD namespace NS.
+// They are resolved at link time against the per-SIMD objects built with -DMGONGPU_SIMD_NAMESPACE=NS.
+// NB: signatures are meant to mirror the C++ (non-GPU) ones in CPPProcess.h and must be kept in sync by hand.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Extra parameters that exist only in multichannel builds (expanded inside MG5AMC_CPPFAT_FORWARD_DECLARE; empty otherwise)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once: all later calls dispatch on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // Sanity checks (each failure throws std::runtime_error): host-side buffers, consistent event counts, nevt multiple of neppM
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: this check rejects device arrays
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // only whole groups of neppM events are accepted
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // nothing to release by hand: members clean up themselves
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime (if verbose, print the choice to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: there is deliberately no "avx512f"-only branch selecting avx512y. The 512y objects are
+    // built with -march=skylake-avx512, whose instruction set includes AVX512VL: on a CPU that
+    // has AVX512F but not AVX512VL they could execute illegal instructions, so such a CPU must
+    // fall through to the avx2 check below. As any AVX512VL CPU is given avx512z above, the
+    // avx512y build is currently never auto-selected (FIXME: decide when to prefer 512y to 512z).
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: execute CALL (a function call, result discarded) in the per-SIMD namespace matching m_selectedSimd (default: mg5amcCpu_none)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask goodHelMask( CPPProcess::ncomb );
+    // Fill m_couplings first: it is passed as an input to sigmaKin_getGoodHel just below
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), goodHelMask.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), goodHelMask.data(), nevt() ) )
+#endif
+    // Hand the mask to the selected implementation and return its result (cannot use the void-only dispatch macro here)
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx2: return mg5amcCpu_avx2::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::sse4: return mg5amcCpu_sse4::sigmaKin_setGoodHel( goodHelMask.data() );
+      default: return mg5amcCpu_none::sigmaKin_setGoodHel( goodHelMask.data() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs before sigmaKin reads it
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed on when channel ids are not to be used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids cannot be honoured in a build without multichannel support
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is declared only in the multichannel branch above
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // SIMD level selected at runtime in a fat binary: one enumerator per mg5amcCpu_<level> namespace (avx512y/avx512z map to mg5amcCpu_512y/mg5amcCpu_512z)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities once and then forwards every ME computation to that version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if its buffer or nevt checks fail)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the selected level to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time, used by every dispatch afterwards)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depend on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 in ONE filter-out pass over CXXFLAGS; each per-SIMD variant then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// It is compiled only for cppfat C++ builds (MGONGPU_CPPFAT): other builds do not link the per-SIMD objects.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level whose per-SIMD build can run on the current CPU (and print it if verbose is true).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: a CPU with AVX512F but without AVX512VL must NOT select avx512y: the 512y objects are built
+    // with -march=skylake-avx512 (which enables AVX512VL) and -mprefer-vector-width=256, so they may
+    // contain AVX512VL instructions that such a CPU cannot execute (illegal instruction at runtime).
+    // Such a CPU falls through to the avx2 check below. As every AVX512VL CPU is dispatched to
+    // avx512z above, avx512y is never auto-selected here (it remains a valid dispatch target).
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime detection, always use the sse4 build (covers ARM NEON/Power VSX)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip AVXFLAGS (which is -march=x86-64 for cppfat) and any -DMGONGPU_PVW512 from per-SIMD compilations; add per-SIMD -march
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a MatrixElementKernelHost
+    // otherwise: the common base class pointer supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds: see the preprocessor guard just below.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces (fat binary builds only: no other build defines these symbols).
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Multichannel-only parameters, spliced (with their trailing commas) into the declarations of the macro above
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE // the helper macros are only needed for the five declarations above
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // detected once here, then used by every dispatch
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    // Sanity checks (std::runtime_error on failure): host-only buffers, each sized for nevt events
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // nevt must also be a whole number of neppM-sized momenta pages
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // nothing to release beyond the members
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime.
+  // If verbose is true, also print the selected level to stdout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // The 512y and 512z objects are both built with -march=skylake-avx512, which enables AVX512VL:
+    // a CPU with avx512f but without avx512vl (e.g. Knights Landing) can execute neither of them
+    // and must fall through to avx2 below. Hosts with avx512vl select 512z, hence 512y is never
+    // selected automatically (no CPU feature flag tells 512y hosts apart from 512z hosts).
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: run CALL (an unqualified function call) in the namespace matching m_selectedSimd; any return value is discarded
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask goodHelMask( CPPProcess::ncomb ); // one flag per helicity combination
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifndef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), goodHelMask.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), goodHelMask.data(), nevt() ) )
+#endif
+    // The dispatch macro discards return values: pick the namespace by hand to return nGoodHel
+    const bool* const pGoodHel = goodHelMask.data();
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( pGoodHel );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( pGoodHel );
+      case SimdLevel::avx2:    return mg5amcCpu_avx2::sigmaKin_setGoodHel( pGoodHel );
+      case SimdLevel::sse4:    return mg5amcCpu_sse4::sigmaKin_setGoodHel( pGoodHel );
+      default:                 return mg5amcCpu_none::sigmaKin_setGoodHel( pGoodHel );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fills m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids can only be used in multichannel builds
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds only exists in multichannel builds - confirm this flag implies MGONGPU_SUPPORTS_MULTICHANNEL
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // SIMD level selected at runtime in a fat binary: one value per per-SIMD namespace (none -> mg5amcCpu_none ... avx512z -> mg5amcCpu_512z)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error on device buffers or nevt mismatches)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without multichannel support)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, also print the choice to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Fat binary backend only: add -DMGONGPU_CPPFAT to the C++ compiler flags
+ifneq ($(filter cppfat,$(BACKEND)),)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Drop -march (which is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS once; each variant below appends its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) # CPPProcess.cc: five rules, one per SIMD level
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) # color_sum.cc: same five variants
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# In the fat binary backend (cppfat) the library holds one CPPProcess and one color_sum object per
+# SIMD level (each in its own versioned namespace) in place of the single-SIMD pair of objects;
+# MatrixElementKernels (the dispatcher), BridgeKernels and CrossSectionKernels are still built once.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(addprefix $(BUILDDIR)/,CPPProcess_none_cpp.o color_sum_none_cpp.o)
+cppfat_objects_sse4=$(addprefix $(BUILDDIR)/,CPPProcess_sse4_cpp.o color_sum_sse4_cpp.o)
+cppfat_objects_avx2=$(addprefix $(BUILDDIR)/,CPPProcess_avx2_cpp.o color_sum_avx2_cpp.o)
+cppfat_objects_512y=$(addprefix $(BUILDDIR)/,CPPProcess_512y_cpp.o color_sum_512y_cpp.o)
+cppfat_objects_512z=$(addprefix $(BUILDDIR)/,CPPProcess_512z_cpp.o color_sum_512z_cpp.o)
+cppfat_objects_all=$(foreach simd,none sse4 avx2 512y 512z,$(cppfat_objects_$(simd)))
+cxx_objects_lib=$(addprefix $(BUILDDIR)/,MatrixElementKernels_cpp.o BridgeKernels_cpp.o CrossSectionKernels_cpp.o) $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat: # re-invoke this makefile with BACKEND=cppfat and USEBUILDDIR=1
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// Fat binary builds compile the SIMD-sensitive translation units (CPPProcess, color_sum) several times,
+// once per -march setting, passing -DMGONGPU_SIMD_NAMESPACE=<name> (e.g. mg5amcCpu_avx2) on each command line.
+// The macro below then renames mg5amcCpu to <name> in those translation units, so that every build
+// places its symbols in a distinct versioned namespace and all of them can coexist for runtime dispatch.
+// Translation units that are not SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are built
+// without MGONGPU_SIMD_NAMESPACE and keep the standard mg5amcCpu namespace. GPU builds are never renamed.
+#if !defined( MGONGPUCPP_GPUIMPL ) && defined( MGONGPU_SIMD_NAMESPACE )
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a MatrixElementKernelHost otherwise:
+    // the type is chosen at compile time, hence the pointer to their common base class.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifndef MGONGPU_CPPFAT
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else // fat binary build: use the kernel that picks its SIMD implementation at runtime
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ fat binary builds (BACKEND=cppfat, i.e. MGONGPU_CPPFAT defined).
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version (other builds do not link these namespaces).
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once here; the choice is printed to std::cout
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // sanity checks: reject device buffers and inconsistent event counts by throwing std::runtime_error
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the event count must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can run (if verbose, also print the choice to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // The 512y and 512z objects are both built with -march=skylake-avx512, so either one needs the full
+    // AVX512 F/CD/BW/DQ/VL feature set: a CPU with AVX512F only must fall through to AVX2 (not to 512y).
+    // As 512y has the same CPU requirements as 512z, it is never auto-selected by this function.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime CPU probing is done, the sse4 variant is always selected (covers ARM NEON/Power VSX)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a complete switch on m_selectedSimd that invokes CALL in the matching per-SIMD namespace (no trailing semicolon needed)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the value of sigmaKin_setGoodHel from the selected SIMD namespace
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // temporary helicity mask sized with CPPProcess::ncomb
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // m_gs -> m_couplings
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // explicit switch instead of MG5AMC_CPPFAT_DISPATCH because the return value must be captured
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; // SimdLevel::none and any unexpected value
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // runs sigmaKin from the selected SIMD namespace on all nevt() events
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // m_gs -> m_couplings
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed when useChannelIds is false
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids are only accepted in MGONGPU_SUPPORTS_MULTICHANNEL builds
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the MULTICHANNEL branch above - confirm CHANNELID_DEBUG implies MULTICHANNEL
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per mg5amcCpu_<level> namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error on device buffers or inconsistent nevt)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events (must be a multiple of neppM)
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false unless MGONGPU_SUPPORTS_MULTICHANNEL is defined)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, also print the selected level to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time); it picks the per-SIMD namespace used by the compute methods
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from the common flags in one pass; each variant below adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT defined) and a MatrixElementKernelHost
+    // otherwise; the choice is made at compile time, and the common base class pointer supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds, as enforced by the preprocessor guard below.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT // cppfat only: the per-SIMD objects referenced below are built only for BACKEND=cppfat
+
+// Forward declarations for per-SIMD namespaces (compiling them into other backends would leave unresolved symbols in the library).
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameter lists differ between builds (defined here, expanded only when the macro above is invoked below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once here; later calls only dispatch on the stored value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // sanity checks (std::runtime_error on failure): momenta, matrixElements and channelIds must be host buffers sized for nevt
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the event count must fill whole AOSOA pages of neppM events
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can actually execute (evaluated at runtime).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    __builtin_cpu_init(); // only mandatory when called before static constructors have run, harmless otherwise
+    // Both AVX512 builds (512y, 512z) use -march=skylake-avx512, i.e. they need F+CD+VL+BW+DQ and not just AVX512F:
+    // a CPU lacking any of these must fall through to avx2 (512y is therefore never auto-selected below).
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512vl" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: execute CALL inside the per-SIMD namespace matching m_selectedSimd (any level not listed, i.e. none, uses mg5amcCpu_none)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the value reported by the selected build's sigmaKin_setGoodHel
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // mask buffer sized to ncomb, passed below as isGoodHel
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // m_couplings is an input of sigmaKin_getGoodHel below
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // spelled out by hand: MG5AMC_CPPFAT_DISPATCH discards the callee's return value
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; // SimdLevel::none
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // runs sigmaKin of the SIMD build chosen at construction
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // m_couplings is an input of sigmaKin below
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // a null pointer is passed when channel ids are not to be used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // this sigmaKin signature has no channel id argument
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is declared only in the MGONGPU_SUPPORTS_MULTICHANNEL branch - confirm both macros are always set together
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace listed below)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if its buffer/nevt sanity checks fail)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events (must be a multiple of neppM)
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds selects whether the channel id buffer is passed on to the calculation)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (static; if verbose, the choice is printed to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time, then used for every dispatch)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Base flags for per-SIMD compilations: CXXFLAGS minus any -march option (-march=x86-64 for cppfat) and minus -DMGONGPU_PVW512
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds of the cppfat backend (MGONGPU_CPPFAT):
+// in any other build the per-SIMD namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z) are not linked in, so referencing them would leave undefined symbols.
+// It forward-declares those namespaces and implements MatrixElementKernelHostFat, which detects
+// the host CPU at construction time and delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Multichannel-dependent parameters (helper macros: only expanded when MG5AMC_CPPFAT_FORWARD_DECLARE is invoked below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                                          const BufferGs& gs,                   // input: gs for alphaS
+                                                          const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                                          const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                                          const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                                          BufferMatrixElements& matrixElements, // output: matrix elements
+                                                          BufferSelectedHelicity& selhel,       // output: helicity selection
+                                                          BufferSelectedColor& selcol,          // output: color selection
+                                                          const size_t nevt )                   // input: number of events
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once: every later call dispatches on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // throws std::runtime_error if a checked buffer is on the device, disagrees with nevt, or nevt is not a multiple of neppM
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely execute (printing the choice if verbose).
+  // The avx512y and avx512z objects are both built with -march=skylake-avx512, hence both need the
+  // F+CD+BW+DQ+VL feature set: a CPU offering AVX512F without the rest must fall back to avx2.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    __builtin_cpu_init(); // only strictly required when running before static constructors, harmless otherwise
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 ) // NB: avx512y needs the very same feature set, so it is never the automatic choice
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: execute CALL in the per-SIMD namespace that matches m_selectedSimd (any return value is discarded)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the result of sigmaKin_setGoodHel for the selected SIMD level
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // sized with CPPProcess::ncomb
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // dispatched by hand: MG5AMC_CPPFAT_DISPATCH would discard the return value
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // refill m_couplings from m_gs on every call
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr when channel ids are not to be used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // this build's sigmaKin takes no channel ids
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds only exists under MGONGPU_SUPPORTS_MULTICHANNEL - confirm that MGONGPU_CHANNELID_DEBUG implies it
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // SIMD level selected at runtime in a fat binary: each value maps to one per-SIMD namespace (mg5amcCpu_none, _sse4, _avx2, _512y, _512z)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    // (throws std::runtime_error if a checked buffer is on the device or disagrees with nevt)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events (must be a multiple of neppM)
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the selected level to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Define MGONGPU_CPPFAT when building the fat binary backend (Bridge.h then instantiates MatrixElementKernelHostFat)
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in a single pass (no duplicated flags); each per-SIMD variant then appends its own flags
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a MatrixElementKernelHost otherwise:
+    // the common base class pointer supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds: see the preprocessor guard just below.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT // cppfat builds only: other builds do not link the per-SIMD objects
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Multichannel-dependent parameter lists (defined after the macro above, but before its first expansion below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // runtime CPU detection, done once here (prints the selected level)
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // sanity checks: host-only buffers, consistent nevt, nevt multiple of neppM (throws std::runtime_error otherwise)
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout (NOTE(review): neppM as seen by this translation unit - confirm it matches the selected per-SIMD build)
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level whose build flags (see cudacpp.mk) the current CPU can execute; optionally print the choice.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    __builtin_cpu_init(); // initialise CPU feature detection (GCC requires this if we run before static constructors)
+    // Both AVX512 builds (512z and 512y) use -march=skylake-avx512, i.e. need AVX512 F+CD+BW+DQ+VL: a CPU with AVX512F only
+    // must fall back to avx2. As both need the same ISA, 512y cannot be selected from CPU flags; 512z is preferred as before.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) &&
+      __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) && __builtin_cpu_supports( "fma" ) && __builtin_cpu_supports( "bmi2" ) ) // avx2 objects use -march=haswell
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a switch on m_selectedSimd that makes CALL in the matching per-SIMD namespace (default: none)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    // Fill the couplings, flag the good helicities in a host mask, then register that mask in the selected SIMD build
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    const bool* const goodHelMask = hstIsGoodHel.data(); // the return value of sigmaKin_setGoodHel is nGoodHel
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( goodHelMask );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( goodHelMask );
+      case SimdLevel::avx2:    return mg5amcCpu_avx2::sigmaKin_setGoodHel( goodHelMask );
+      case SimdLevel::sse4:    return mg5amcCpu_sse4::sigmaKin_setGoodHel( goodHelMask );
+      default:                 return mg5amcCpu_none::sigmaKin_setGoodHel( goodHelMask );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids are only accepted in multichannel builds
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in multichannel builds
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if they are inconsistent)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the selected level to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (-march=x86-64 for cppfat) and -DMGONGPU_PVW512 from the common flags once; each variant adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds of the cppfat backend (MGONGPU_CPPFAT):
+// it references the per-SIMD namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z), whose objects are only built and linked for BACKEND=cppfat.
+// It implements MatrixElementKernelHostFat, which detects the host CPU at construction time
+// and delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                                          const BufferGs& gs,                   // input: gs for alphaS
+                                                          const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                                          const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                                          const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                                          BufferMatrixElements& matrixElements, // output: matrix elements
+                                                          BufferSelectedHelicity& selhel,       // output: helicity selection
+                                                          BufferSelectedColor& selcol,          // output: color selection
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // detect the SIMD level once, at construction time
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // Sanity checks: throw std::runtime_error if a buffer is on the device or if event counts are inconsistent
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // nevt must be a multiple of neppM
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level usable on the current CPU at runtime (if verbose, print the choice to stdout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: SimdLevel::avx512y is intentionally never auto-selected here. Its objects are built with
+    // -march=skylake-avx512 (like those of avx512z), which enables AVX512VL and other extensions:
+    // they must not be dispatched to on a CPU that only reports AVX512F (this could fail with an
+    // illegal instruction). CPUs with AVX512VL already selected avx512z above; CPUs with AVX512F
+    // but without AVX512VL fall through to the avx2 check below.
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from the common flags in a single pass; each variant then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds of the cppfat backend (-DMGONGPU_CPPFAT).
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+// (Other backends do not link the per-SIMD objects: compiling this section there would leave undefined symbols.)
+#if defined MGONGPU_CPPFAT && !defined MGONGPUCPP_GPUIMPL
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Multichannel-only parameters (each expansion ends with a comma); they expand to nothing without MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+// Declare the same set of functions once in each of the five per-SIMD namespaces
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+// The helper macros are only needed for the declarations above: undefine them
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once per kernel instance (prints one INFO line)
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: the check rejects device arrays
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the number of events must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // nothing to release explicitly: the buffers are data members
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can execute (if verbose, print the selected level to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // NB: the 512z and 512y objects are both built with -march=skylake-avx512, whose instruction
+    // set includes AVX512VL on top of AVX512F. A CPU that reports AVX512F without AVX512VL
+    // (e.g. Knights Landing) cannot safely run either of them (illegal instruction): it must
+    // fall through to the avx2 branch below. With this policy (512z whenever AVX512VL is
+    // available) the avx512y level is never selected automatically.
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime probing, always use the sse4 level (covers ARM NEON/Power VSX)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: invoke CALL in the per-SIMD namespace matching m_selectedSimd (any other value falls back to mg5amcCpu_none)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    // Host-side helicity mask with CPPProcess::ncomb entries, filled by sigmaKin_getGoodHel below
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    // Hand the mask to the selected implementation and return its result (the number of good helicities)
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() );
+      case SimdLevel::avx2:    return mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() );
+      case SimdLevel::sse4:    return mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() );
+      default:                 return mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // recompute the couplings from the current gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* const channelIdsOrNull = useChannelIds ? m_channelIds.data() : nullptr; // nullptr when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), channelIdsOrNull, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( !useChannelIds ); // channel ids are only accepted in multichannel builds
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( channelIdsOrNull, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // SIMD levels that a fat binary can select at runtime (each one is served by its own per-SIMD namespace, e.g. avx2 by mg5amcCpu_avx2)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if the buffers or nevt are inconsistent)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (this is a host kernel: always returns false)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, also print the selected level to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depend on alphaS QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from a single copy of CXXFLAGS; each variant below adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ fat binary builds (MGONGPU_CPPFAT defined, MGONGPUCPP_GPUIMPL not defined):
+// other C++ builds do not link the per-SIMD objects, so the references below would remain unresolved.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, mg5amcCpu_sse4,
+// mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements MatrixElementKernelHostFat, which
+// detects the host CPU at construction time and delegates all computations to the best available SIMD version.
+
+#if defined MGONGPU_CPPFAT && !defined MGONGPUCPP_GPUIMPL
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
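+// The multichannel-dependent parameters differ between builds with and without MGONGPU_SUPPORTS_MULTICHANNEL (these helper macros are only needed when MG5AMC_CPPFAT_FORWARD_DECLARE is expanded below)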
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                                          const BufferGs& gs,                   // input: gs for alphaS
+                                                          const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                                          const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                                          const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                                          BufferMatrixElements& matrixElements, // output: matrix elements
+                                                          BufferSelectedHelicity& selhel,       // output: helicity selection
+                                                          BufferSelectedColor& selcol,          // output: color selection
+                                                          const size_t nevt )                   // input: number of events
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the host CPU once, printing the selected level
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // sanity checks: the buffers tested below must be host arrays holding exactly nevt events (throws std::runtime_error otherwise)
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: this is a host kernel
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // nevt must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime (if verbose, print the selected level to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: there is deliberately no "avx512f"-only branch selecting avx512y here. The 512y objects are built
+    // with -march=skylake-avx512 like the 512z ones, so they need AVX512VL too: on a host that reports AVX512F
+    // but not AVX512VL (e.g. Knights Landing) they would die with an illegal instruction. Such hosts fall
+    // through to avx2 below. As a consequence avx512y is never auto-selected (a host able to run it gets
+    // avx512z from the branch above).
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace (CALL is an unqualified function call; any SimdLevel without an explicit case uses mg5amcCpu_none)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // buffer of ncomb flags, filled by sigmaKin_getGoodHel below
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // dispatched by hand (not via MG5AMC_CPPFAT_DISPATCH) because the return value is needed
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel; // value returned by sigmaKin_setGoodHel of the selected SIMD namespace
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed to sigmaKin when useChannelIds is false
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids are only supported in builds with MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in MGONGPU_SUPPORTS_MULTICHANNEL builds
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace: mg5amcCpu_none, _sse4, _avx2, _512y, _512z)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if a checked buffer is on the device or has a different nevt)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events (forwarded to NumberOfEvents)
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, the selected level is printed to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) chooses among them at runtime; all variants are linked into the same library.
+ifeq ($(BACKEND),cppfat)
+# Strip every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from the common flags in a single pass; each per-SIMD variant then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) calls into the version selected at runtime.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) or a MatrixElementKernelHost otherwise;
+    // the common base class pointer supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ fat binary builds (MGONGPU_CPPFAT defined and
+// MGONGPUCPP_GPUIMPL not defined): the per-SIMD symbols referenced below (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) are only built for the cppfat
+// backend, so compiling it in other builds would leave undefined references in the library.
+// It implements MatrixElementKernelHostFat, which detects the host CPU at construction time
+// and delegates all computations to the best available SIMD version.
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
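+// The multichannel-dependent parameter lists used inside MG5AMC_CPPFAT_FORWARD_DECLARE (defining them after that macro is fine: they are only expanded where it is invoked below)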
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+  // Constructor: forwards the buffers to MatrixElementKernelBase, selects the SIMD level once, sizes the work buffers with nevt and validates the inputs (throws std::runtime_error on failure)
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // detected once here, and reported on stdout
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: a device array is what is rejected here
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the momenta AOSOA layout needs whole groups of neppM events
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // no explicit cleanup is done in this class
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely run (on x86 via __builtin_cpu_supports); if verbose, print the selection to stdout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: there is deliberately no "avx512f"-only branch selecting avx512y here. The 512y objects are
+    // built with -march=skylake-avx512 (like 512z), which lets the compiler emit AVX512VL instructions:
+    // on a CPU with AVX512F but without AVX512VL these could fail with an illegal instruction. Such CPUs
+    // therefore fall through to the avx2 check below. As any CPU with AVX512VL is given avx512z above,
+    // SimdLevel::avx512y is never auto-selected by this function.
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: always select sse4 (intended to cover ARM NEON/Power VSX). NOTE(review): cudacpp.mk builds the sse4 objects with -march=nehalem, which is x86-only - confirm non-x86 support
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace (expands to a complete switch statement, so any value returned by CALL is discarded)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    // Helicity mask sized with CPPProcess::ncomb: written by sigmaKin_getGoodHel, then read by sigmaKin_setGoodHel
+    HostBufferHelicityMask goodHelMask( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), goodHelMask.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), goodHelMask.data(), nevt() ) )
+#endif
+    // The number of good helicities is a return value, which the statement-only dispatch macro would drop: switch explicitly
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx2: return mg5amcCpu_avx2::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::sse4: return mg5amcCpu_sse4::sigmaKin_setGoodHel( goodHelMask.data() );
+      default: return mg5amcCpu_none::sigmaKin_setGoodHel( goodHelMask.data() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  { // Runs computeDependentCouplings and then sigmaKin through the SIMD build chosen in the constructor (m_selectedSimd)
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed to sigmaKin when channel ids are not requested
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids cannot be requested in builds without multichannel support
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared under MGONGPU_SUPPORTS_MULTICHANNEL - confirm that MGONGPU_CHANNELID_DEBUG implies it
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // SIMD level selected at runtime in a fat binary (each value is dispatched to one of the per-SIMD namespaces listed below)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if the buffer checks against nevt fail)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds selects whether the channel ids buffer is passed on to sigmaKin)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, also print the selection to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in a single filter-out pass, so that every other flag is kept exactly once; then add the per-SIMD flags
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled for C++ (non-GPU) builds only: it is enclosed in a preprocessor guard on MGONGPUCPP_GPUIMPL.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT // the per-SIMD objects referenced below are only built and linked for BACKEND=cppfat
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once: every later call dispatches on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // sanity checks: the buffers tested below must be host arrays holding exactly nevt events (throws std::runtime_error otherwise)
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the event count must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level usable on the current CPU at runtime (verbose=true prints the selected level to stdout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both the 512y and 512z objects are built with -march=skylake-avx512 (F+CD+BW+DQ+VL): only dispatch to them if the
+    // CPU has the whole set. As both need the same ISA, 512z is preferred and a partial AVX512 CPU falls back to avx2.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+                                  __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) &&
+                                  __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: always use the sse4 build (covers ARM NEON/Power VSX); 'none' is never selected on this path
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the value reported by the selected variant's sigmaKin_setGoodHel
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // helicity mask sized by CPPProcess::ncomb
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // m_gs in, m_couplings out
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // MG5AMC_CPPFAT_DISPATCH discards return values, hence the explicit switch below
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; // SimdLevel::none
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // runs couplings + sigmaKin in the selected SIMD variant
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // m_gs in, m_couplings out
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // pass nullptr when channel ids are not to be used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids are only accepted in builds with MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the MGONGPU_SUPPORTS_MULTICHANNEL branch above
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip the dispatcher's -march (x86-64 for cppfat) and any -DMGONGPU_PVW512 from CXXFLAGS in one pass; each variant then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds, i.e. when MGONGPUCPP_GPUIMPL is not defined.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#ifndef MGONGPUCPP_GPUIMPL
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // the SIMD level is detected once, here, and reused by every dispatch
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // Sanity checks: throw std::runtime_error if a checked buffer is on the device or disagrees on nevt
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // nevt must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect (at runtime) the best SIMD level that the current CPU can safely execute; print the choice if verbose.
+  // The 512y/512z objects are built with -march=skylake-avx512: require its full AVX512 set (F/CD/BW/DQ/VL).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    __builtin_cpu_init(); // harmless if already initialised; needed if this runs before static constructors
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: CPUs with AVX512F but without the full skylake-avx512 set must not run the 512y/512z objects: use avx2 or below
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime CPU probing is done here, the sse4 build is always selected
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the value reported by sigmaKin_setGoodHel
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // buffer of size ncomb, passed as isGoodHel below
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fills m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // sigmaKin_setGoodHel returns a value, so it needs its own switch (the dispatch macro discards results)
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; // SimdLevel::none
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // useChannelIds must be false in builds without multichannel support
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fills m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG // NOTE(review): pChannelIds is only declared when MGONGPU_SUPPORTS_MULTICHANNEL is defined - confirm the two are always set together
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in one pass; each per-SIMD variant then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a MatrixElementKernelHost
+    // otherwise: the common base class pointer supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ fat binary builds (MGONGPUCPP_GPUIMPL undefined, MGONGPU_CPPFAT defined):
+// the per-SIMD objects defining the symbols it references are linked only in those builds.
+// It forward-declares the per-SIMD functions (namespaces mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z) and implements MatrixElementKernelHostFat, which detects the host CPU
+// at construction time and delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Multichannel-dependent parameter lists used inside MG5AMC_CPPFAT_FORWARD_DECLARE (expanded only when that macro is invoked below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once, here (and print the selected level)
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // Validate the buffers (throws std::runtime_error): host arrays are required, all sized for the same nevt
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the momenta AOSOA layout needs whole groups of neppM events
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime (if verbose, print the choice to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both AVX512 variants are built with -march=skylake-avx512 (see cudacpp.mk): require its whole AVX512 subset
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // AVX512F alone is not enough for the 512y build (its 256-bit EVEX code needs AVX512VL): such CPUs use the AVX2 or lower
+    // levels below, and SimdLevel::avx512y is never auto-selected. NOTE(review): cudacpp.mk tags 512y as the gcc default - revisit?
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: a switch on m_selectedSimd that runs CALL in the matching per-SIMD namespace (default: mg5amcCpu_none); any result of CALL is discarded
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns what sigmaKin_setGoodHel returns for the selected SIMD level
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // filled by sigmaKin_getGoodHel, then passed to sigmaKin_setGoodHel
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // written out by hand: MG5AMC_CPPFAT_DISPATCH cannot capture a return value
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // runs sigmaKin of the selected SIMD level on all nevt() events
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids can only be used in multichannel builds
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG // NOTE(review): pChannelIds is declared only in the MGONGPU_SUPPORTS_MULTICHANNEL branch above
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if a checked buffer is on the device or has a different nevt)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the choice to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Base flags for per-SIMD compilations: CXXFLAGS once, without any -march (AVXFLAGS is -march=x86-64 for cppfat) or -DMGONGPU_PVW512; each variant below adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a MatrixElementKernelHost otherwise:
+    // the pointer to their common base class supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds: see the preprocessor guard below.
+// It provides forward declarations of the functions in the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces: only in fat builds (MGONGPU_CPPFAT), as no other build provides these symbols.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds (these macros are expanded where the one above is used)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once (and print the choice): later calls dispatch on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // sanity checks (throw std::runtime_error): buffers must be host buffers with this kernel's nevt, and nevt a multiple of neppM
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // no explicit cleanup is done by this class
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can execute (if verbose, print the choice to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: there is deliberately no separate "avx512f" test selecting avx512y. The 512y objects are
+    // built with -march=skylake-avx512 too (plus -mprefer-vector-width=256), so they need exactly
+    // the CPU features tested above: a CPU with AVX512F but without AVX512VL (e.g. Knights Landing)
+    // can run neither 512z nor 512y and must fall through to avx2. Choosing 512y over 512z would
+    // need a criterion other than CPU feature bits (this is not attempted here).
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a switch on m_selectedSimd that invokes CALL in the matching per-SIMD namespace (any result of CALL is discarded)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the value of sigmaKin_setGoodHel from the selected SIMD namespace
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // filled by sigmaKin_getGoodHel, then passed to sigmaKin_setGoodHel
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // the dispatch macro discards return values: the switch is spelled out below to keep this one
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // recompute couplings, then run sigmaKin in the selected SIMD namespace
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // pass nullptr to sigmaKin when useChannelIds is false
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // without MGONGPU_SUPPORTS_MULTICHANNEL, sigmaKin takes no channel ids
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the multichannel branch above
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if the buffers fail its sanity checks)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the selected level to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time, then used to dispatch every computation)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 in one pass over CXXFLAGS; add per-SIMD flags below
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a
+    // MatrixElementKernelHost otherwise: use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ fat binary builds (MGONGPU_CPPFAT defined, MGONGPUCPP_GPUIMPL undefined):
+// it declares functions from the per-SIMD namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z), whose definitions exist only in the objects built for BACKEND=cppfat, and it
+// implements MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                                          const BufferGs& gs,                   // input: gs for alphaS
+                                                          const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                                          const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                                          const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                                          BufferMatrixElements& matrixElements, // output: matrix elements
+                                                          BufferSelectedHelicity& selhel,       // output: helicity selection
+                                                          BufferSelectedColor& selcol,          // output: color selection
+                                                          const size_t nevt )                   // input: number of events
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // the SIMD level is detected (and printed) once per kernel instance
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // a device buffer is the error for this host kernel
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the number of events must be a multiple of neppM
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can execute, using the compiler's runtime CPU feature checks.
+  // Returns the selected level; if verbose is true, also prints the selection to std::cout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // The 512y and 512z objects are both built with -march=skylake-avx512 (which enables AVX512VL): require avx512vl,
+    // not just avx512f, because a CPU with AVX512F only cannot execute either of them and must fall back to avx2.
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NOTE(review): avx512y is never auto-selected, as CPU feature flags cannot tell it apart from avx512z;
+    // it is the default AVX512 build for gcc in cudacpp.mk, so confirm whether it should be preferred to avx512z.
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86 (no x86 CPU feature checks): always use the sse4 build, which is meant to cover ARM NEON and Power VSX
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the value of sigmaKin_setGoodHel from the selected SIMD build
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // passed to sigmaKin_getGoodHel as its isGoodHel array
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // explicit switch: MG5AMC_CPPFAT_DISPATCH discards the return value of the call
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // runs sigmaKin from the selected SIMD build
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids can only be used if MGONGPU_SUPPORTS_MULTICHANNEL is defined
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared if MGONGPU_SUPPORTS_MULTICHANNEL is defined
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value for each per-SIMD namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if its buffer or nevt checks fail)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose is true, print the selection to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depend on alphaS (QCD)
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip the baseline arch flags (AVXFLAGS is -march=x86-64 for cppfat) from CXXFLAGS in a single pass; each per-SIMD variant then adds its own -march
+CXXFLAGS_NOFAT := $(filter-out -march% -mprefer-vector-width% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Host ME kernel: a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT), a MatrixElementKernelHost otherwise;
+    // the pointer is typed on the common base class MatrixElementKernelBase so that it can own either.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ fat binary builds (BACKEND=cppfat, i.e. -DMGONGPU_CPPFAT):
+// other C++ builds do not link the per-SIMD objects, so the references below would be unresolved.
+// It forward-declares the per-SIMD namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z) and implements MatrixElementKernelHostFat, which detects the
+// host CPU at construction time and delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations of the per-SIMD entry points, one identical set per versioned namespace.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// NB(review): the signatures are meant to mirror the C++ (non-GPU) ones in CPPProcess.h - presumably kept in sync by hand.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Multichannel-dependent parameter lists (expanded only when MG5AMC_CPPFAT_FORWARD_DECLARE is invoked below, so defining them here is early enough)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // detect the CPU once, here (the selection is printed to stdout)
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // nevt must fill a whole number of neppM-event groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // nothing to release by hand
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely run; if verbose, print the selection to stdout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: there is deliberately no "avx512f"-only branch. Both AVX512 objects (512y and 512z) are
+    // built with -march=skylake-avx512, so they may contain AVX512VL/BW/DQ instructions and would
+    // die with an illegal instruction on a CPU that has AVX512F but not AVX512VL: such CPUs fall
+    // through to the narrower levels below. As avx512vl gates both objects and 512z is preferred,
+    // SimdLevel::avx512y is currently never selected here (FIXME: make the 512y/512z choice tunable).
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a complete switch statement that invokes CALL in the per-SIMD namespace matching m_selectedSimd (no trailing ';' needed)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask goodHelMask( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), goodHelMask.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), goodHelMask.data(), nevt() ) )
+#endif
+    // sigmaKin_setGoodHel returns the number of good helicities: dispatch it by hand,
+    // because MG5AMC_CPPFAT_DISPATCH discards the value of the call it wraps.
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::avx2: return mg5amcCpu_avx2::sigmaKin_setGoodHel( goodHelMask.data() );
+      case SimdLevel::sse4: return mg5amcCpu_sse4::sigmaKin_setGoodHel( goodHelMask.data() );
+      default: return mg5amcCpu_none::sigmaKin_setGoodHel( goodHelMask.data() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // recompute the couplings from the gs on every call
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed to sigmaKin if channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids can only be requested in builds with multichannel support
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the multichannel branch above - confirm this debug option implies multichannel
+#endif // MGONGPU_CHANNELID_DEBUG
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the selected version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error on inconsistent buffers)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds: pass the channel ids buffer to the ME calculation)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the selection to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depend on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif // MGONGPU_SUPPORTS_MULTICHANNEL
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from the common flags in a single filter-out pass (so each remaining flag is kept exactly once); then add the per-SIMD flags
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Host ME kernel, owned through the common base class: MatrixElementKernelHostFat
+    // if MGONGPU_CPPFAT is defined (fat binary builds), MatrixElementKernelHost otherwise.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ fat binary builds (MGONGPU_CPPFAT defined, no GPU):
+// the per-SIMD namespaces it calls into (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z) are only built and linked for BACKEND=cppfat. It implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // detected once here; verbose=true also prints the chosen level to std::cout
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    // Sanity checks: this is a host kernel, so all buffers it checks must be host buffers with a matching number of events
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely execute at runtime (if verbose, print the choice to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both the 512y and 512z objects are built with -march=skylake-avx512 (AVX512 F+CD+BW+DQ+VL): require the full
+    // feature set, as a CPU with AVX512F only would hit illegal instructions in either of them and must fall back to avx2.
+    // NOTE(review): 512y and 512z need the same ISA, so cpuid alone cannot rank them; 512z keeps priority as before - confirm.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) &&
+                                  __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime detection, always use the sse4 build (the 128-bit level, intended to cover ARM NEON/Power VSX)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // filled by sigmaKin_getGoodHel, then passed to sigmaKin_setGoodHel
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // sigmaKin_setGoodHel returns a value, hence the explicit switch below instead of MG5AMC_CPPFAT_DISPATCH
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; // SimdLevel::none (and any unexpected value)
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed to sigmaKin when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids can only be used in builds with MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds only exists with MGONGPU_SUPPORTS_MULTICHANNEL - confirm the two macros are always defined together
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // SIMD level selected at runtime in a fat binary; each value maps to one per-SIMD namespace (mg5amcCpu_none, _sse4, _avx2, _512y, _512z)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, also print the choice to std::cout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip the dispatcher's own SIMD flags (-march=x86-64 for cppfat, and any -DMGONGPU_PVW512) in one pass; then add the per-SIMD -march
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Own the host kernel through its base class: this is a MatrixElementKernelHostFat in fat binary
+    // builds (MGONGPU_CPPFAT defined) and a MatrixElementKernelHost in all other C++ builds.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds: see the preprocessor guard below.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+// (MGONGPU_CPPFAT is required because the per-SIMD symbols declared below are only linked in BACKEND=cppfat builds)
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds (these helper macros are only expanded when MG5AMC_CPPFAT_FORWARD_DECLARE is invoked below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // the SIMD level is chosen once, here, and printed to stdout
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // Sanity checks: throw std::runtime_error if a checked buffer is on the device or its nevt differs from this kernel's nevt
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout; NOTE(review): this is the neppM seen by the dispatcher TU - confirm it matches the neppM of the selected per-SIMD kernels
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can execute, checked from the widest level to the narrowest.
+  // If verbose is true, the selected level is also printed to stdout. Returns the selected SimdLevel.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    __builtin_cpu_init(); // harmless if the CPU model was already probed; needed if this runs before static constructors
+    // The 512y and 512z objects are both built with -march=skylake-avx512, which lets the compiler emit AVX512F/CD/BW/DQ/VL
+    // instructions: all five features must be present (avx512f alone would lead to an illegal instruction at runtime).
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 ) // NOTE(review): SimdLevel::avx512y is never auto-selected - confirm that 512z is the preferred default
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip AVXFLAGS (-march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in a single pass (no duplicated flags); each variant below adds its own -march
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Holds a MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT) and a MatrixElementKernelHost otherwise:
+    // the pointer is typed as the common base class so that the same member supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds of the fat binary backend, i.e. when MGONGPU_CPPFAT
+// is defined: it references the per-SIMD namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+// mg5amcCpu_512y, mg5amcCpu_512z), which are only built for BACKEND=cppfat and would be undefined symbols elsewhere.
+// It forward-declares the per-SIMD entry points and implements MatrixElementKernelHostFat, which detects
+// the host CPU at construction time and delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations of the four entry points that the dispatcher calls in each per-SIMD namespace NS.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures are meant to mirror the C++ (non-GPU) versions declared in CPPProcess.h (NOTE(review): a mismatch would only surface as a link error).
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// Multichannel-dependent parameter lists, spliced into the declarations above (defined after the macro that uses them, but before its first expansion below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once per instance (prints the chosen level to stdout)
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  { // sanity checks only: throws std::runtime_error if a checked buffer is on the device or has a different nevt
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: this is a host kernel
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the number of events must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default; // nothing to release explicitly: members clean up after themselves
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can execute (if verbose, print the choice to stdout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // The 512y and 512z objects are both built with -march=skylake-avx512, which lets the compiler emit
+    // AVX512F/CD/BW/DQ/VL instructions: require all five before choosing either of them. AVX512F alone is not
+    // enough (the 256-bit 512y code needs AVX512VL), so such CPUs use avx2; avx512y is never auto-selected here.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) &&
+                                  __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: run CALL in the per-SIMD namespace chosen by m_selectedSimd (expands to a complete switch statement, so no trailing semicolon is needed; the result of CALL is discarded)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns nGoodHel as reported by the selected sigmaKin_setGoodHel
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // one flag per helicity combination (ncomb in total)
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // explicit switch below (not MG5AMC_CPPFAT_DISPATCH) because the return value of the call is needed
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; // SimdLevel::none
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // useChannelIds: pass m_channelIds to sigmaKin (true) or nullptr (false)
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids cannot be used in a build without multichannel support
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG // NOTE(review): pChannelIds is only declared when MGONGPU_SUPPORTS_MULTICHANNEL is defined - confirm the two flags are always set together
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary (one value per per-SIMD namespace)
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if its sanity checks on the buffers or on nevt fail)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events (must be a multiple of neppM)
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds selects whether the channel ids buffer or nullptr is passed on)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (if verbose, print the selected level to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in one pass; each per-SIMD variant then appends its own flags
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled five times, once per SIMD level,
+# each into its own versioned namespace; all other objects are compiled once with the baseline x86-64 flags (no SIMD).
+# NOTE(review): this list contains no object defining the plain mg5amcCpu:: CPPProcess/color_sum symbols - confirm that nothing linked against the library still needs them.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu as a preprocessor macro, so that every later occurrence of that token in a SIMD-sensitive translation unit (CPPProcess, color_sum) names the versioned namespace.
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif // MGONGPU_SIMD_NAMESPACE && !MGONGPUCPP_GPUIMPL
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // Host kernel: MatrixElementKernelHostFat in fat binary builds (MGONGPU_CPPFAT), MatrixElementKernelHost otherwise;
+    // a pointer to the common base class MatrixElementKernelBase supports both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#ifndef MGONGPUCPP_GPUIMPL
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
+  namespace NS                                                                                \
+  {                                                                                           \
+    void computeDependentCouplings( const fptype* allgs,                                     \
+                                    fptype* allcouplings,                                    \
+                                    const int nevt );                                        \
+    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
+                               const fptype* allcouplings,                                   \
+                               fptype* allMEs,                                               \
+                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
+                               bool* isGoodHel,                                              \
+                               const int nevt );                                             \
+    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
+    void sigmaKin( const fptype* allmomenta,                                                  \
+                   const fptype* allcouplings,                                               \
+                   const fptype* allrndhel,                                                  \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
+                   fptype* allMEs,                                                           \
+                   int* allselhel,                                                           \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
+                   const int nevt );                                                         \
+  }
+
+// The multichannel-dependent parameters differ between builds
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators,                             \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol,                    \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol,                          \
+  fptype* allNumerators,                   \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probed once here; the compute methods switch on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // message fixed: this guard rejects device buffers
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // the event count must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level whose per-SIMD object code can run on the current CPU (prints the choice if verbose).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both the 512y and 512z objects are built with -march=skylake-avx512: require its full F+CD+BW+DQ+VL feature set.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) &&
+      __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // avx512y needs exactly the same hardware as avx512z (only the preferred vector width differs), so it is never
+    // auto-selected: an AVX512F-only CPU (no VL/BW/DQ) falls through to avx2 instead of hitting illegal instructions.
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
+  switch( m_selectedSimd )                                    \
+  {                                                           \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
+    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
+    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
+    default:                 mg5amcCpu_none::CALL; break;    \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Drop every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 in a single filter-out pass; add per-SIMD flags below
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H

From 5c19a0887bff00c43a8d748ce8f9b0f78d8eb4ad Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Apr 2026 19:22:38 +0000
Subject: [PATCH 2/4] Fix: revert deployed files to original, keep only CODEGEN
 changes; add Parameters_sm per-SIMD compilation, CPPProcess_base_cpp.o

Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/db0d537e-e75b-4d0b-ba4c-b7f11fb41df6

Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com>
---
 .../iolibs/template_files/gpu/cudacpp.mk      |  62 +++++-
 .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../ee_mumu.mad/SubProcesses/cudacpp.mk       |  79 -------
 .../cudacpp/ee_mumu.mad/src/cudacpp_config.mk |   2 +-
 .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h   |  10 -
 .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../ee_mumu.sa/SubProcesses/cudacpp.mk        |  79 -------
 .../cudacpp/ee_mumu.sa/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h |  10 -
 .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk |  79 -------
 .../cudacpp/gg_tt.mad/src/cudacpp_config.mk   |   2 +-
 epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h  |  10 -
 epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk  |  79 -------
 epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk |   2 +-
 epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h   |  10 -
 .../gg_tt01g.mad/SubProcesses/Bridge.h        |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../gg_tt01g.mad/SubProcesses/cudacpp.mk      |  79 -------
 .../gg_tt01g.mad/src/cudacpp_config.mk        |   2 +-
 .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h  |  10 -
 .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../gg_ttg.mad/SubProcesses/cudacpp.mk        |  79 -------
 .../cudacpp/gg_ttg.mad/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h |  10 -
 .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk |  79 -------
 .../cudacpp/gg_ttg.sa/src/cudacpp_config.mk   |   2 +-
 epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h  |  10 -
 .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../gg_ttgg.mad/SubProcesses/cudacpp.mk       |  79 -------
 .../cudacpp/gg_ttgg.mad/src/cudacpp_config.mk |   2 +-
 .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h   |  10 -
 .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../gg_ttgg.sa/SubProcesses/cudacpp.mk        |  79 -------
 .../cudacpp/gg_ttgg.sa/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h |  10 -
 .../gg_ttggg.mad/SubProcesses/Bridge.h        |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../gg_ttggg.mad/SubProcesses/cudacpp.mk      |  79 -------
 .../gg_ttggg.mad/src/cudacpp_config.mk        |   2 +-
 .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h  |  10 -
 .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../gg_ttggg.sa/SubProcesses/cudacpp.mk       |  79 -------
 .../cudacpp/gg_ttggg.sa/src/cudacpp_config.mk |   2 +-
 .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h   |  10 -
 .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h  |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../gq_ttq.mad/SubProcesses/cudacpp.mk        |  79 -------
 .../cudacpp/gq_ttq.mad/src/cudacpp_config.mk  |   2 +-
 epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h |  10 -
 .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk |  79 -------
 .../cudacpp/gq_ttq.sa/src/cudacpp_config.mk   |   2 +-
 epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h  |  10 -
 .../heft_gg_bb.mad/SubProcesses/Bridge.h      |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../heft_gg_bb.mad/SubProcesses/cudacpp.mk    |  79 -------
 .../heft_gg_bb.mad/src/cudacpp_config.mk      |   2 +-
 .../heft_gg_bb.mad/src/mgOnGpuConfig.h        |  10 -
 .../heft_gg_bb.sa/SubProcesses/Bridge.h       |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../heft_gg_bb.sa/SubProcesses/cudacpp.mk     |  79 -------
 .../heft_gg_bb.sa/src/cudacpp_config.mk       |   2 +-
 .../cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h |  10 -
 .../nobm_pp_ttW.mad/SubProcesses/Bridge.h     |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../nobm_pp_ttW.mad/SubProcesses/cudacpp.mk   |  79 -------
 .../nobm_pp_ttW.mad/src/cudacpp_config.mk     |   2 +-
 .../nobm_pp_ttW.mad/src/mgOnGpuConfig.h       |  10 -
 .../pp_tt012j.mad/SubProcesses/Bridge.h       |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../pp_tt012j.mad/SubProcesses/cudacpp.mk     |  79 -------
 .../pp_tt012j.mad/src/cudacpp_config.mk       |   2 +-
 .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h |  10 -
 .../smeft_gg_tttt.mad/SubProcesses/Bridge.h   |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../smeft_gg_tttt.mad/SubProcesses/cudacpp.mk |  79 -------
 .../smeft_gg_tttt.mad/src/cudacpp_config.mk   |   2 +-
 .../smeft_gg_tttt.mad/src/mgOnGpuConfig.h     |  10 -
 .../smeft_gg_tttt.sa/SubProcesses/Bridge.h    |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../smeft_gg_tttt.sa/SubProcesses/cudacpp.mk  |  79 -------
 .../smeft_gg_tttt.sa/src/cudacpp_config.mk    |   2 +-
 .../smeft_gg_tttt.sa/src/mgOnGpuConfig.h      |  10 -
 .../susy_gg_t1t1.mad/SubProcesses/Bridge.h    |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../susy_gg_t1t1.mad/SubProcesses/cudacpp.mk  |  79 -------
 .../susy_gg_t1t1.mad/src/cudacpp_config.mk    |   2 +-
 .../susy_gg_t1t1.mad/src/mgOnGpuConfig.h      |  10 -
 .../susy_gg_t1t1.sa/SubProcesses/Bridge.h     |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../susy_gg_t1t1.sa/SubProcesses/cudacpp.mk   |  79 -------
 .../susy_gg_t1t1.sa/src/cudacpp_config.mk     |   2 +-
 .../susy_gg_t1t1.sa/src/mgOnGpuConfig.h       |  10 -
 .../susy_gg_tt.mad/SubProcesses/Bridge.h      |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../susy_gg_tt.mad/SubProcesses/cudacpp.mk    |  79 -------
 .../susy_gg_tt.mad/src/cudacpp_config.mk      |   2 +-
 .../susy_gg_tt.mad/src/mgOnGpuConfig.h        |  10 -
 .../susy_gg_tt.sa/SubProcesses/Bridge.h       |   9 +-
 .../SubProcesses/MatrixElementKernels.cc      | 206 +-----------------
 .../SubProcesses/MatrixElementKernels.h       |  60 -----
 .../susy_gg_tt.sa/SubProcesses/cudacpp.mk     |  79 -------
 .../susy_gg_tt.sa/src/cudacpp_config.mk       |   2 +-
 .../cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h |  10 -
 139 files changed, 122 insertions(+), 8358 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index b17c12bff9..b37a5ed719 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -883,6 +883,11 @@ CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
 CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
 CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
 
+# Identify the process-specific Parameters_<model>.cc source file in src/.
+# It must be defined here (before the rules below) so $(PARAMETERS_STEM) is expanded at parse time.
+PARAMETERS_SRC := $(wildcard ../../src/Parameters_*.cc)
+PARAMETERS_STEM := $(basename $(notdir $(PARAMETERS_SRC)))
+
 $(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
 	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
@@ -922,6 +927,42 @@ $(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.bu
 $(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
 	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+# Per-SIMD compilation rules for Parameters_<model>.cc.
+# Parameters_<model> is model-specific (e.g. Parameters_sm.cc) and must be compiled per SIMD level
+# because CPPProcess.cc includes Parameters_<model>.h, which uses the mg5amcCpu namespace macro.
+# When the macro is active, Parameters_<model>::getInstance() etc. end up in the versioned namespace,
+# so the matching definitions must also be compiled into that namespace.
+$(BUILDDIR)/$(PARAMETERS_STEM)_none_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_sse4_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_avx2_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_512y_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_512z_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+# Base (compatibility) objects compiled without MGONGPU_SIMD_NAMESPACE, staying in mg5amcCpu namespace.
+# These provide mg5amcCpu::CPPProcess (for check_sa.cc and param_card reading) and
+# mg5amcCpu::sigmaKin (for MatrixElementKernelHost, the non-fat fallback kernel).
+$(BUILDDIR)/CPPProcess_base_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -c $< -o $@
+
+$(BUILDDIR)/color_sum_base_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -c $< -o $@
 endif
 
 #-------------------------------------------------------------------------------
@@ -939,16 +980,19 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
 
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+# For the fat binary backend (cppfat), CPPProcess, color_sum, and Parameters_<model> are compiled
+# multiple times, once per SIMD level with a versioned namespace. Parameters_<model> must be compiled
+# per SIMD level because CPPProcess.cc references it, and when CPPProcess.cc is compiled with
+# -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, all mg5amcCpu:: references (including Parameters_<model>)
+# are renamed to mg5amcCpu_<level>::. The dispatcher (MatrixElementKernels.cc) links all versions.
 ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_512z_cpp.o
+cppfat_objects_base=$(BUILDDIR)/CPPProcess_base_cpp.o $(BUILDDIR)/color_sum_base_cpp.o
+cppfat_objects_all=$(cppfat_objects_base) $(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
 cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
 else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h
index b2623bd894..0bfd669ab7 100644
--- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
index 0fb6ccaf90..4e3f17e0dd 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h
@@ -194,9 +194,7 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
-    // use the common base class pointer to support both.
-    std::unique_ptr<MatrixElementKernelBase> m_pmek;
+    std::unique_ptr<MatrixElementKernelHost> m_pmek;
 #endif
   };
 
@@ -283,13 +281,8 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
               << std::endl;
 #endif
-#ifdef MGONGPU_CPPFAT
-    m_pmek.reset( new MatrixElementKernelHostFat(
-      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc
index c4b8037cc9..b61df224f1 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif // !MGONGPUCPP_GPUIMPL
+#endif
 
 //============================================================================
 
@@ -525,207 +525,3 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
-
-// Fat binary implementation: runtime dispatch to the best available SIMD level.
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds.
-// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
-// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
-// MatrixElementKernelHostFat, which detects the host CPU at construction time and
-// delegates all computations to the best available SIMD version.
-
-#ifndef MGONGPUCPP_GPUIMPL
-
-// Forward declarations for per-SIMD namespaces.
-// These are resolved at link time to the object files compiled with appropriate -march flags.
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
-
-#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS )                                                  \
-  namespace NS                                                                                \
-  {                                                                                           \
-    void computeDependentCouplings( const fptype* allgs,                                     \
-                                    fptype* allcouplings,                                    \
-                                    const int nevt );                                        \
-    void sigmaKin_getGoodHel( const fptype* allmomenta,                                      \
-                               const fptype* allcouplings,                                   \
-                               fptype* allMEs,                                               \
-                               MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS                  \
-                               bool* isGoodHel,                                              \
-                               const int nevt );                                             \
-    int sigmaKin_setGoodHel( const bool* isGoodHel );                                        \
-    void sigmaKin( const fptype* allmomenta,                                                  \
-                   const fptype* allcouplings,                                               \
-                   const fptype* allrndhel,                                                  \
-                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS                                     \
-                   fptype* allMEs,                                                           \
-                   int* allselhel,                                                           \
-                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS                                        \
-                   const int nevt );                                                         \
-  }
-
-// The multichannel-dependent parameters differ between builds
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
-  fptype* allNumerators,                             \
-  fptype* allDenominators,
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
-  const fptype* allrndcol,                    \
-  const unsigned int* allChannelIds,
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
-  int* allselcol,                          \
-  fptype* allNumerators,                   \
-  fptype* allDenominators,
-#else
-#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS        /*nothing*/
-#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS           /*nothing*/
-#endif
-
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
-
-#undef MG5AMC_CPPFAT_FORWARD_DECLARE
-#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
-#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
-
-namespace mg5amcCpu
-{
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
-                                                          const BufferGs& gs,
-                                                          const BufferRndNumHelicity& rndhel,
-                                                          const BufferRndNumColor& rndcol,
-                                                          const BufferChannelIds& channelIds,
-                                                          BufferMatrixElements& matrixElements,
-                                                          BufferSelectedHelicity& selhel,
-                                                          BufferSelectedColor& selcol,
-                                                          const size_t nevt )
-    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
-    , NumberOfEvents( nevt )
-    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
-    , m_couplings( nevt )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    , m_numerators( nevt )
-    , m_denominators( nevt )
-#endif
-  {
-    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
-    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
-    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" );
-    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
-    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
-    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
-    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
-    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
-    if( nevt % neppM != 0 )
-    {
-      std::ostringstream sstr;
-      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
-      throw std::runtime_error( sstr.str() );
-    }
-  }
-
-  //--------------------------------------------------------------------------
-
-  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
-
-  //--------------------------------------------------------------------------
-
-  // Detect the best SIMD level available on the current CPU at runtime.
-  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
-  {
-#if defined( __x86_64__ ) || defined( __i386__ )
-    if( __builtin_cpu_supports( "avx512vl" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
-      return SimdLevel::avx512z;
-    }
-    if( __builtin_cpu_supports( "avx512f" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl;
-      return SimdLevel::avx512y;
-    }
-    if( __builtin_cpu_supports( "avx2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
-      return SimdLevel::avx2;
-    }
-    if( __builtin_cpu_supports( "sse4.2" ) )
-    {
-      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
-      return SimdLevel::sse4;
-    }
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
-    return SimdLevel::none;
-#else
-    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
-    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
-    return SimdLevel::sse4;
-#endif
-  }
-
-  //--------------------------------------------------------------------------
-
-// Convenience macro to dispatch to the correct per-SIMD namespace
-#define MG5AMC_CPPFAT_DISPATCH( CALL )                       \
-  switch( m_selectedSimd )                                    \
-  {                                                           \
-    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;    \
-    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;    \
-    case SimdLevel::avx2:    mg5amcCpu_avx2::CALL; break;    \
-    case SimdLevel::sse4:    mg5amcCpu_sse4::CALL; break;    \
-    default:                 mg5amcCpu_none::CALL; break;    \
-  }
-
-  int MatrixElementKernelHostFat::computeGoodHelicities()
-  {
-    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
-#else
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
-#endif
-    int nGoodHel = 0;
-    switch( m_selectedSimd )
-    {
-      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::avx2:    nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      case SimdLevel::sse4:    nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-      default:                 nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
-    }
-    return nGoodHel;
-  }
-
-  //--------------------------------------------------------------------------
-
-  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
-  {
-    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
-#else
-    assert( useChannelIds == false );
-    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
-#endif
-#ifdef MGONGPU_CHANNELID_DEBUG
-    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
-#endif
-  }
-
-#undef MG5AMC_CPPFAT_DISPATCH
-
-  //--------------------------------------------------------------------------
-
-} // namespace mg5amcCpu
-
-#endif // !MGONGPUCPP_GPUIMPL
-
-//============================================================================
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h
index a9d0521d4a..16f8874888 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -156,66 +156,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPUCPP_GPUIMPL
-  // Enum for the SIMD level selected at runtime in a fat binary
-  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
-
-  // A class encapsulating matrix element calculations on a CPU host, with runtime
-  // dispatch to the best available SIMD implementation (for use in a fat binary).
-  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
-  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
-  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
-  // host CPU capabilities and delegates all ME computations to the best version.
-  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
-  {
-  public:
-
-    // Constructor from existing input and output buffers
-    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
-                                const BufferGs& gs,                   // input: gs for alphaS
-                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
-                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
-                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
-                                BufferMatrixElements& matrixElements, // output: matrix elements
-                                BufferSelectedHelicity& selhel,       // output: helicity selection
-                                BufferSelectedColor& selcol,          // output: color selection
-                                const size_t nevt );
-
-    // Destructor
-    virtual ~MatrixElementKernelHostFat();
-
-    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
-    int computeGoodHelicities() override final;
-
-    // Compute matrix elements
-    void computeMatrixElements( const bool useChannelIds ) override final;
-
-    // Is this a host or device kernel?
-    bool isOnDevice() const override final { return false; }
-
-    // Detect the best available SIMD level on the current CPU
-    static SimdLevel detectBestSimd( const bool verbose = false );
-
-  private:
-
-    // The selected SIMD level (detected once at construction time)
-    SimdLevel m_selectedSimd;
-
-    // The buffer for the event-by-event couplings that depends on alphas QCD
-    HostBufferCouplings m_couplings;
-
-#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    // The buffer for the event-by-event numerators of multichannel factors
-    HostBufferNumerators m_numerators;
-
-    // The buffer for the event-by-event denominators of multichannel factors
-    HostBufferDenominators m_denominators;
-#endif
-  };
-#endif
-
-  //--------------------------------------------------------------------------
-
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
index affd0b3aa5..f5bf67efbc 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk
@@ -578,8 +578,6 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
-  else ifeq ($(BACKEND),cppfat)
-    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
-# Add the MGONGPU_CPPFAT flag when building the fat binary backend
-ifeq ($(BACKEND),cppfat)
-  CXXFLAGS += -DMGONGPU_CPPFAT
-endif
-
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
-# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
-# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
-# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a
-# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
-ifeq ($(BACKEND),cppfat)
-# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march
-CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS))
-CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
-CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
-CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
-CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
-CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
-
-$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-
-$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
-
-$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
-
-$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
-
-$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
-
-$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
-	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
-endif
-
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
-
-# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
-# once per SIMD level with a versioned namespace. All other objects are compiled once with the
-# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
-ifeq ($(BACKEND),cppfat)
-cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
-cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
-cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
-cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
-cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
-cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
-cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
-else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
-endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1183,10 +1108,6 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
-bldfat:
-	@echo
-	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
-
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk
index 748080a70f..b57e56d182 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
index d1b3a94fb9..ae8ffaece8 100644
--- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h
@@ -305,14 +305,4 @@ ispoweroftwo( int n )
 #endif
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H

From 80e37ce8ac6f9ea59f3e9bcb6e711e502c687a62 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 13 Apr 2026 19:23:46 +0000
Subject: [PATCH 3/4] Fix duplicate cppfat in SUPPORTED_BACKENDS and duplicate
 block in mgOnGpuConfig.h

Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/db0d537e-e75b-4d0b-ba4c-b7f11fb41df6

Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com>
---
 .../madgraph/iolibs/template_files/gpu/cudacpp_config.mk | 2 +-
 .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h   | 9 ---------
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
index b920463d6c..748080a70f 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat cppfat
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
index b3792ad8ec..ef17addc16 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
@@ -315,13 +315,4 @@ ispoweroftwo( int n )
 #define mg5amcCpu MGONGPU_SIMD_NAMESPACE
 #endif
 
-// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
-// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
-// This allows compiling those translation units multiple times with different -march flags,
-// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
-// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
-// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
-#endif
-
 #endif // MGONGPUCONFIG_H

From e3a62b3decd492b535608c6e1a1a09201b4fbeec Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 14 Apr 2026 08:58:53 +0000
Subject: [PATCH 4/4] Fat binary: add MGONGPU_SIMD_LEVEL env-var to override
 runtime SIMD selection

Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/0bd7a4a4-bb24-4bbf-b578-58747cf4f9fa

Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com>
---
 .../gpu/MatrixElementKernels.cc               | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
index c4b8037cc9..392c82947f 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
@@ -637,8 +637,72 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
   // Detect the best SIMD level available on the current CPU at runtime.
+  // If the environment variable MGONGPU_SIMD_LEVEL is set to one of the
+  // recognised level names (avx512z, avx512y, avx2, sse4, none) the
+  // requested level is used *provided* the hardware actually supports it.
+  // An unsupported or unrecognised value triggers a warning and falls back
+  // to auto-detection.  This allows benchmarking a lower SIMD tier on a
+  // machine that supports a higher one, e.g.:
+  //   MGONGPU_SIMD_LEVEL=avx2 ./check_cpp.exe   # force AVX2 on AVX512 HW
   SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
   {
+    // --- optional user override via MGONGPU_SIMD_LEVEL ---
+    const char* simdEnv = getenv( "MGONGPU_SIMD_LEVEL" );
+    if( simdEnv != nullptr )
+    {
+      const std::string requested( simdEnv );
+      SimdLevel req = SimdLevel::none; // placeholder value: only used if knownLevel stays true
+      bool knownLevel = true;
+      if( requested == "avx512z" )
+        req = SimdLevel::avx512z;
+      else if( requested == "avx512y" )
+        req = SimdLevel::avx512y;
+      else if( requested == "avx2" )
+        req = SimdLevel::avx2;
+      else if( requested == "sse4" )
+        req = SimdLevel::sse4;
+      else if( requested == "none" )
+        req = SimdLevel::none;
+      else
+      {
+        std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
+                  << "' is not recognised (valid values: avx512z avx512y avx2 sse4 none)."
+                  << " Falling back to auto-detection." << std::endl;
+        knownLevel = false;
+      }
+      if( knownLevel )
+      {
+        // Safety check: refuse to use a level the hardware cannot execute.
+        bool hwOk = false;
+#if defined( __x86_64__ ) || defined( __i386__ )
+        // Both AVX512 variants are built with -march=skylake-avx512, which allows the
+        // compiler to emit AVX512VL instructions: AVX512F alone is therefore not a
+        // sufficient guarantee, and both levels are gated on AVX512VL support.
+        switch( req )
+        {
+          case SimdLevel::avx512z: // same hardware requirement as avx512y
+          case SimdLevel::avx512y: hwOk = __builtin_cpu_supports( "avx512vl" ); break;
+          case SimdLevel::avx2:    hwOk = __builtin_cpu_supports( "avx2" );     break;
+          case SimdLevel::sse4:    hwOk = __builtin_cpu_supports( "sse4.2" );   break;
+          case SimdLevel::none:    hwOk = true;                                 break;
+        }
+#else
+        // Non-x86: only sse4 (NEON/VSX) and none are meaningful overrides.
+        hwOk = ( req == SimdLevel::sse4 || req == SimdLevel::none );
+#endif
+        if( hwOk )
+        {
+          if( verbose )
+            std::cout << "INFO: Fat binary: MGONGPU_SIMD_LEVEL override: selected SIMD level "
+                      << requested << std::endl;
+          return req;
+        }
+        std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
+                  << "' is not supported by this CPU."
+                  << " Falling back to auto-detection." << std::endl;
+      }
+    }
+    // --- auto-detection ---
 #if defined( __x86_64__ ) || defined( __i386__ )
     if( __builtin_cpu_supports( "avx512vl" ) )
     {