From bfb5b021f6133ec92eba8c762934bb1d6894ee94 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:04:10 +0000 Subject: [PATCH 1/4] Add fat binary support: MGONGPU_SIMD_NAMESPACE, MatrixElementKernelHostFat, cppfat backend, build rules Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/db0d537e-e75b-4d0b-ba4c-b7f11fb41df6 Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com> --- .../iolibs/template_files/gpu/Bridge.h | 9 +- .../gpu/MatrixElementKernels.cc | 206 +++++++++++++++++- .../template_files/gpu/MatrixElementKernels.h | 60 +++++ .../iolibs/template_files/gpu/cudacpp.mk | 79 +++++++ .../template_files/gpu/cudacpp_config.mk | 2 +- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 19 ++ .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../ee_mumu.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/ee_mumu.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 10 + .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../ee_mumu.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/ee_mumu.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 10 + .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gg_tt.mad/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 10 + epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ 
.../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 79 +++++++ epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 10 + .../gg_tt01g.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../gg_tt01g.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 10 + .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../gg_ttg.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gg_ttg.mad/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 10 + .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gg_ttg.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 10 + .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gg_ttgg.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 10 + .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gg_ttgg.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 10 + .../gg_ttggg.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 79 
+++++++ .../gg_ttggg.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 10 + .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gg_ttggg.sa/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 10 + .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../gq_ttq.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gq_ttq.mad/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 10 + .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../cudacpp/gq_ttq.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 10 + .../heft_gg_bb.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../heft_gg_bb.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../heft_gg_bb.mad/src/cudacpp_config.mk | 2 +- .../heft_gg_bb.mad/src/mgOnGpuConfig.h | 10 + .../heft_gg_bb.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../heft_gg_bb.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../heft_gg_bb.sa/src/cudacpp_config.mk | 2 +- .../cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h | 10 + .../nobm_pp_ttW.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../nobm_pp_ttW.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../nobm_pp_ttW.mad/src/cudacpp_config.mk | 2 +- 
.../nobm_pp_ttW.mad/src/mgOnGpuConfig.h | 10 + .../pp_tt012j.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../pp_tt012j.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h | 10 + .../smeft_gg_tttt.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../smeft_gg_tttt.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../smeft_gg_tttt.mad/src/cudacpp_config.mk | 2 +- .../smeft_gg_tttt.mad/src/mgOnGpuConfig.h | 10 + .../smeft_gg_tttt.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../smeft_gg_tttt.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../smeft_gg_tttt.sa/src/cudacpp_config.mk | 2 +- .../smeft_gg_tttt.sa/src/mgOnGpuConfig.h | 10 + .../susy_gg_t1t1.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../susy_gg_t1t1.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../susy_gg_t1t1.mad/src/cudacpp_config.mk | 2 +- .../susy_gg_t1t1.mad/src/mgOnGpuConfig.h | 10 + .../susy_gg_t1t1.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../susy_gg_t1t1.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../susy_gg_t1t1.sa/src/cudacpp_config.mk | 2 +- .../susy_gg_t1t1.sa/src/mgOnGpuConfig.h | 10 + .../susy_gg_tt.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../susy_gg_tt.mad/SubProcesses/cudacpp.mk | 79 +++++++ .../susy_gg_tt.mad/src/cudacpp_config.mk | 2 +- .../susy_gg_tt.mad/src/mgOnGpuConfig.h | 10 + 
.../susy_gg_tt.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +++++++++++++++++- .../SubProcesses/MatrixElementKernels.h | 60 +++++ .../susy_gg_tt.sa/SubProcesses/cudacpp.mk | 79 +++++++ .../susy_gg_tt.sa/src/cudacpp_config.mk | 2 +- .../cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h | 10 + 144 files changed, 8721 insertions(+), 72 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING!
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only in C++ (non-GPU) builds, i.e. when MGONGPUCPP_GPUIMPL is not defined. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version.
+ +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT // the per-SIMD symbols referenced below are only built for BACKEND=cppfat + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- +
+ MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- + + // Detect the best SIMD level available on the current CPU at runtime. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + // Both the avx512y and avx512z objects are built with -march=skylake-avx512: require all of its AVX512 subsets. + const bool hasAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ); + if( hasAvx512 ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + // A CPU that has AVX512F but lacks any of the other subsets (e.g. AVX512VL) cannot safely run + // the avx512y objects either, so it is deliberately NOT mapped to SimdLevel::avx512y here: + // it falls through to the avx2 check below instead of risking an illegal instruction. + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int
MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // sized with ncomb: one flag per helicity combination + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; // overwritten by the per-SIMD sigmaKin_setGoodHel call selected below + switch( m_selectedSimd ) // written out by hand: MG5AMC_CPPFAT_DISPATCH discards the callee's return value + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ?
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). 
At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. + class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 583f3df0c9..b17c12bff9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -602,6 +602,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -609,6 +611,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -863,6 +870,60 @@ $(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march +CXXFLAGS_NOFAT := $(filter-out -march%% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -877,7 +938,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1132,6 +1207,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk index b57e56d182..b920463d6c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly 
handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 81e1e24e69..b3792ad8ec 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif +
#endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING!
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled in every C++ (non-GPU) build, with or without MGONGPU_CPPFAT: its only guard is the #ifndef MGONGPUCPP_GPUIMPL below. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags. 
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- + 
+ // Detect the best SIMD level that the current CPU can execute at runtime (the 512y and 512z objects are both built with -march=skylake-avx512). + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ) ) // the AVX512 extensions enabled by -march=skylake-avx512 + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx2" ) ) // partial AVX512 only: the 512y object would hit illegal instructions, so use the AVX2 object + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX512F without the full skylake-avx512 feature set)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled in every C++ (non-GPU) build, with or without MGONGPU_CPPFAT: its only guard is the #ifndef MGONGPUCPP_GPUIMPL below. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags. 
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- + 
+ // Detect the best SIMD level that the current CPU can execute at runtime (the 512y and 512z objects are both built with -march=skylake-avx512). + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ) ) // the AVX512 extensions enabled by -march=skylake-avx512 + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx2" ) ) // partial AVX512 only: the 512y object would hit illegal instructions, so use the AVX2 object + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX512F without the full skylake-avx512 feature set)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if a checked buffer is on the device or sizes disagree)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events (must be a multiple of neppM)
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (the choice is printed to std::cout when verbose is true)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd; // declared before the buffers: the constructor initialises members in this order
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
       override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
     else ifeq ($(BACKEND),cpp512z)
       override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+    else ifeq ($(BACKEND),cppfat)
+      override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
     endif
   endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_SIMD (SIMD = none, sse4, avx2, 512y or 512z), placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip every -march flag and -DMGONGPU_PVW512 from CXXFLAGS in a single filter-out pass (no duplicated flags); each variant then adds its own -march
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
+++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 #endif
 
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds of the cppfat backend (MGONGPU_CPPFAT), where the per-SIMD objects are linked in.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. The MGONGPU_CPPFAT_* helper macros used below are defined further down, before the first expansion.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \
+  namespace NS \
+  { \
+    void computeDependentCouplings( const fptype* allgs, \
+                                    fptype* allcouplings, \
+                                    const int nevt ); \
+    void sigmaKin_getGoodHel( const fptype* allmomenta, \
+                              const fptype* allcouplings, \
+                              fptype* allMEs, \
+                              MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+                              bool* isGoodHel, \
+                              const int nevt ); \
+    int sigmaKin_setGoodHel( const bool* isGoodHel ); \
+    void sigmaKin( const fptype* allmomenta, \
+                   const fptype* allcouplings, \
+                   const fptype* allrndhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+                   fptype* allMEs, \
+                   int* allselhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+                   const int nevt ); \
+  }
+
+// The multichannel-dependent parameters differ between builds (NOTE(review): without multichannel, sigmaKin is declared with no allrndcol/allselcol - confirm against CPPProcess.h)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol, \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol, \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                                          const BufferGs& gs,                   // input: gs for alphaS
+                                                          const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                                          const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                                          const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                                          BufferMatrixElements& matrixElements, // output: matrix elements
+                                                          BufferSelectedHelicity& selhel,       // output: helicity selection
+                                                          BufferSelectedColor& selcol,          // output: color selection
+                                                          const size_t nevt )                   // input: number of events (forwarded to NumberOfEvents)
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // probe the CPU once: every later call dispatches on this value
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); // throws std::runtime_error on any failed check below
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // host kernel: device-resident channel ids are rejected
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); // all buffers must hold the same number of events
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // nevt must be a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} // nothing to release explicitly: all members clean up after themselves
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime (optionally printing the choice to std::cout).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // The 512y and 512z objects are both built with -march=skylake-avx512: require the whole F/CD/BW/DQ/VL set, else fall through to avx2
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" );
+    // avx512y needs exactly the same ISA as avx512z (its 256-bit EVEX code requires AVX512VL), so it is never the safer fallback
+    // NOTE(review): the static build labels 512y "DEFAULT for gcc"; confirm that preferring 512z at runtime is intended
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none; // no usable vector extension was reported: use the scalar build
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace (expands to a complete switch on m_selectedSimd; CALL is the unqualified call)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities() // returns the value reported by sigmaKin_setGoodHel
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // one flag per helicity combination
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // the dispatch macro cannot yield a value, hence the explicit switch below
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) // useChannelIds must be false in builds without multichannel support
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) // fill m_couplings from m_gs
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed when channel ids are not to be used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the multichannel branch above
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error if a checked buffer is on the device or sizes disagree)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );                  // input: number of events (must be a multiple of neppM)
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (the choice is printed to std::cout when verbose is true)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd; // declared before the buffers: the constructor initialises members in this order
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
       override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
     else ifeq ($(BACKEND),cpp512z)
       override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+    else ifeq ($(BACKEND),cppfat)
+      override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
     endif
   endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip all SIMD-selecting flags from the common CXXFLAGS exactly once (AVXFLAGS is -march=x86-64 for cppfat); each variant below adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -mprefer-vector-width% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat 
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds of the cppfat backend (MGONGPU_CPPFAT), the only backend that links the per-SIMD objects.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
// Constructor from existing input and output host buffers.
+  // Selects the SIMD level once via detectBestSimd (verbose) and allocates the nevt-sized couplings
+  // buffer (plus the numerators and denominators buffers in multichannel builds), then validates the inputs:
+  // throws std::runtime_error if momenta, matrixElements or channelIds are device buffers, if their
+  // nevt differs from the nevt of this kernel, or if nevt is not a multiple of neppM.
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    // All buffers handled by this kernel must live on the host
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    // All buffers must hold the same number of events as this kernel
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    // NOTE(review): neppM is the value compiled into this (baseline) translation unit - confirm that it matches the momenta layout expected by the per-SIMD kernels
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime.
+  // A level is returned only if the CPU supports the instruction sets that the corresponding per-SIMD
+  // objects are built with (see the -march flags of the cppfat rules in cudacpp.mk).
+  // If verbose is true, the selected level is printed to std::cout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Needed by __builtin_cpu_supports if this runs before static constructors; harmless otherwise
+    __builtin_cpu_init();
+    // Both the 512y and 512z objects are built with -march=skylake-avx512: AVX512F alone is not enough
+    // (dispatching to them on a CPU without DQ/CD/BW/VL would raise an illegal instruction)
+    const bool hasSkylakeAvx512 =
+      __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512dq" ) &&
+      __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) &&
+      __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      // NOTE(review): 512z keeps priority over 512y as before; since both need the same instruction sets,
+      // 512y is never selected automatically here - confirm that this is the intended policy
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // The avx2 objects are built with -march=haswell, which also enables FMA
+    if( __builtin_cpu_supports( "avx2" ) && __builtin_cpu_supports( "fma" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+// (expands to a complete switch statement on m_selectedSimd: no trailing semicolon is needed at the call site)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  // Compute good helicities using the selected per-SIMD implementation: compute the couplings, fill a host
+  // helicity mask via sigmaKin_getGoodHel and return the value of sigmaKin_setGoodHel for that mask
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    // The dispatch macro cannot be used here because the return value of the call is needed
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Compute matrix elements using the selected per-SIMD implementation: compute the couplings, then call sigmaKin
+  // (in multichannel builds the channel ids are passed only if useChannelIds is true, otherwise nullptr is passed)
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip all SIMD-selecting flags from the common CXXFLAGS exactly once (AVXFLAGS is -march=x86-64 for cppfat); each variant below adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -mprefer-vector-width% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat 
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
index ae8ffaece8..d1b3a94fb9 100644
--- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ (non-GPU) builds of the cppfat backend (MGONGPU_CPPFAT), the only backend that links the per-SIMD objects.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- + 
+ // Detect the best SIMD level available on the current CPU at runtime. NOTE(review): cudacpp.mk in this patch builds both the 512y and 512z objects with -march=skylake-avx512, so selecting 512y on avx512f alone looks unsafe for CPUs lacking AVX512VL/BW/DQ - TODO confirm and tighten the checks. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512vl" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx512f" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; + return SimdLevel::avx512y; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only in C++ (non-GPU) fat binary builds (MGONGPU_CPPFAT defined): the per-SIMD symbols it references exist only there. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags. 
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- + 
+ // Detect the best SIMD level available on the current CPU at runtime. NOTE(review): cudacpp.mk in this patch builds both the 512y and 512z objects with -march=skylake-avx512, so selecting 512y on avx512f alone looks unsafe for CPUs lacking AVX512VL/BW/DQ - TODO confirm and tighten the checks. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512vl" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx512f" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; + return SimdLevel::avx512y; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING!
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only for C++ (non-GPU) fat binary builds, i.e. when MGONGPU_CPPFAT is defined (BACKEND=cppfat). +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- +
+ // Detect the best SIMD level available on the current CPU at runtime. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512vl" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx512f" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; + return SimdLevel::avx512y; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<level>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat 
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING!
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only for C++ (non-GPU) fat binary builds, i.e. when MGONGPU_CPPFAT is defined (BACKEND=cppfat). +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
// Constructor from existing input and output buffers: detects the SIMD level once and allocates the couplings
+  // (and, in multichannel builds, numerators/denominators) work buffers for nevt events.
+  // Throws std::runtime_error if momenta, matrixElements or channelIds are device buffers, if their nevt
+  // differs from that of this kernel, or if nevt is not a multiple of neppM.
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    // This is a host kernel: every buffer checked here must live in host memory
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    // All buffers must describe the same number of events as this kernel
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Destructor (nothing to release explicitly: all buffers are owned by members)
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime.
+  // Returns the highest level whose per-SIMD objects can run on this host; if verbose, prints the choice to stdout.
+  // Each check must cover the instruction sets enabled by the -march flag used to build the corresponding objects.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Only strictly needed when running before static constructors, but always safe to call
+    __builtin_cpu_init();
+    // Both the 512y and the 512z objects are built with -march=skylake-avx512 (AVX512F, CD, BW, DQ and VL):
+    // a host offering only part of that set (e.g. AVX512F without AVX512VL) must fall back to avx2
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) &&
+      __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) &&
+      __builtin_cpu_supports( "avx512dq" ) &&
+      __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      // NOTE(review): 512y and 512z need the same instruction sets, so choosing between them is a policy decision;
+      // 512z is kept because it is what AVX512VL hosts were already given - confirm against the 512y default in cudacpp.mk
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+// (CALL is a complete function call expression; any unlisted level falls back to the no-SIMD implementation)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  // Compute good helicities using the selected SIMD implementation
+  // (returns nGoodHel, the value returned by that implementation's sigmaKin_setGoodHel)
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    // The couplings must be computed from the gs before any matrix element calculation
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    // Spelled out (instead of using the dispatch macro) because the return value is needed
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Compute matrix elements using the selected SIMD implementation
+  // (useChannelIds must be false in builds without multichannel support)
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // A null pointer tells sigmaKin that no channel ids are provided
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Take CXXFLAGS once, without the dispatcher's -march (AVXFLAGS is -march=x86-64 for cppfat) and without -DMGONGPU_PVW512; each variant then adds its own flags
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only in C++ (non-GPU) fat binary builds (MGONGPU_CPPFAT): only those builds provide the per-SIMD symbols it references. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT + +// Forward declarations for per-SIMD namespaces. 
+// These are resolved at link time to the object files compiled with appropriate -march flags. 
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
// Constructor from existing input and output buffers: detects the SIMD level once and allocates the couplings
+  // (and, in multichannel builds, numerators/denominators) work buffers for nevt events.
+  // Throws std::runtime_error if momenta, matrixElements or channelIds are device buffers, if their nevt
+  // differs from that of this kernel, or if nevt is not a multiple of neppM.
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    // This is a host kernel: every buffer checked here must live in host memory
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    // All buffers must describe the same number of events as this kernel
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Destructor (nothing to release explicitly: all buffers are owned by members)
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime.
+  // Returns the highest level whose per-SIMD objects can run on this host; if verbose, prints the choice to stdout.
+  // Each check must cover the instruction sets enabled by the -march flag used to build the corresponding objects.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Only strictly needed when running before static constructors, but always safe to call
+    __builtin_cpu_init();
+    // Both the 512y and the 512z objects are built with -march=skylake-avx512 (AVX512F, CD, BW, DQ and VL):
+    // a host offering only part of that set (e.g. AVX512F without AVX512VL) must fall back to avx2
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) &&
+      __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) &&
+      __builtin_cpu_supports( "avx512dq" ) &&
+      __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      // NOTE(review): 512y and 512z need the same instruction sets, so choosing between them is a policy decision;
+      // 512z is kept because it is what AVX512VL hosts were already given - confirm against the 512y default in cudacpp.mk
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+// (CALL is a complete function call expression; any unlisted level falls back to the no-SIMD implementation)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  // Compute good helicities using the selected SIMD implementation
+  // (returns nGoodHel, the value returned by that implementation's sigmaKin_setGoodHel)
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    // The couplings must be computed from the gs before any matrix element calculation
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    // Spelled out (instead of using the dispatch macro) because the return value is needed
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Compute matrix elements using the selected SIMD implementation
+  // (useChannelIds must be false in builds without multichannel support)
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // A null pointer tells sigmaKin that no channel ids are provided
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip every -march flag (-march=x86-64 from AVXFLAGS for cppfat) and -DMGONGPU_PVW512 in a single filter-out pass; the per-SIMD flags are added below +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is only compiled in C++ fat binary builds (MGONGPU_CPPFAT defined, MGONGPUCPP_GPUIMPL not defined). +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT // the per-SIMD objects referenced below are only built and linked for BACKEND=cppfat + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags. 
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { // Sanity checks: momenta, matrixElements and channelIds must be host buffers holding exactly nevt events + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- + 
+ // Detect the best SIMD level available on the current CPU at runtime (the choice is printed to stdout if verbose is true). + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + // NB: the 512y and 512z objects are both built with -march=skylake-avx512 and therefore need the same CPU features + // (AVX512F alone is not enough: a 512y build would hit illegal instructions on a CPU without AVX512VL/BW/DQ/CD) + const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && + __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && + __builtin_cpu_supports( "avx512vl" ); + if( hasSkylakeAvx512 ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip every -march flag (-march=x86-64 from AVXFLAGS for cppfat) and -DMGONGPU_PVW512 in a single filter-out pass; the per-SIMD flags are added below +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is only compiled in C++ fat binary builds (MGONGPU_CPPFAT defined, MGONGPUCPP_GPUIMPL not defined). +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT // the per-SIMD objects referenced below are only built and linked for BACKEND=cppfat + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags. 
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \
+  namespace NS \
+  { \
+    void computeDependentCouplings( const fptype* allgs, \
+                                    fptype* allcouplings, \
+                                    const int nevt ); \
+    void sigmaKin_getGoodHel( const fptype* allmomenta, \
+                              const fptype* allcouplings, \
+                              fptype* allMEs, \
+                              MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+                              bool* isGoodHel, \
+                              const int nevt ); \
+    int sigmaKin_setGoodHel( const bool* isGoodHel ); \
+    void sigmaKin( const fptype* allmomenta, \
+                   const fptype* allcouplings, \
+                   const fptype* allrndhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+                   fptype* allMEs, \
+                   int* allselhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+                   const int nevt ); \
+  }
+
+// Extra parameters present only in multichannel builds (expanded where MG5AMC_CPPFAT_FORWARD_DECLARE is invoked, i.e. after these definitions)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol, \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol, \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+#ifdef MGONGPU_CPPFAT // define the dispatcher only in cppfat builds: the per-SIMD objects it calls are not built (and would be undefined references) otherwise
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - NOTE(review): this is the value seen by this (baseline-flags) translation unit; confirm that every per-SIMD object uses the same momenta layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely execute (the choice is printed to stdout if verbose is true).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: the 512y and 512z objects are both built with -march=skylake-avx512 (AVX512F/CD/BW/DQ/VL): a CPU
+    // that has AVX512F but not the whole set must not run either of them and falls through to the AVX2
+    // check below. SimdLevel::avx512y is never auto-selected here, because both 512 objects require
+    // exactly the same CPU features and feature detection alone cannot prefer one over the other.
+    // FIXME: provide an explicit way to request avx512y (256-bit vectors on AVX512 hardware).
+    if( __builtin_cpu_supports( "avx2" ) && __builtin_cpu_supports( "fma" ) && __builtin_cpu_supports( "bmi2" ) ) // the avx2 object is built with -march=haswell
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime CPU detection is done, the sse4 variant is always selected (it is meant to cover ARM NEON/Power VSX)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expand CALL inside the namespace of the selected SIMD level (the expansion is a complete switch statement, no trailing semicolon needed)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+#endif // MGONGPU_CPPFAT
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (the SIMD level is detected here, once)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds selects whether the channel ids buffer is passed on to sigmaKin)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (the choice is printed to stdout if verbose is true)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depend on alphaS (QCD)
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: the dispatcher and all non-SIMD objects use baseline x86-64; the per-SIMD objects use their own -march flags (see the cppfat rules below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag to all C++ compilations when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_SIMD (SIMD = none, sse4, avx2, 512y, 512z), placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Remove the dispatcher's -march (AVXFLAGS is -march=x86-64 for cppfat) and any -DMGONGPU_PVW512 from CXXFLAGS in a single pass; each variant then adds its own flags
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags. 
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \
+  namespace NS \
+  { \
+    void computeDependentCouplings( const fptype* allgs, \
+                                    fptype* allcouplings, \
+                                    const int nevt ); \
+    void sigmaKin_getGoodHel( const fptype* allmomenta, \
+                              const fptype* allcouplings, \
+                              fptype* allMEs, \
+                              MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+                              bool* isGoodHel, \
+                              const int nevt ); \
+    int sigmaKin_setGoodHel( const bool* isGoodHel ); \
+    void sigmaKin( const fptype* allmomenta, \
+                   const fptype* allcouplings, \
+                   const fptype* allrndhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+                   fptype* allMEs, \
+                   int* allselhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+                   const int nevt ); \
+  }
+
+// Extra parameters present only in multichannel builds (expanded where MG5AMC_CPPFAT_FORWARD_DECLARE is invoked, i.e. after these definitions)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol, \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol, \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+#ifdef MGONGPU_CPPFAT // define the dispatcher only in cppfat builds: the per-SIMD objects it calls are not built (and would be undefined references) otherwise
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - NOTE(review): this is the value seen by this (baseline-flags) translation unit; confirm that every per-SIMD object uses the same momenta layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely execute (the choice is printed to stdout if verbose is true).
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: the 512y and 512z objects are both built with -march=skylake-avx512 (AVX512F/CD/BW/DQ/VL): a CPU
+    // that has AVX512F but not the whole set must not run either of them and falls through to the AVX2
+    // check below. SimdLevel::avx512y is never auto-selected here, because both 512 objects require
+    // exactly the same CPU features and feature detection alone cannot prefer one over the other.
+    // FIXME: provide an explicit way to request avx512y (256-bit vectors on AVX512 hardware).
+    if( __builtin_cpu_supports( "avx2" ) && __builtin_cpu_supports( "fma" ) && __builtin_cpu_supports( "bmi2" ) ) // the avx2 object is built with -march=haswell
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: no runtime CPU detection is done, the sse4 variant is always selected (it is meant to cover ARM NEON/Power VSX)
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expand CALL inside the namespace of the selected SIMD level (the expansion is a complete switch statement, no trailing semicolon needed)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+#endif // MGONGPU_CPPFAT
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (the SIMD level is detected here, once)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds selects whether the channel ids buffer is passed on to sigmaKin)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel? (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (the choice is printed to stdout if verbose is true)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depend on alphaS (QCD)
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: the dispatcher and all non-SIMD objects use baseline x86-64; the per-SIMD objects use their own -march flags (see the cppfat rules below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip every -march flag and -DMGONGPU_PVW512 from CXXFLAGS in ONE filter-out pass (two concatenated passes would duplicate all flags and keep the stripped ones); the per-SIMD flags are appended below +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING!
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only for C++ builds of the cppfat backend (MGONGPU_CPPFAT): other backends do not build the per-SIMD objects, so the symbols referenced here would be undefined at link time. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures must mirror the C++ (non-GPU) versions declared in CPPProcess.h: they are re-declared by hand here, so the compiler cannot check them against the per-SIMD definitions. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- +
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- +
+ // Detect the best SIMD level available on the current CPU at runtime (a level is only returned if the CPU can execute the object built for it). + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + // NB: both the 512y and 512z objects are built with -march=skylake-avx512, which enables the AVX512 + // F/CD/BW/DQ/VL subsets checked above. A CPU with only part of them (e.g. AVX512F without AVX512VL) + // cannot run either object and must fall through to the AVX2 branch below: selecting 512y on such + // a CPU would fail with an illegal instruction. + // NOTE(review): 512y is therefore never auto-selected; confirm whether 512y should be preferred to 512z. + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH(
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ?
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? (always a host kernel)
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depend on alphaS (QCD) + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip every -march flag and -DMGONGPU_PVW512 from CXXFLAGS in ONE filter-out pass (two concatenated passes would duplicate all flags and keep the stripped ones); the per-SIMD flags are appended below +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr<MatrixElementKernelHost> m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. + std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING!
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only for C++ builds of the cppfat backend (MGONGPU_CPPFAT): other backends do not build the per-SIMD objects, so the symbols referenced here would be undefined at link time. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT + +// Forward declarations for per-SIMD namespaces. +// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + + //-------------------------------------------------------------------------- +
+ // Detect the best SIMD level available on the current CPU at runtime. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + // The 512y and 512z objects are both built with -march=skylake-avx512 (see CXXFLAGS_FAT_512Y/512Z in cudacpp.mk), + // so either one may execute AVX512F/CD/BW/DQ/VL instructions: require all five features before selecting them. + // Testing "avx512f" alone would send an AVX512F-only CPU to code that it cannot execute (illegal instruction). + // NOTE(review): 512y and 512z need the same ISA, so 512y is never auto-selected here - confirm the intended policy. + const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ); + if( hasSkylakeAvx512 ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH(
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_{none,sse4,avx2,512y,512z}, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip the dispatcher's -march (-march=x86-64 for cppfat) and any -DMGONGPU_PVW512 in ONE filter-out pass; each variant then adds its own -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+ std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces. 
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} +
+ //-------------------------------------------------------------------------- + + // Detect the best SIMD level available on the current CPU at runtime. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + // The 512y and 512z objects are both built with -march=skylake-avx512 (see CXXFLAGS_FAT_512Y/512Z in cudacpp.mk), + // so either one may execute AVX512F/CD/BW/DQ/VL instructions: require all five features before selecting them. + // Testing "avx512f" alone would send an AVX512F-only CPU to code that it cannot execute (illegal instruction). + // NOTE(review): 512y and 512z need the same ISA, so 512y is never auto-selected here - confirm the intended policy. + const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ); + if( hasSkylakeAvx512 ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask
hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip the inherited -march flag (-march=x86-64 from AVXFLAGS for cppfat) and any -DMGONGPU_PVW512 in ONE pass over CXXFLAGS; each variant then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
 
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds of the fat binary backend (MGONGPU_CPPFAT is set by BACKEND=cppfat): other backends do not build the per-SIMD objects it calls.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//--------------------------------------------------------------------------
+
+  // Constructor: stores the input/output buffers in the base class, selects the SIMD level once
+  // (detectBestSimd) and allocates the per-event host work buffers sized by nevt.
+  // Throws std::runtime_error if a checked buffer is on the device, if its nevt differs from this
+  // kernel's nevt, or if nevt is not a multiple of neppM.
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    // This is a host kernel: every checked buffer must live on the host (message fixed for channelIds)
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Destructor: nothing to release explicitly (members clean up after themselves)
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime.
+  // If verbose is true, the selected level is printed to std::cout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // The AVX512 objects (512y and 512z) are both built with -march=skylake-avx512, which allows the
+    // compiler to emit AVX512F/CD/BW/DQ/VL instructions: require the whole set before dispatching to
+    // them. A CPU with AVX512F only must fall back to AVX2 instead of hitting an illegal instruction.
+    // NOTE(review): 512y and 512z need the same CPU features, so they cannot be told apart here;
+    // 512z is kept as the automatic choice and 512y is never selected automatically - confirm policy.
+    const bool hasSkylakeAvx512 =
+      __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+      __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) &&
+      __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+// (expands to a complete switch statement on m_selectedSimd: no trailing semicolon is needed)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                      \
+  switch( m_selectedSimd )                                  \
+  {                                                         \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;   \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;   \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break;      \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break;      \
+    default: mg5amcCpu_none::CALL; break;                   \
+  }
+
+  // Compute the couplings, flag the good helicities and register them in the selected SIMD
+  // implementation; returns the value returned by that implementation's sigmaKin_setGoodHel.
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    // The dispatch macro discards return values: spell out the switch to keep nGoodHel
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Compute the couplings and then the matrix elements with the selected SIMD implementation.
+  // In multichannel builds a null channel-id pointer is passed when useChannelIds is false.
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip the inherited -march flag (-march=x86-64 from AVXFLAGS for cppfat) and any -DMGONGPU_PVW512 in ONE pass over CXXFLAGS; each variant then adds its own
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
 
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in C++ (non-GPU) builds of the fat binary backend (MGONGPU_CPPFAT is set by BACKEND=cppfat): other backends do not build the per-SIMD objects it calls.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + 
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely execute.
+  // A level is only selected if the CPU supports every instruction-set extension implied by the -march
+  // flag used in cudacpp.mk to build the corresponding per-SIMD objects (BACKEND=cppfat):
+  // - avx512y/avx512z: -march=skylake-avx512 (AVX512F, AVX512CD, AVX512BW, AVX512DQ, AVX512VL)
+  // - avx2: -march=haswell (AVX2, FMA, BMI, BMI2)
+  // - sse4: -march=nehalem (SSE4.2, POPCNT)
+  // If verbose is true, the selected level is printed to stdout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    __builtin_cpu_init(); // only strictly required before static constructors have run, harmless otherwise
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+                                  __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) &&
+                                  __builtin_cpu_supports( "avx512vl" );
+    const bool hasHaswell = __builtin_cpu_supports( "avx2" ) && __builtin_cpu_supports( "fma" ) &&
+                            __builtin_cpu_supports( "bmi" ) && __builtin_cpu_supports( "bmi2" );
+    const bool hasNehalem = __builtin_cpu_supports( "sse4.2" ) && __builtin_cpu_supports( "popcnt" );
+    if( hasSkylakeAvx512 )
+    {
+      // The 512y and 512z objects need exactly the same CPU features (they only differ in the preferred vector width).
+      // The previous fallback "avx512f without avx512vl => avx512y" was removed: 256-bit AVX512 code needs AVX512VL,
+      // so on such a CPU the 512y objects would hit illegal instructions; it now falls through to avx2.
+      // NOTE(review): 512z is kept as the preferred level, as before; cudacpp.mk marks 512y as the default for gcc,
+      // so maintainers may want to prefer 512y here instead - TODO confirm.
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( hasHaswell )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( hasNehalem )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+// (expands to a complete switch statement on m_selectedSimd: use it as a statement, without a trailing semicolon)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  // Compute good helicities using the selected SIMD implementation
+  // (returns nGoodHel, the value returned by sigmaKin_setGoodHel)
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    // ... 0d1. Compute good helicity mask on the host
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    // ... 0d2. Pass the mask to the same SIMD implementation
+    // (explicit switch because the dispatch macro cannot return a value)
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Compute matrix elements using the selected SIMD implementation
+  // (if useChannelIds is false, a null channelIds pointer is passed to sigmaKin)
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    // (throws std::runtime_error if the momenta, matrixElements or channelIds buffers are on the device,
+    // if their nevt differs from the nevt argument, or if nevt is not a multiple of neppM)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    // (if useChannelIds is false, no channel ids are passed to the underlying sigmaKin)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    // (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    // (if verbose is true, the selected level is also printed to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip the baseline -march (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 in ONE filter-out (two filtered copies would duplicate every flag and strip nothing); per-SIMD flags are added below
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o
+cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1108,6 +1183,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
+++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
 $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
index 0bfd669ab7..b2623bd894 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif
+
 #endif // MGONGPUCONFIG_H
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 #endif
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
   // Create a process object, read param card and set parameters
   // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
index b61df224f1..c4b8037cc9 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,207 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only for C++ fat-binary builds (MGONGPU_CPPFAT defined, MGONGPUCPP_GPUIMPL undefined).
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+// The MGONGPU_CPPFAT requirement matters: the per-SIMD objects that define the mg5amcCpu_*:: functions
+// are only built and linked for BACKEND=cppfat, so any other build would be left with undefined references.
+
+#if defined MGONGPU_CPPFAT && !defined MGONGPUCPP_GPUIMPL
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+// MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) declares, inside namespace NS, the four free functions
+// computeDependentCouplings, sigmaKin_getGoodHel, sigmaKin_setGoodHel and sigmaKin.
+// NB: the MGONGPU_CPPFAT_SIGMAKIN_*_PARAMS helper macros used in its body are defined further below:
+// this is valid because they are expanded when MG5AMC_CPPFAT_FORWARD_DECLARE is invoked, not when it is defined.
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \
+  namespace NS \
+  { \
+    void computeDependentCouplings( const fptype* allgs, \
+                                    fptype* allcouplings, \
+                                    const int nevt ); \
+    void sigmaKin_getGoodHel( const fptype* allmomenta, \
+                              const fptype* allcouplings, \
+                              fptype* allMEs, \
+                              MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+                              bool* isGoodHel, \
+                              const int nevt ); \
+    int sigmaKin_setGoodHel( const bool* isGoodHel ); \
+    void sigmaKin( const fptype* allmomenta, \
+                   const fptype* allcouplings, \
+                   const fptype* allrndhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+                   fptype* allMEs, \
+                   int* allselhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+                   const int nevt ); \
+  }
+
+// The multichannel-dependent parameters differ between builds
+// (each non-empty expansion ends with a comma because it is spliced into the middle of a parameter list)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol, \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol, \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/
+#endif
+
+// One set of declarations per SIMD namespace
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+// The helper macros are not needed beyond this point
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
//-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + 
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level that the current CPU can safely execute.
+  // A level is only selected if the CPU supports every instruction-set extension implied by the -march
+  // flag used in cudacpp.mk to build the corresponding per-SIMD objects (BACKEND=cppfat):
+  // - avx512y/avx512z: -march=skylake-avx512 (AVX512F, AVX512CD, AVX512BW, AVX512DQ, AVX512VL)
+  // - avx2: -march=haswell (AVX2, FMA, BMI, BMI2)
+  // - sse4: -march=nehalem (SSE4.2, POPCNT)
+  // If verbose is true, the selected level is printed to stdout.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    __builtin_cpu_init(); // only strictly required before static constructors have run, harmless otherwise
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+                                  __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) &&
+                                  __builtin_cpu_supports( "avx512vl" );
+    const bool hasHaswell = __builtin_cpu_supports( "avx2" ) && __builtin_cpu_supports( "fma" ) &&
+                            __builtin_cpu_supports( "bmi" ) && __builtin_cpu_supports( "bmi2" );
+    const bool hasNehalem = __builtin_cpu_supports( "sse4.2" ) && __builtin_cpu_supports( "popcnt" );
+    if( hasSkylakeAvx512 )
+    {
+      // The 512y and 512z objects need exactly the same CPU features (they only differ in the preferred vector width).
+      // The previous fallback "avx512f without avx512vl => avx512y" was removed: 256-bit AVX512 code needs AVX512VL,
+      // so on such a CPU the 512y objects would hit illegal instructions; it now falls through to avx2.
+      // NOTE(review): 512z is kept as the preferred level, as before; cudacpp.mk marks 512y as the default for gcc,
+      // so maintainers may want to prefer 512y here instead - TODO confirm.
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( hasHaswell )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( hasNehalem )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+// (expands to a complete switch statement on m_selectedSimd: use it as a statement, without a trailing semicolon)
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  // Compute good helicities using the selected SIMD implementation
+  // (returns nGoodHel, the value returned by sigmaKin_setGoodHel)
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb );
+    // ... 0d1. Compute good helicity mask on the host
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    // ... 0d2. Pass the mask to the same SIMD implementation
+    // (explicit switch because the dispatch macro cannot return a value)
+    int nGoodHel = 0;
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  // Compute matrix elements using the selected SIMD implementation
+  // (if useChannelIds is false, a null channelIds pointer is passed to sigmaKin)
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers
+    // (throws std::runtime_error if the momenta, matrixElements or channelIds buffers are on the device,
+    // if their nevt differs from the nevt argument, or if nevt is not a multiple of neppM)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements
+    // (if useChannelIds is false, no channel ids are passed to the underlying sigmaKin)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    // (always a host kernel)
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU
+    // (if verbose is true, the selected level is also printed to stdout)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif // !MGONGPUCPP_GPUIMPL
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
index f5bf67efbc..affd0b3aa5 100644
--- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk
@@ -578,6 +578,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) and any -DMGONGPU_PVW512 from CXXFLAGS in a single pass; each variant then adds its own -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 
cpp512y cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+ std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled in every C++ (non-GPU) build, i.e. whenever MGONGPUCPP_GPUIMPL is not defined. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces. 
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} + 
+ //-------------------------------------------------------------------------- + + // Detect the best SIMD level available on the current CPU at runtime. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512vl" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx512f" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; + return SimdLevel::avx512y; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask 
hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) and any -DMGONGPU_PVW512 from CXXFLAGS in a single pass; each variant then adds its own -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 
cpp512y cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+ std::unique_ptr<MatrixElementKernelBase> m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled in every C++ (non-GPU) build, i.e. whenever MGONGPUCPP_GPUIMPL is not defined. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces. 
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // CPU probed once per kernel instance (prints one INFO line)
+    , m_couplings( nevt ) // kernel-owned host buffer, constructed with nevt
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt ) // kernel-owned host buffers, only present in multichannel builds
+    , m_denominators( nevt )
+#endif
+  { // sanity checks: every failure below is reported by throwing std::runtime_error
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // a device array is what is rejected here
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // reject event counts that are not a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Return the SIMD level to dispatch to on this CPU; print the choice to std::cout if verbose. NOTE(review): 512y is no longer auto-selected, confirm 512z is the intended AVX512 default.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both AVX512 variants (512y, 512z) are built with -march=skylake-avx512 in cudacpp.mk: require every extension it enables.
+    // A CPU reporting AVX512F without VL/BW/DQ/CD must fall through to avx2 rather than execute instructions it lacks.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+                                  __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) &&
+                                  __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a switch on m_selectedSimd that executes CALL in the matching per-SIMD namespace (statement only, no result)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                      \
+  switch( m_selectedSimd )                                  \
+  {                                                         \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;   \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;   \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break;      \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break;      \
+    default: mg5amcCpu_none::CALL; break;                   \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // one flag per helicity combination, filled by sigmaKin_getGoodHel
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // sigmaKin_setGoodHel returns a value, so it is dispatched by an explicit switch instead of the statement-only macro
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Strip the dispatcher's -march (and any -DMGONGPU_PVW512) from CXXFLAGS in ONE filter-out pass, so no flag is duplicated; each variant then adds its own -march
+CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times,
+# once per SIMD level with a versioned namespace. All other objects are compiled once with the
+# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 
cpp512y cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+ std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled only in C++ (non-GPU) builds: it is guarded by #ifndef MGONGPUCPP_GPUIMPL, but not by MGONGPU_CPPFAT. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // CPU probed once per kernel instance (prints one INFO line)
+    , m_couplings( nevt ) // kernel-owned host buffer, constructed with nevt
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt ) // kernel-owned host buffers, only present in multichannel builds
+    , m_denominators( nevt )
+#endif
+  { // sanity checks: every failure below is reported by throwing std::runtime_error
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); // a device array is what is rejected here
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 ) // reject event counts that are not a whole number of neppM-sized groups
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Return the SIMD level to dispatch to on this CPU; print the choice to std::cout if verbose. NOTE(review): 512y is no longer auto-selected, confirm 512z is the intended AVX512 default.
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+#if defined( __x86_64__ ) || defined( __i386__ )
+    // Both AVX512 variants (512y, 512z) are built with -march=skylake-avx512 in cudacpp.mk: require every extension it enables.
+    // A CPU reporting AVX512F without VL/BW/DQ/CD must fall through to avx2 rather than execute instructions it lacks.
+    const bool hasSkylakeAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) &&
+                                  __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) &&
+                                  __builtin_cpu_supports( "avx512vl" );
+    if( hasSkylakeAvx512 )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro: expands to a switch on m_selectedSimd that executes CALL in the matching per-SIMD namespace (statement only, no result)
+#define MG5AMC_CPPFAT_DISPATCH( CALL )                      \
+  switch( m_selectedSimd )                                  \
+  {                                                         \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break;   \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break;   \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break;      \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break;      \
+    default: mg5amcCpu_none::CALL; break;                   \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // one flag per helicity combination, filled by sigmaKin_getGoodHel
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0; // sigmaKin_setGoodHel returns a value, so it is dispatched by an explicit switch instead of the statement-only macro
+    switch( m_selectedSimd )
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // nullptr is passed when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false );
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() );
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_SIMD (SIMD = none, sse4, avx2, 512y or 512z), placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in a single pass; each variant below adds its own -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+ std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled in every C++ (non-GPU) build: it is guarded only by the #ifndef MGONGPUCPP_GPUIMPL below, not by MGONGPU_CPPFAT. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { // Sanity checks on the momenta, matrixElements and channelIds buffers: none may be on the device and each must hold nevt events + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} +
+ //-------------------------------------------------------------------------- + + // Detect the best SIMD level available on the current CPU at runtime (features are tested from the widest level down to none). NOTE(review): the avx512y branch is reached only when avx512vl is absent; confirm that the 512y objects (built with -march=skylake-avx512 in cudacpp.mk) can run on such CPUs. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + if( __builtin_cpu_supports( "avx512vl" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx512f" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; + return SimdLevel::avx512y; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask
hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version. 
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_SIMD (SIMD = none, sse4, avx2, 512y or 512z), placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip every -march flag (AVXFLAGS is -march=x86-64 for cppfat) and -DMGONGPU_PVW512 from CXXFLAGS in a single pass; each variant below adds its own -march +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h index 0bfd669ab7..b2623bd894 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. +#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 4e3f17e0dd..0fb6ccaf90 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -194,7 +194,9 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - std::unique_ptr m_pmek; + // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; + // use the common base class pointer to support both. 
+ std::unique_ptr m_pmek; #endif }; @@ -281,8 +283,13 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif +#ifdef MGONGPU_CPPFAT + m_pmek.reset( new MatrixElementKernelHostFat( + m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index b61df224f1..c4b8037cc9 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif +#endif // !MGONGPUCPP_GPUIMPL //============================================================================ @@ -525,3 +525,207 @@ namespace mg5amcGpu #endif //============================================================================ + +// Fat binary implementation: runtime dispatch to the best available SIMD level. +// This section is compiled in every C++ (non-GPU) build: it is guarded only by the #ifndef MGONGPUCPP_GPUIMPL below, not by MGONGPU_CPPFAT. +// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, +// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements +// MatrixElementKernelHostFat, which detects the host CPU at construction time and +// delegates all computations to the best available SIMD version. + +#ifndef MGONGPUCPP_GPUIMPL + +// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags. +// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. + +#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ + namespace NS \ + { \ + void computeDependentCouplings( const fptype* allgs, \ + fptype* allcouplings, \ + const int nevt ); \ + void sigmaKin_getGoodHel( const fptype* allmomenta, \ + const fptype* allcouplings, \ + fptype* allMEs, \ + MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + bool* isGoodHel, \ + const int nevt ); \ + int sigmaKin_setGoodHel( const bool* isGoodHel ); \ + void sigmaKin( const fptype* allmomenta, \ + const fptype* allcouplings, \ + const fptype* allrndhel, \ + MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + fptype* allMEs, \ + int* allselhel, \ + MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + const int nevt ); \ + } + +// The multichannel-dependent parameters differ between builds +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ + fptype* allNumerators, \ + fptype* allDenominators, +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ + const fptype* allrndcol, \ + const unsigned int* allChannelIds, +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ + int* allselcol, \ + fptype* allNumerators, \ + fptype* allDenominators, +#else +#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ +#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ +#endif + +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) +MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) + +#undef MG5AMC_CPPFAT_FORWARD_DECLARE +#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS +#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS + +namespace mg5amcCpu +{ + + 
//-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, + const BufferGs& gs, + const BufferRndNumHelicity& rndhel, + const BufferRndNumColor& rndcol, + const BufferChannelIds& channelIds, + BufferMatrixElements& matrixElements, + BufferSelectedHelicity& selhel, + BufferSelectedColor& selcol, + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { // sanity checks: momenta, matrixElements and channelIds must be host buffers holding exactly nevt events (std::runtime_error otherwise) + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); + if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); + if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+ //-------------------------------------------------------------------------- + + // Detect the best SIMD level available on the current CPU at runtime. + SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) + { +#if defined( __x86_64__ ) || defined( __i386__ ) + // Both AVX512 variants (512y and 512z) are built with -march=skylake-avx512, so either may contain + // AVX512F, CD, BW, DQ and VL instructions: require all five features before dispatching to AVX512 code. + // A CPU with only part of that set (e.g. AVX512F without AVX512VL) falls through to the AVX2 check below. + // The 512y variant has the same hardware requirements as 512z and is therefore never auto-selected here. + const bool hasAvx512 = __builtin_cpu_supports( "avx512f" ) && __builtin_cpu_supports( "avx512cd" ) && __builtin_cpu_supports( "avx512bw" ) && __builtin_cpu_supports( "avx512dq" ) && __builtin_cpu_supports( "avx512vl" ); + if( hasAvx512 ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; + return SimdLevel::avx512z; + } + if( __builtin_cpu_supports( "avx2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; + return SimdLevel::avx2; + } + if( __builtin_cpu_supports( "sse4.2" ) ) + { + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; + return SimdLevel::sse4; + } + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; + return SimdLevel::none; +#else + // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none + if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; + return SimdLevel::sse4; +#endif + } + + //-------------------------------------------------------------------------- + +// Convenience macro to dispatch to the correct per-SIMD namespace +#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ + switch( m_selectedSimd ) \ + { \ + case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ + case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ + case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ + case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ + default: mg5amcCpu_none::CALL; break; \ + } + + int MatrixElementKernelHostFat::computeGoodHelicities() + { + HostBufferHelicityMask
hstIsGoodHel( CPPProcess::ncomb ); + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) +#else + MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) +#endif + int nGoodHel = 0; + switch( m_selectedSimd ) + { + case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; + } + return nGoodHel; + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) + { + MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int* pChannelIds = ( useChannelIds ?
m_channelIds.data() : nullptr ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) +#else + assert( useChannelIds == false ); + MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) +#endif +#ifdef MGONGPU_CHANNELID_DEBUG + MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); +#endif + } + +#undef MG5AMC_CPPFAT_DISPATCH + + //-------------------------------------------------------------------------- + +} // namespace mg5amcCpu + +#endif // !MGONGPUCPP_GPUIMPL + +//============================================================================ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index 16f8874888..a9d0521d4a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -156,6 +156,66 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- +#ifndef MGONGPUCPP_GPUIMPL + // Enum for the SIMD level selected at runtime in a fat binary + enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; + + // A class encapsulating matrix element calculations on a CPU host, with runtime + // dispatch to the best available SIMD implementation (for use in a fat binary). + // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into + // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, + // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the + // host CPU capabilities and delegates all ME computations to the best version.
+ class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers (throws std::runtime_error if momenta, matrixElements or channelIds are device buffers or do not hold nevt events, or if nevt is not a multiple of neppM) + MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHostFat(); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const bool useChannelIds ) override final; + + // Is this a host or device kernel?
+ bool isOnDevice() const override final { return false; } + + // Detect the best available SIMD level on the current CPU (static, so usable without an instance; the constructor calls it with verbose=true) + static SimdLevel detectBestSimd( const bool verbose = false ); + + private: + + // The selected SIMD level (detected once at construction time) + SimdLevel m_selectedSimd; + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index f5bf67efbc..affd0b3aa5 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -578,6 +578,8 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifeq ($(BACKEND),cppfat) + override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -585,6 +587,11 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif +# Add the MGONGPU_CPPFAT flag when building the fat binary backend +ifeq ($(BACKEND),cppfat) + CXXFLAGS += -DMGONGPU_CPPFAT +endif + # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -839,6 +846,60 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif +# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). +# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate +# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a +# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. +ifeq ($(BACKEND),cppfat) +# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march (one filter-out with both patterns: concatenating two separate filter-out results would duplicate every flag and put -march back) +CXXFLAGS_NOFAT := $(filter-out -march% -DMGONGPU_PVW512,$(CXXFLAGS)) +CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 +CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem +CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell +CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 +CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 + +$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ +endif + #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -853,7 +914,21 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp + +# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, +# once per SIMD level with a versioned namespace. All other objects are compiled once with the +# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o +cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) +else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o +endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1108,6 +1183,10 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) +bldfat: + @echo + $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) + ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk index b57e56d182..748080a70f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h index ae8ffaece8..d1b3a94fb9 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h @@ -305,4 +305,14 @@ ispoweroftwo( int n ) #endif #endif +// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), +// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). +// This allows compiling those translation units multiple times with different -march flags, +// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. +// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled +// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. 
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL +#define mg5amcCpu MGONGPU_SIMD_NAMESPACE +#endif + #endif // MGONGPUCONFIG_H From 5c19a0887bff00c43a8d748ce8f9b0f78d8eb4ad Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:22:38 +0000 Subject: [PATCH 2/4] Fix: revert deployed files to original, keep only CODEGEN changes; add Parameters_sm per-SIMD compilation, CPPProcess_base_cpp.o Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/db0d537e-e75b-4d0b-ba4c-b7f11fb41df6 Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com> --- .../iolibs/template_files/gpu/cudacpp.mk | 62 +++++- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/ee_mumu.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 10 - .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/ee_mumu.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 10 - .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gg_tt.mad/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 10 - epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 79 ------- epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk | 2 +- 
epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 10 - .../gg_tt01g.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 79 ------- .../gg_tt01g.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 10 - .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gg_ttg.mad/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 10 - .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gg_ttg.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 10 - .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gg_ttgg.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 10 - .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gg_ttgg.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 10 - .../gg_ttggg.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 79 ------- .../gg_ttggg.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 10 - 
.../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gg_ttggg.sa/src/cudacpp_config.mk | 2 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 10 - .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gq_ttq.mad/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 10 - .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 79 ------- .../cudacpp/gq_ttq.sa/src/cudacpp_config.mk | 2 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 10 - .../heft_gg_bb.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../heft_gg_bb.mad/SubProcesses/cudacpp.mk | 79 ------- .../heft_gg_bb.mad/src/cudacpp_config.mk | 2 +- .../heft_gg_bb.mad/src/mgOnGpuConfig.h | 10 - .../heft_gg_bb.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../heft_gg_bb.sa/SubProcesses/cudacpp.mk | 79 ------- .../heft_gg_bb.sa/src/cudacpp_config.mk | 2 +- .../cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h | 10 - .../nobm_pp_ttW.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../nobm_pp_ttW.mad/SubProcesses/cudacpp.mk | 79 ------- .../nobm_pp_ttW.mad/src/cudacpp_config.mk | 2 +- .../nobm_pp_ttW.mad/src/mgOnGpuConfig.h | 10 - .../pp_tt012j.mad/SubProcesses/Bridge.h | 9 +- 
.../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 79 ------- .../pp_tt012j.mad/src/cudacpp_config.mk | 2 +- .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h | 10 - .../smeft_gg_tttt.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../smeft_gg_tttt.mad/SubProcesses/cudacpp.mk | 79 ------- .../smeft_gg_tttt.mad/src/cudacpp_config.mk | 2 +- .../smeft_gg_tttt.mad/src/mgOnGpuConfig.h | 10 - .../smeft_gg_tttt.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../smeft_gg_tttt.sa/SubProcesses/cudacpp.mk | 79 ------- .../smeft_gg_tttt.sa/src/cudacpp_config.mk | 2 +- .../smeft_gg_tttt.sa/src/mgOnGpuConfig.h | 10 - .../susy_gg_t1t1.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../susy_gg_t1t1.mad/SubProcesses/cudacpp.mk | 79 ------- .../susy_gg_t1t1.mad/src/cudacpp_config.mk | 2 +- .../susy_gg_t1t1.mad/src/mgOnGpuConfig.h | 10 - .../susy_gg_t1t1.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../susy_gg_t1t1.sa/SubProcesses/cudacpp.mk | 79 ------- .../susy_gg_t1t1.sa/src/cudacpp_config.mk | 2 +- .../susy_gg_t1t1.sa/src/mgOnGpuConfig.h | 10 - .../susy_gg_tt.mad/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- .../SubProcesses/MatrixElementKernels.h | 60 ----- .../susy_gg_tt.mad/SubProcesses/cudacpp.mk | 79 ------- .../susy_gg_tt.mad/src/cudacpp_config.mk | 2 +- .../susy_gg_tt.mad/src/mgOnGpuConfig.h | 10 - .../susy_gg_tt.sa/SubProcesses/Bridge.h | 9 +- .../SubProcesses/MatrixElementKernels.cc | 206 +----------------- 
.../SubProcesses/MatrixElementKernels.h | 60 ----- .../susy_gg_tt.sa/SubProcesses/cudacpp.mk | 79 ------- .../susy_gg_tt.sa/src/cudacpp_config.mk | 2 +- .../cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h | 10 - 139 files changed, 122 insertions(+), 8358 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index b17c12bff9..b37a5ed719 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -883,6 +883,11 @@ CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 +# Identify the process-specific Parameters_.cc source file in src/. +# It must be defined here (before the rules below) so $(PARAMETERS_STEM) is expanded at parse time. +PARAMETERS_SRC := $(wildcard ../../src/Parameters_*.cc) +PARAMETERS_STEM := $(basename $(notdir $(PARAMETERS_SRC))) + $(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ @@ -922,6 +927,42 @@ $(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.bu $(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +# Per-SIMD compilation rules for Parameters_.cc. +# Parameters_ is model-specific (e.g. 
Parameters_sm.cc) and must be compiled per SIMD level +# because CPPProcess.cc includes Parameters_.h, which uses the mg5amcCpu namespace macro. +# When the macro is active, Parameters_::getInstance() etc. end up in the versioned namespace, +# so the matching definitions must also be compiled into that namespace. +$(BUILDDIR)/$(PARAMETERS_STEM)_none_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ + +$(BUILDDIR)/$(PARAMETERS_STEM)_sse4_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ + +$(BUILDDIR)/$(PARAMETERS_STEM)_avx2_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ + +$(BUILDDIR)/$(PARAMETERS_STEM)_512y_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ + +$(BUILDDIR)/$(PARAMETERS_STEM)_512z_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ + +# Base (compatibility) objects compiled without MGONGPU_SIMD_NAMESPACE, staying in mg5amcCpu namespace. 
+# These provide mg5amcCpu::CPPProcess (for check_sa.cc and param_card reading) and +# mg5amcCpu::sigmaKin (for MatrixElementKernelHost, the non-fat fallback kernel). +$(BUILDDIR)/CPPProcess_base_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -c $< -o $@ + +$(BUILDDIR)/color_sum_base_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -c $< -o $@ endif #------------------------------------------------------------------------------- @@ -939,16 +980,19 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. +# For the fat binary backend (cppfat), CPPProcess, color_sum, and Parameters_ are compiled +# multiple times, once per SIMD level with a versioned namespace. Parameters_ must be compiled +# per SIMD level because CPPProcess.cc references it, and when CPPProcess.cc is compiled with +# -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, all mg5amcCpu:: references (including Parameters_) +# are renamed to mg5amcCpu_::. The dispatcher (MatrixElementKernels.cc) links all versions. 
ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) +cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_none_cpp.o +cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_sse4_cpp.o +cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_avx2_cpp.o +cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_512y_cpp.o +cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_512z_cpp.o +cppfat_objects_base=$(BUILDDIR)/CPPProcess_base_cpp.o $(BUILDDIR)/color_sum_base_cpp.o +cppfat_objects_all=$(cppfat_objects_base) $(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. 
-// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - 
-MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM 
= MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct 
per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z 
cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto 
ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. - std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. - -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. 
-// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - 
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - 
- // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( 
computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/heft_gg_bb.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 
cpp512y cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/heft_gg_bb.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_bb.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 
cpp512y cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/nobm_pp_ttW.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 
cppavx2 cpp512y cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 
cppavx2 cpp512y cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/smeft_gg_tttt.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 
cppavx2 cpp512y cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 
cpp512y cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_t1t1.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_tt.mad/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 
cpp512y cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h index b2623bd894..0bfd669ab7 100644 --- a/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.mad/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. -#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index 0fb6ccaf90..4e3f17e0dd 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -194,9 +194,7 @@ namespace mg5amcCpu HostBufferSelectedHelicity m_hstSelHel; HostBufferSelectedColor m_hstSelCol; HostBufferChannelIds m_hstChannelIds; - // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost; - // use the common base class pointer to support both. 
- std::unique_ptr m_pmek; + std::unique_ptr m_pmek; #endif }; @@ -283,13 +281,8 @@ namespace mg5amcCpu std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; #endif -#ifdef MGONGPU_CPPFAT - m_pmek.reset( new MatrixElementKernelHostFat( - m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#else m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPU_CPPFAT #endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index c4b8037cc9..b61df224f1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -288,7 +288,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- } -#endif // !MGONGPUCPP_GPUIMPL +#endif //============================================================================ @@ -525,207 +525,3 @@ namespace mg5amcGpu #endif //============================================================================ - -// Fat binary implementation: runtime dispatch to the best available SIMD level. -// This section is compiled unconditionally (no MGONGPUCPP_GPUIMPL guard) for C++ builds. -// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none, -// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements -// MatrixElementKernelHostFat, which detects the host CPU at construction time and -// delegates all computations to the best available SIMD version. 
- -#ifndef MGONGPUCPP_GPUIMPL - -// Forward declarations for per-SIMD namespaces. -// These are resolved at link time to the object files compiled with appropriate -march flags. -// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h. - -#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \ - namespace NS \ - { \ - void computeDependentCouplings( const fptype* allgs, \ - fptype* allcouplings, \ - const int nevt ); \ - void sigmaKin_getGoodHel( const fptype* allmomenta, \ - const fptype* allcouplings, \ - fptype* allMEs, \ - MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - bool* isGoodHel, \ - const int nevt ); \ - int sigmaKin_setGoodHel( const bool* isGoodHel ); \ - void sigmaKin( const fptype* allmomenta, \ - const fptype* allcouplings, \ - const fptype* allrndhel, \ - MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - fptype* allMEs, \ - int* allselhel, \ - MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - const int nevt ); \ - } - -// The multichannel-dependent parameters differ between builds -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \ - fptype* allNumerators, \ - fptype* allDenominators, -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \ - const fptype* allrndcol, \ - const unsigned int* allChannelIds, -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \ - int* allselcol, \ - fptype* allNumerators, \ - fptype* allDenominators, -#else -#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/ -#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/ -#endif - -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y ) -MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z ) - -#undef MG5AMC_CPPFAT_FORWARD_DECLARE -#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS -#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS -#undef 
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS - -namespace mg5amcCpu -{ - - //-------------------------------------------------------------------------- - - MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta, - const BufferGs& gs, - const BufferRndNumHelicity& rndhel, - const BufferRndNumColor& rndcol, - const BufferChannelIds& channelIds, - BufferMatrixElements& matrixElements, - BufferSelectedHelicity& selhel, - BufferSelectedColor& selcol, - const size_t nevt ) - : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol ) - , NumberOfEvents( nevt ) - , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) - , m_couplings( nevt ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - , m_numerators( nevt ) - , m_denominators( nevt ) -#endif - { - if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" ); - if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" ); - if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a device array" ); - if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" ); - if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" ); - if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" ); - constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); - if( nevt % neppM != 0 ) - { - std::ostringstream sstr; - sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM; - throw std::runtime_error( sstr.str() ); - } - } - - //-------------------------------------------------------------------------- - 
- MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {} - - //-------------------------------------------------------------------------- - - // Detect the best SIMD level available on the current CPU at runtime. - SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) - { -#if defined( __x86_64__ ) || defined( __i386__ ) - if( __builtin_cpu_supports( "avx512vl" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl; - return SimdLevel::avx512z; - } - if( __builtin_cpu_supports( "avx512f" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" << std::endl; - return SimdLevel::avx512y; - } - if( __builtin_cpu_supports( "avx2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl; - return SimdLevel::avx2; - } - if( __builtin_cpu_supports( "sse4.2" ) ) - { - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl; - return SimdLevel::sse4; - } - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl; - return SimdLevel::none; -#else - // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none - if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl; - return SimdLevel::sse4; -#endif - } - - //-------------------------------------------------------------------------- - -// Convenience macro to dispatch to the correct per-SIMD namespace -#define MG5AMC_CPPFAT_DISPATCH( CALL ) \ - switch( m_selectedSimd ) \ - { \ - case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \ - case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \ - case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \ - case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \ - default: mg5amcCpu_none::CALL; break; \ - } - - int 
MatrixElementKernelHostFat::computeGoodHelicities() - { - HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) ) -#else - MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) ) -#endif - int nGoodHel = 0; - switch( m_selectedSimd ) - { - case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break; - } - return nGoodHel; - } - - //-------------------------------------------------------------------------- - - void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds ) - { - MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) ) -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - const unsigned int* pChannelIds = ( useChannelIds ? 
m_channelIds.data() : nullptr ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) ) -#else - assert( useChannelIds == false ); - MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) ) -#endif -#ifdef MGONGPU_CHANNELID_DEBUG - MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); -#endif - } - -#undef MG5AMC_CPPFAT_DISPATCH - - //-------------------------------------------------------------------------- - -} // namespace mg5amcCpu - -#endif // !MGONGPUCPP_GPUIMPL - -//============================================================================ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index a9d0521d4a..16f8874888 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -156,66 +156,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL - // Enum for the SIMD level selected at runtime in a fat binary - enum class SimdLevel { none, sse4, avx2, avx512y, avx512z }; - - // A class encapsulating matrix element calculations on a CPU host, with runtime - // dispatch to the best available SIMD implementation (for use in a fat binary). - // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into - // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2, - // mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the - // host CPU capabilities and delegates all ME computations to the best version. 
- class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents - { - public: - - // Constructor from existing input and output buffers - MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta - const BufferGs& gs, // input: gs for alphaS - const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection - const BufferRndNumColor& rndcol, // input: random numbers for color selection - const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement - BufferMatrixElements& matrixElements, // output: matrix elements - BufferSelectedHelicity& selhel, // output: helicity selection - BufferSelectedColor& selcol, // output: color selection - const size_t nevt ); - - // Destructor - virtual ~MatrixElementKernelHostFat(); - - // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) - int computeGoodHelicities() override final; - - // Compute matrix elements - void computeMatrixElements( const bool useChannelIds ) override final; - - // Is this a host or device kernel? 
- bool isOnDevice() const override final { return false; } - - // Detect the best available SIMD level on the current CPU - static SimdLevel detectBestSimd( const bool verbose = false ); - - private: - - // The selected SIMD level (detected once at construction time) - SimdLevel m_selectedSimd; - - // The buffer for the event-by-event couplings that depends on alphas QCD - HostBufferCouplings m_couplings; - -#ifdef MGONGPU_SUPPORTS_MULTICHANNEL - // The buffer for the event-by-event numerators of multichannel factors - HostBufferNumerators m_numerators; - - // The buffer for the event-by-event denominators of multichannel factors - HostBufferDenominators m_denominators; -#endif - }; -#endif - - //-------------------------------------------------------------------------- - #ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index affd0b3aa5..f5bf67efbc 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -578,8 +578,6 @@ else override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] else ifeq ($(BACKEND),cpp512z) override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else ifeq ($(BACKEND),cppfat) - override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see below) endif endif # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations? 
@@ -587,11 +585,6 @@ ifeq ($(GPUCC),) CXXFLAGS+= $(AVXFLAGS) endif -# Add the MGONGPU_CPPFAT flag when building the fat binary backend -ifeq ($(BACKEND),cppfat) - CXXFLAGS += -DMGONGPU_CPPFAT -endif - # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") $(info FPTYPE='$(FPTYPE)') ifeq ($(FPTYPE),d) @@ -846,60 +839,6 @@ $(BUILDDIR)/%_$(GPUSUFFIX).o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@ endif -# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat). -# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate -# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_, placing all symbols in a -# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together. -ifeq ($(BACKEND),cppfat) -# Strip AVXFLAGS (which is -march=x86-64 for cppfat) from per-SIMD compilations; add per-SIMD -march -CXXFLAGS_NOFAT := $(filter-out -march%,$(CXXFLAGS)) $(filter-out -DMGONGPU_PVW512,$(CXXFLAGS)) -CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64 -CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem -CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell -CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256 -CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512 - -$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ - -$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@ - -$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@ - -$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@ - -$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@ - -$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@ -endif - #------------------------------------------------------------------------------- # Target (and build rules): common (src) library @@ -914,21 +853,7 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') ###$(info processid_short=$(processid_short)) MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp - -# For the fat binary backend (cppfat), CPPProcess and color_sum are compiled multiple times, -# once per SIMD level with a versioned namespace. All other objects are compiled once with the -# baseline x86-64 flags (no SIMD). The dispatcher (MatrixElementKernels.cc) links all versions. 
-ifeq ($(BACKEND),cppfat) -cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o -cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o -cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o -cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o -cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o -cppfat_objects_all=$(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z) -cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o $(cppfat_objects_all) -else cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o -endif cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o ifneq ($(GPUCC),) @@ -1183,10 +1108,6 @@ bld512z: @echo $(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE) -bldfat: - @echo - $(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE) - ifeq ($(UNAME_P),ppc64le) ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4 bldavxs: bldnone bldsse4 diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk index 748080a70f..b57e56d182 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y 
cpp512z cppauto ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h index d1b3a94fb9..ae8ffaece8 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h @@ -305,14 +305,4 @@ ispoweroftwo( int n ) #endif #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. 
-#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL -#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H From 80e37ce8ac6f9ea59f3e9bcb6e711e502c687a62 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:23:46 +0000 Subject: [PATCH 3/4] Fix duplicate cppfat in SUPPORTED_BACKENDS and duplicate block in mgOnGpuConfig.h Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/db0d537e-e75b-4d0b-ba4c-b7f11fb41df6 Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com> --- .../madgraph/iolibs/template_files/gpu/cudacpp_config.mk | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk index b920463d6c..748080a70f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk @@ -34,7 +34,7 @@ endif # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words) -override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat cppfat +override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1) $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)')) endif diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index b3792ad8ec..ef17addc16 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -315,13 +315,4 @@ ispoweroftwo( int n ) #define mg5amcCpu MGONGPU_SIMD_NAMESPACE #endif -// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2), -// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum). -// This allows compiling those translation units multiple times with different -march flags, -// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary. -// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled -// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace. 
-#define mg5amcCpu MGONGPU_SIMD_NAMESPACE -#endif - #endif // MGONGPUCONFIG_H From e3a62b3decd492b535608c6e1a1a09201b4fbeec Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 14 Apr 2026 08:58:53 +0000 Subject: [PATCH 4/4] Fat binary: add MGONGPU_SIMD_LEVEL env-var to override runtime SIMD selection Agent-Logs-Url: https://github.com/madgraph5/madgraph4gpu/sessions/0bd7a4a4-bb24-4bbf-b578-58747cf4f9fa Co-authored-by: oliviermattelaer <33414646+oliviermattelaer@users.noreply.github.com> --- .../gpu/MatrixElementKernels.cc | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index c4b8037cc9..392c82947f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -637,8 +637,72 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- // Detect the best SIMD level available on the current CPU at runtime. + // If the environment variable MGONGPU_SIMD_LEVEL is set to one of the + // recognised level names (avx512z, avx512y, avx2, sse4, none) the + // requested level is used *provided* the hardware actually supports it. + // An unsupported or unrecognised value triggers a warning and falls back + // to auto-detection. 
This allows benchmarking a lower SIMD tier on a + // machine that supports a higher one, e.g.: + // MGONGPU_SIMD_LEVEL=avx2 ./check_cpp.exe # force AVX2 on AVX512 HW SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose ) { + // --- optional user override via MGONGPU_SIMD_LEVEL --- + const char* simdEnv = getenv( "MGONGPU_SIMD_LEVEL" ); + if( simdEnv != nullptr ) + { + const std::string requested( simdEnv ); + SimdLevel req = SimdLevel::none; // initialised to keep compiler happy + bool knownLevel = true; + if( requested == "avx512z" ) + req = SimdLevel::avx512z; + else if( requested == "avx512y" ) + req = SimdLevel::avx512y; + else if( requested == "avx2" ) + req = SimdLevel::avx2; + else if( requested == "sse4" ) + req = SimdLevel::sse4; + else if( requested == "none" ) + req = SimdLevel::none; + else + { + std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested + << "' is not recognised (valid values: avx512z avx512y avx2 sse4 none)." + << " Falling back to auto-detection." << std::endl; + knownLevel = false; + } + if( knownLevel ) + { + // Safety check: refuse to use a level the hardware cannot execute. + bool hwOk = false; +#if defined( __x86_64__ ) || defined( __i386__ ) + switch( req ) + { + case SimdLevel::avx512z: hwOk = __builtin_cpu_supports( "avx512vl" ); break; + case SimdLevel::avx512y: hwOk = __builtin_cpu_supports( "avx512f" ); break; + case SimdLevel::avx2: hwOk = __builtin_cpu_supports( "avx2" ); break; + case SimdLevel::sse4: hwOk = __builtin_cpu_supports( "sse4.2" ); break; + case SimdLevel::none: hwOk = true; break; + } +#else + // Non-x86: only sse4 (NEON/VSX) and none are meaningful overrides. 
+ hwOk = ( req == SimdLevel::sse4 || req == SimdLevel::none ); +#endif + if( hwOk ) + { + if( verbose ) + std::cout << "INFO: Fat binary: MGONGPU_SIMD_LEVEL override: selected SIMD level " + << requested << std::endl; + return req; + } + else + { + std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested + << "' is not supported by this CPU." + << " Falling back to auto-detection." << std::endl; + } + } + } + // --- auto-detection --- #if defined( __x86_64__ ) || defined( __i386__ ) if( __builtin_cpu_supports( "avx512vl" ) ) {