diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
index 4e3f17e0dd..0fb6ccaf90 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -194,7 +194,9 @@ namespace mg5amcCpu
     HostBufferSelectedHelicity m_hstSelHel;
     HostBufferSelectedColor m_hstSelCol;
     HostBufferChannelIds m_hstChannelIds;
-    std::unique_ptr<MatrixElementKernelHost> m_pmek;
+    // In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
+    // use the common base class pointer to support both.
+    std::unique_ptr<MatrixElementKernelBase> m_pmek;
 #endif
   };
 
@@ -281,8 +283,13 @@ namespace mg5amcCpu
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
 #endif
 
+#ifdef MGONGPU_CPPFAT
+    m_pmek.reset( new MatrixElementKernelHostFat(
+      m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#else
     m_pmek.reset( new MatrixElementKernelHost(
       m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
+#endif // MGONGPU_CPPFAT
 #endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
index b61df224f1..392c82947f 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc
@@ -288,7 +288,7 @@ namespace mg5amcCpu
   //--------------------------------------------------------------------------
 
 }
-#endif
+#endif // !MGONGPUCPP_GPUIMPL
 
 //============================================================================
 
@@ -525,3 +525,271 @@ namespace mg5amcGpu
 #endif
 
 //============================================================================
+
+// Fat binary implementation: runtime dispatch to the best available SIMD level.
+// This section is compiled only in fat-binary C++ builds (MGONGPU_CPPFAT defined, MGONGPUCPP_GPUIMPL undefined): the per-SIMD objects it refers to are only built for BACKEND=cppfat.
+// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
+// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
+// MatrixElementKernelHostFat, which detects the host CPU at construction time and
+// delegates all computations to the best available SIMD version.
+
+#if !defined MGONGPUCPP_GPUIMPL && defined MGONGPU_CPPFAT
+
+// Forward declarations for per-SIMD namespaces.
+// These are resolved at link time to the object files compiled with appropriate -march flags.
+// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
+
+#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \
+  namespace NS \
+  { \
+    void computeDependentCouplings( const fptype* allgs, \
+                                    fptype* allcouplings, \
+                                    const int nevt ); \
+    void sigmaKin_getGoodHel( const fptype* allmomenta, \
+                              const fptype* allcouplings, \
+                              fptype* allMEs, \
+                              MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+                              bool* isGoodHel, \
+                              const int nevt ); \
+    int sigmaKin_setGoodHel( const bool* isGoodHel ); \
+    void sigmaKin( const fptype* allmomenta, \
+                   const fptype* allcouplings, \
+                   const fptype* allrndhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+                   fptype* allMEs, \
+                   int* allselhel, \
+                   MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+                   const int nevt ); \
+  }
+
+// The multichannel-dependent parameters differ between builds (these helpers are only needed where MG5AMC_CPPFAT_FORWARD_DECLARE is expanded, i.e. below)
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
+  const fptype* allrndcol, \
+  const unsigned int* allChannelIds,
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
+  int* allselcol, \
+  fptype* allNumerators, \
+  fptype* allDenominators,
+#else
+#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/
+#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/
+#endif
+
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
+MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )
+
+#undef MG5AMC_CPPFAT_FORWARD_DECLARE
+#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
+#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS
+
+namespace mg5amcCpu
+{
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
+                                                          const BufferGs& gs,
+                                                          const BufferRndNumHelicity& rndhel,
+                                                          const BufferRndNumColor& rndcol,
+                                                          const BufferChannelIds& channelIds,
+                                                          BufferMatrixElements& matrixElements,
+                                                          BufferSelectedHelicity& selhel,
+                                                          BufferSelectedColor& selcol,
+                                                          const size_t nevt )
+    : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
+    , NumberOfEvents( nevt )
+    , m_selectedSimd( detectBestSimd( /*verbose=*/true ) ) // chosen once per kernel instance; prints the selected level
+    , m_couplings( nevt )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    , m_numerators( nevt )
+    , m_denominators( nevt )
+#endif
+  {
+    if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
+    if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
+    if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
+    if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
+    if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
+    if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
+    constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
+    static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
+    if( nevt % neppM != 0 )
+    {
+      std::ostringstream sstr;
+      sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
+      throw std::runtime_error( sstr.str() );
+    }
+  }
+
+  //--------------------------------------------------------------------------
+
+  MatrixElementKernelHostFat::~MatrixElementKernelHostFat() {}
+
+  //--------------------------------------------------------------------------
+
+  // Detect the best SIMD level available on the current CPU at runtime.
+  // If the environment variable MGONGPU_SIMD_LEVEL is set to one of the
+  // recognised level names (avx512z, avx512y, avx2, sse4, none) the
+  // requested level is used *provided* the hardware actually supports it.
+  // An unsupported or unrecognised value triggers a warning and falls back
+  // to auto-detection. This allows benchmarking a lower SIMD tier on a
+  // machine that supports a higher one, e.g.:
+  //   MGONGPU_SIMD_LEVEL=avx2 ./check_cpp.exe   # force AVX2 on AVX512 HW
+  SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
+  {
+    // --- optional user override via MGONGPU_SIMD_LEVEL ---
+    const char* simdEnv = getenv( "MGONGPU_SIMD_LEVEL" );
+    if( simdEnv != nullptr )
+    {
+      const std::string requested( simdEnv );
+      SimdLevel req = SimdLevel::none; // initialised to keep compiler happy
+      bool knownLevel = true;
+      if( requested == "avx512z" )
+        req = SimdLevel::avx512z;
+      else if( requested == "avx512y" )
+        req = SimdLevel::avx512y;
+      else if( requested == "avx2" )
+        req = SimdLevel::avx2;
+      else if( requested == "sse4" )
+        req = SimdLevel::sse4;
+      else if( requested == "none" )
+        req = SimdLevel::none;
+      else
+      {
+        std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
+                  << "' is not recognised (valid values: avx512z avx512y avx2 sse4 none)."
+                  << " Falling back to auto-detection." << std::endl;
+        knownLevel = false;
+      }
+      if( knownLevel )
+      {
+        // Safety check: refuse to use a level the hardware cannot execute.
+        bool hwOk = false;
+#if defined( __x86_64__ ) || defined( __i386__ )
+        switch( req )
+        {
+          case SimdLevel::avx512z: hwOk = __builtin_cpu_supports( "avx512vl" ); break;
+          case SimdLevel::avx512y: hwOk = __builtin_cpu_supports( "avx512vl" ); break; // same -march=skylake-avx512 objects as 512z: ymm-width AVX512 code needs AVX512VL, AVX512F alone is not enough
+          case SimdLevel::avx2: hwOk = __builtin_cpu_supports( "avx2" ); break;
+          case SimdLevel::sse4: hwOk = __builtin_cpu_supports( "sse4.2" ); break;
+          case SimdLevel::none: hwOk = true; break;
+        }
+#else
+        // Non-x86: only sse4 (NEON/VSX) and none are meaningful overrides.
+        hwOk = ( req == SimdLevel::sse4 || req == SimdLevel::none );
+#endif
+        if( hwOk )
+        {
+          if( verbose )
+            std::cout << "INFO: Fat binary: MGONGPU_SIMD_LEVEL override: selected SIMD level "
+                      << requested << std::endl;
+          return req;
+        }
+        else
+        {
+          std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
+                    << "' is not supported by this CPU."
+                    << " Falling back to auto-detection." << std::endl;
+        }
+      }
+    }
+    // --- auto-detection ---
+#if defined( __x86_64__ ) || defined( __i386__ )
+    if( __builtin_cpu_supports( "avx512vl" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" << std::endl;
+      return SimdLevel::avx512z;
+    }
+    // NB: there is no separate auto-detected avx512y tier. The 512y and 512z objects are both
+    // built with -march=skylake-avx512 and therefore both need AVX512VL: any CPU able to run
+    // 512y has already been assigned 512z above, while a CPU with AVX512F but without AVX512VL
+    // can run neither of them and must fall through to avx2. The avx512y build remains
+    // selectable through the MGONGPU_SIMD_LEVEL=avx512y override.
+    if( __builtin_cpu_supports( "avx2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" << std::endl;
+      return SimdLevel::avx2;
+    }
+    if( __builtin_cpu_supports( "sse4.2" ) )
+    {
+      if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" << std::endl;
+      return SimdLevel::sse4;
+    }
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level none (no SIMD)" << std::endl;
+    return SimdLevel::none;
+#else
+    // Non-x86: use sse4 (covers ARM NEON/Power VSX) or none
+    if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
+    return SimdLevel::sse4;
+#endif
+  }
+
+  //--------------------------------------------------------------------------
+
+// Convenience macro to dispatch to the correct per-SIMD namespace
+#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
+  switch( m_selectedSimd ) \
+  { \
+    case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
+    case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
+    case SimdLevel::avx2: mg5amcCpu_avx2::CALL;
break; \
+    case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
+    default: mg5amcCpu_none::CALL; break; \
+  }
+
+  int MatrixElementKernelHostFat::computeGoodHelicities()
+  {
+    HostBufferHelicityMask hstIsGoodHel( CPPProcess::ncomb ); // one flag per helicity combination (ncomb in total)
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ) )
+#else
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ) )
+#endif
+    int nGoodHel = 0;
+    switch( m_selectedSimd ) // spelled out by hand: MG5AMC_CPPFAT_DISPATCH expands to a statement and cannot hand back the return value
+    {
+      case SimdLevel::avx512z: nGoodHel = mg5amcCpu_512z::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx512y: nGoodHel = mg5amcCpu_512y::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::avx2: nGoodHel = mg5amcCpu_avx2::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      case SimdLevel::sse4: nGoodHel = mg5amcCpu_sse4::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+      default: nGoodHel = mg5amcCpu_none::sigmaKin_setGoodHel( hstIsGoodHel.data() ); break;
+    }
+    return nGoodHel;
+  }
+
+  //--------------------------------------------------------------------------
+
+  void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
+  {
+    MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    const unsigned int* pChannelIds = ( useChannelIds ? m_channelIds.data() : nullptr ); // a null pointer is passed when channel ids are not used
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), pChannelIds, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), m_numerators.data(), m_denominators.data(), nevt() ) )
+#else
+    assert( useChannelIds == false ); // channel ids can only be used in builds with MGONGPU_SUPPORTS_MULTICHANNEL
+    MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_matrixElements.data(), m_selhel.data(), nevt() ) )
+#endif
+#ifdef MGONGPU_CHANNELID_DEBUG
+    MatrixElementKernelBase::updateNevtProcessedByChannel( pChannelIds, nevt() ); // NOTE(review): pChannelIds is only declared in the multichannel branch above - confirm this debug flag is never used without it
+#endif
+  }
+
+#undef MG5AMC_CPPFAT_DISPATCH
+
+  //--------------------------------------------------------------------------
+
+} // namespace mg5amcCpu
+
+#endif // !MGONGPUCPP_GPUIMPL
+
+//============================================================================
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
index 16f8874888..a9d0521d4a 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h
@@ -156,6 +156,66 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
+#ifndef MGONGPUCPP_GPUIMPL
+  // Enum for the SIMD level selected at runtime in a fat binary
+  enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };
+
+  // A class encapsulating matrix element calculations on a CPU host, with runtime
+  // dispatch to the best available SIMD implementation (for use in a fat binary).
+  // The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
+  // separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
+  // mg5amcCpu_512y, mg5amcCpu_512z).
At construction time, this class detects the
+  // host CPU capabilities and delegates all ME computations to the best version.
+  class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
+  {
+  public:
+
+    // Constructor from existing input and output buffers (throws std::runtime_error on device buffers, nevt mismatches or nevt not a multiple of neppM)
+    MatrixElementKernelHostFat( const BufferMomenta& momenta,         // input: momenta
+                                const BufferGs& gs,                   // input: gs for alphaS
+                                const BufferRndNumHelicity& rndhel,   // input: random numbers for helicity selection
+                                const BufferRndNumColor& rndcol,      // input: random numbers for color selection
+                                const BufferChannelIds& channelIds,   // input: channel ids for single-diagram enhancement
+                                BufferMatrixElements& matrixElements, // output: matrix elements
+                                BufferSelectedHelicity& selhel,       // output: helicity selection
+                                BufferSelectedColor& selcol,          // output: color selection
+                                const size_t nevt );
+
+    // Destructor
+    virtual ~MatrixElementKernelHostFat();
+
+    // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
+    int computeGoodHelicities() override final;
+
+    // Compute matrix elements (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
+    void computeMatrixElements( const bool useChannelIds ) override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return false; }
+
+    // Detect the best available SIMD level on the current CPU (a recognised and supported MGONGPU_SIMD_LEVEL environment override takes precedence)
+    static SimdLevel detectBestSimd( const bool verbose = false );
+
+  private:
+
+    // The selected SIMD level (detected once at construction time)
+    SimdLevel m_selectedSimd;
+
+    // The buffer for the event-by-event couplings that depends on alphas QCD
+    HostBufferCouplings m_couplings;
+
+#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
+    // The buffer for the event-by-event numerators of multichannel factors
+    HostBufferNumerators m_numerators;
+
+    // The buffer for the event-by-event denominators of multichannel factors
+    HostBufferDenominators m_denominators;
+#endif
+  };
+#endif
+
+  //--------------------------------------------------------------------------
+
 #ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating matrix element calculations on a GPU device
   class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
index 583f3df0c9..b37a5ed719 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk
@@ -602,6 +602,8 @@ else
     override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc]
   else ifeq ($(BACKEND),cpp512z)
     override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers)
+  else ifeq ($(BACKEND),cppfat)
+    override AVXFLAGS = -march=x86-64 # Fat binary: dispatcher uses baseline x86-64; per-SIMD objects use their own -march flags (see the CXXFLAGS_FAT_* variables below)
   endif
 endif
 # For the moment, use AVXFLAGS everywhere (in C++ builds): eventually, use them only in encapsulated implementations?
@@ -609,6 +611,11 @@ ifeq ($(GPUCC),)
   CXXFLAGS+= $(AVXFLAGS)
 endif
 
+# Add the MGONGPU_CPPFAT flag when building the fat binary backend
+ifeq ($(BACKEND),cppfat)
+  CXXFLAGS += -DMGONGPU_CPPFAT
+endif
+
 # Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f")
 $(info FPTYPE='$(FPTYPE)')
 ifeq ($(FPTYPE),d)
@@ -863,6 +870,101 @@ $(BUILDDIR)/%%_$(GPUSUFFIX).o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG
 	$(GPUCC) $(CPPFLAGS) $(INCFLAGS) $(GPUFLAGS) -c -x $(GPULANGUAGE) $< -o $@
 endif
 
+# Per-SIMD compilation rules for the fat binary (BACKEND=cppfat).
+# Each SIMD variant of CPPProcess.cc and color_sum.cc is compiled with the appropriate
+# -march flag and -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, placing all symbols in a
+# versioned namespace. The dispatcher (MatrixElementKernels.cc) links them all together.
+ifeq ($(BACKEND),cppfat)
+# Start from a single copy of CXXFLAGS without any -march flag (AVXFLAGS is -march=x86-64 for cppfat) and without -DMGONGPU_PVW512; each per-SIMD variable below adds its own (NB: in this template the make wildcard is written as a doubled percent sign, as in the pattern rules above)
+CXXFLAGS_NOFAT := $(filter-out -march%% -DMGONGPU_PVW512,$(CXXFLAGS))
+CXXFLAGS_FAT_NONE := $(CXXFLAGS_NOFAT) -march=x86-64
+CXXFLAGS_FAT_SSE4 := $(CXXFLAGS_NOFAT) -march=nehalem
+CXXFLAGS_FAT_AVX2 := $(CXXFLAGS_NOFAT) -march=haswell
+CXXFLAGS_FAT_512Y := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -mprefer-vector-width=256
+CXXFLAGS_FAT_512Z := $(CXXFLAGS_NOFAT) -march=skylake-avx512 -DMGONGPU_PVW512
+
+# Identify the process-specific Parameters_<model>.cc source file in src/.
+# It must be defined here (before the rules below) so $(PARAMETERS_STEM) is expanded at parse time.
+PARAMETERS_SRC := $(wildcard ../../src/Parameters_*.cc)
+PARAMETERS_STEM := $(basename $(notdir $(PARAMETERS_SRC)))
+
+$(BUILDDIR)/CPPProcess_none_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_sse4_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_avx2_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512y_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/CPPProcess_512z_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+$(BUILDDIR)/color_sum_none_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/color_sum_sse4_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/color_sum_avx2_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/color_sum_512y_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/color_sum_512z_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+# Per-SIMD compilation rules for Parameters_<model>.cc.
+# Parameters_<model> is model-specific (e.g. Parameters_sm.cc) and must be compiled per SIMD level
+# because CPPProcess.cc includes Parameters_<model>.h, which uses the mg5amcCpu namespace macro.
+# When the macro is active, Parameters_<model>::getInstance() etc. end up in the versioned namespace,
+# so the matching definitions must also be compiled into that namespace.
+$(BUILDDIR)/$(PARAMETERS_STEM)_none_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_none -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_sse4_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_SSE4) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_sse4 -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_avx2_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_AVX2) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2 -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_512y_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Y) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512y -c $< -o $@
+
+$(BUILDDIR)/$(PARAMETERS_STEM)_512z_cpp.o: $(PARAMETERS_SRC) ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_512Z) -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_512z -c $< -o $@
+
+# Base (compatibility) objects compiled without MGONGPU_SIMD_NAMESPACE, staying in mg5amcCpu namespace.
+# These provide mg5amcCpu::CPPProcess (for check_sa.cc and param_card reading) and
+# mg5amcCpu::sigmaKin (for MatrixElementKernelHost, the non-fat fallback kernel).
+$(BUILDDIR)/CPPProcess_base_cpp.o: CPPProcess.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -c $< -o $@
+
+$(BUILDDIR)/color_sum_base_cpp.o: color_sum.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
+	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
+	$(CXX) $(CPPFLAGS) $(INCFLAGS) $(CXXFLAGS_FAT_NONE) -c $< -o $@
+endif
+
 #-------------------------------------------------------------------------------
 
 # Target (and build rules): common (src) library
@@ -877,7 +979,24 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}')
 ###$(info processid_short=$(processid_short))
 
 MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp
+
+# For the fat binary backend (cppfat), CPPProcess, color_sum, and Parameters_<model> are compiled
+# multiple times, once per SIMD level with a versioned namespace. Parameters_<model> must be compiled
+# per SIMD level because CPPProcess.cc references it, and when CPPProcess.cc is compiled with
+# -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_<simd>, all mg5amcCpu:: references (including Parameters_<model>)
+# are renamed to mg5amcCpu_<simd>::. The dispatcher (MatrixElementKernels.cc) links all versions.
+ifeq ($(BACKEND),cppfat)
+cppfat_objects_none=$(BUILDDIR)/CPPProcess_none_cpp.o $(BUILDDIR)/color_sum_none_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_none_cpp.o
+cppfat_objects_sse4=$(BUILDDIR)/CPPProcess_sse4_cpp.o $(BUILDDIR)/color_sum_sse4_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_sse4_cpp.o
+cppfat_objects_avx2=$(BUILDDIR)/CPPProcess_avx2_cpp.o $(BUILDDIR)/color_sum_avx2_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_avx2_cpp.o
+cppfat_objects_512y=$(BUILDDIR)/CPPProcess_512y_cpp.o $(BUILDDIR)/color_sum_512y_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_512y_cpp.o
+cppfat_objects_512z=$(BUILDDIR)/CPPProcess_512z_cpp.o $(BUILDDIR)/color_sum_512z_cpp.o $(BUILDDIR)/$(PARAMETERS_STEM)_512z_cpp.o
+cppfat_objects_base=$(BUILDDIR)/CPPProcess_base_cpp.o $(BUILDDIR)/color_sum_base_cpp.o
+cppfat_objects_all=$(cppfat_objects_base) $(cppfat_objects_none) $(cppfat_objects_sse4) $(cppfat_objects_avx2) $(cppfat_objects_512y) $(cppfat_objects_512z)
+cxx_objects_lib=$(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
$(cppfat_objects_all)
+else
 cxx_objects_lib=$(BUILDDIR)/CPPProcess_cpp.o $(BUILDDIR)/color_sum_cpp.o $(BUILDDIR)/MatrixElementKernels_cpp.o $(BUILDDIR)/BridgeKernels_cpp.o $(BUILDDIR)/CrossSectionKernels_cpp.o
+endif
 cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cpp.o $(BUILDDIR)/RamboSamplingKernels_cpp.o
 
 ifneq ($(GPUCC),)
@@ -1132,6 +1251,10 @@ bld512z:
 	@echo
 	$(MAKE) USEBUILDDIR=1 BACKEND=cpp512z -f $(CUDACPP_MAKEFILE)
 
+bldfat:
+	@echo
+	$(MAKE) USEBUILDDIR=1 BACKEND=cppfat -f $(CUDACPP_MAKEFILE)
+
 ifeq ($(UNAME_P),ppc64le)
 ###bldavxs: $(INCDIR)/fbridge.inc bldnone bldsse4
 bldavxs: bldnone bldsse4
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
index b57e56d182..748080a70f 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_config.mk
@@ -34,7 +34,7 @@ endif
 
 # Check that the user-defined choices of BACKEND, FPTYPE, HELINL, HRDCOD are supported
 # (NB: use 'filter' and 'words' instead of 'findstring' because they properly handle whitespace-separated words)
-override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto
+override SUPPORTED_BACKENDS = cuda hip cppnone cppsse4 cppavx2 cpp512y cpp512z cppauto cppfat
 ifneq ($(words $(filter $(BACKEND), $(SUPPORTED_BACKENDS))),1)
   $(error Invalid backend BACKEND='$(BACKEND)': supported backends are $(foreach backend,$(SUPPORTED_BACKENDS),'$(backend)'))
 endif
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
index 81e1e24e69..ef17addc16 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h
@@ -305,4 +305,14 @@ ispoweroftwo( int n )
 #endif
 #endif
 
+// For fat binary builds: when MGONGPU_SIMD_NAMESPACE is defined externally (e.g. -DMGONGPU_SIMD_NAMESPACE=mg5amcCpu_avx2),
+// redefine mg5amcCpu to the versioned namespace for SIMD-sensitive translation units (CPPProcess, color_sum and the model Parameters_ source, which cudacpp.mk builds once per SIMD level).
+// This allows compiling those translation units multiple times with different -march flags,
+// each time placing all symbols in a distinct versioned namespace, for runtime dispatch in a fat binary.
+// Files that are NOT SIMD-sensitive (MatrixElementKernels, BridgeKernels, fbridge etc.) are compiled
+// without MGONGPU_SIMD_NAMESPACE and therefore continue to use the standard mg5amcCpu namespace.
+#if defined MGONGPU_SIMD_NAMESPACE && !defined MGONGPUCPP_GPUIMPL
+#define mg5amcCpu MGONGPU_SIMD_NAMESPACE
+#endif // MGONGPU_SIMD_NAMESPACE && !MGONGPUCPP_GPUIMPL
+
 #endif // MGONGPUCONFIG_H