Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,9 @@ namespace mg5amcCpu
HostBufferSelectedHelicity m_hstSelHel;
HostBufferSelectedColor m_hstSelCol;
HostBufferChannelIds m_hstChannelIds;
std::unique_ptr<MatrixElementKernelHost> m_pmek;
// In fat binary builds, hold either MatrixElementKernelHostFat or MatrixElementKernelHost;
// use the common base class pointer to support both.
std::unique_ptr<MatrixElementKernelBase> m_pmek;
#endif
};

Expand Down Expand Up @@ -281,8 +283,13 @@ namespace mg5amcCpu
std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")"
<< std::endl;
#endif
#ifdef MGONGPU_CPPFAT
m_pmek.reset( new MatrixElementKernelHostFat(
m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
#else
m_pmek.reset( new MatrixElementKernelHost(
m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstChannelIds, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
#endif // MGONGPU_CPPFAT
#endif // MGONGPUCPP_GPUIMPL
// Create a process object, read param card and set parameters
// FIXME: the process instance can happily go out of scope because it is only
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ namespace mg5amcCpu
//--------------------------------------------------------------------------

}
#endif
#endif // !MGONGPUCPP_GPUIMPL

//============================================================================

Expand Down Expand Up @@ -525,3 +525,271 @@ namespace mg5amcGpu
#endif

//============================================================================

// Fat binary implementation: runtime dispatch to the best available SIMD level.
// This section is compiled only for C++ (non-GPU) builds: it is guarded by #ifndef MGONGPUCPP_GPUIMPL.
// It provides forward declarations for the per-SIMD namespaces (mg5amcCpu_none,
// mg5amcCpu_sse4, mg5amcCpu_avx2, mg5amcCpu_512y, mg5amcCpu_512z) and implements
// MatrixElementKernelHostFat, which detects the host CPU at construction time and
// delegates all computations to the best available SIMD version.

#ifndef MGONGPUCPP_GPUIMPL

// Forward declarations for per-SIMD namespaces.
// These are resolved at link time to the object files compiled with appropriate -march flags.
// All signatures mirror the C++ (non-GPU) versions declared in CPPProcess.h.
//
// MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) opens namespace NS and declares the four entry points
// that MatrixElementKernelHostFat dispatches to: computeDependentCouplings,
// sigmaKin_getGoodHel, sigmaKin_setGoodHel and sigmaKin.
// The MGONGPU_CPPFAT_SIGMAKIN_* helper macros used inside the parameter lists are defined
// further down: this is fine because they are only expanded when
// MG5AMC_CPPFAT_FORWARD_DECLARE is invoked, which happens after their definition.

#define MG5AMC_CPPFAT_FORWARD_DECLARE( NS ) \
namespace NS \
{ \
void computeDependentCouplings( const fptype* allgs, \
fptype* allcouplings, \
const int nevt ); \
void sigmaKin_getGoodHel( const fptype* allmomenta, \
const fptype* allcouplings, \
fptype* allMEs, \
MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
bool* isGoodHel, \
const int nevt ); \
int sigmaKin_setGoodHel( const bool* isGoodHel ); \
void sigmaKin( const fptype* allmomenta, \
const fptype* allcouplings, \
const fptype* allrndhel, \
MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
fptype* allMEs, \
int* allselhel, \
MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
const int nevt ); \
}

// The multichannel-dependent parameters differ between builds
// With MGONGPU_SUPPORTS_MULTICHANNEL each helper expands to extra parameters and carries
// its own trailing comma, so that it can be dropped in the middle of a parameter list;
// without it each helper expands to nothing and the extra parameters simply disappear.
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS \
fptype* allNumerators, \
fptype* allDenominators,
#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS \
const fptype* allrndcol, \
const unsigned int* allChannelIds,
#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS \
int* allselcol, \
fptype* allNumerators, \
fptype* allDenominators,
#else
#define MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS /*nothing*/
#define MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS /*nothing*/
#define MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS /*nothing*/
#endif

// One set of declarations per SIMD namespace that the dispatcher can select
MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_none )
MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_sse4 )
MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_avx2 )
MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512y )
MG5AMC_CPPFAT_FORWARD_DECLARE( mg5amcCpu_512z )

// The helper macros are only needed for the declarations above: do not leak them
#undef MG5AMC_CPPFAT_FORWARD_DECLARE
#undef MGONGPU_CPPFAT_SIGMAKIN_GETHEL_NUMDEN_PARAMS
#undef MGONGPU_CPPFAT_SIGMAKIN_NUMDEN_PARAMS
#undef MGONGPU_CPPFAT_SIGMAKIN_SEL_PARAMS

namespace mg5amcCpu
{

//--------------------------------------------------------------------------

// Constructor from existing input and output buffers.
// - Forwards all buffers to MatrixElementKernelBase and nevt to NumberOfEvents.
// - Selects the SIMD level once, via detectBestSimd (verbose: the choice is printed to stdout).
// - Allocates the internal per-event couplings buffer (and, in multichannel builds,
//   the numerators and denominators buffers) for nevt events.
// Throws std::runtime_error if the momenta, matrixElements or channelIds buffers are
// device buffers, if their number of events differs from nevt, or if nevt is not
// a multiple of neppM (the momenta AOSOA page size).
MatrixElementKernelHostFat::MatrixElementKernelHostFat( const BufferMomenta& momenta,
                                                        const BufferGs& gs,
                                                        const BufferRndNumHelicity& rndhel,
                                                        const BufferRndNumColor& rndcol,
                                                        const BufferChannelIds& channelIds,
                                                        BufferMatrixElements& matrixElements,
                                                        BufferSelectedHelicity& selhel,
                                                        BufferSelectedColor& selcol,
                                                        const size_t nevt )
  : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, channelIds, matrixElements, selhel, selcol )
  , NumberOfEvents( nevt )
  , m_selectedSimd( detectBestSimd( /*verbose=*/true ) )
  , m_couplings( nevt )
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  , m_numerators( nevt )
  , m_denominators( nevt )
#endif
{
  // This is a host kernel: refuse buffers that live on a device
  if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: momenta must be a host array" );
  if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: matrixElements must be a host array" );
  // (the message used to say "device array", contradicting the check it reports on)
  if( m_channelIds.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHostFat: channelIds must be a host array" );
  // All buffers must describe the same number of events as this kernel
  if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with momenta" );
  if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with matrixElements" );
  if( this->nevt() != m_channelIds.nevt() ) throw std::runtime_error( "MatrixElementKernelHostFat: nevt mismatch with channelIds" );
  // The number of events must fill an integer number of momenta AOSOA pages
  constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
  static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" );
  if( nevt % neppM != 0 )
  {
    std::ostringstream sstr;
    sstr << "MatrixElementKernelHostFat: nevt should be a multiple of neppM=" << neppM;
    throw std::runtime_error( sstr.str() );
  }
}

//--------------------------------------------------------------------------

// Destructor: nothing to release by hand, the member buffers clean up after themselves
MatrixElementKernelHostFat::~MatrixElementKernelHostFat() = default;

//--------------------------------------------------------------------------

// Choose the SIMD level that the fat binary should dispatch to on this CPU.
//
// Step 1 (optional override): if the environment variable MGONGPU_SIMD_LEVEL is set to
// one of avx512z, avx512y, avx2, sse4 or none, that level is returned, but only if
// the hardware check below accepts it. An unknown name or an unsupported level prints
// a warning on stderr (irrespective of 'verbose') and falls through to step 2.
// This makes it possible to benchmark a lower tier on a more capable machine, e.g.:
//   MGONGPU_SIMD_LEVEL=avx2 ./check_cpp.exe # force AVX2 on AVX512 HW
//
// Step 2 (auto-detection): on x86 the tiers are probed from the widest to the narrowest
// and the first supported one is returned; on any other platform sse4 is always returned.
//
// If 'verbose' is true, the selected level is reported on stdout.
//
// NOTE(review): avx512y is gated on "avx512f" only, so auto-detection can pick it only on
// CPUs that have avx512f but not avx512vl (avx512vl CPUs get avx512z first). AVX-512
// encodings on 256-bit registers normally need AVX512VL; please confirm against the
// compiler flags used to build mg5amcCpu_512y.
SimdLevel MatrixElementKernelHostFat::detectBestSimd( const bool verbose )
{
  // One row per tier, ordered from the widest to the narrowest
  struct Tier
  {
    SimdLevel level;
    const char* envName;  // spelling accepted in MGONGPU_SIMD_LEVEL
    const char* autoInfo; // line printed when x86 auto-detection settles on this tier
  };
  static constexpr Tier tiers[] = {
    { SimdLevel::avx512z, "avx512z", "INFO: Fat binary: selected SIMD level avx512z (AVX512VL with 512-bit width)" },
    { SimdLevel::avx512y, "avx512y", "INFO: Fat binary: selected SIMD level avx512y (AVX512 with 256-bit width)" },
    { SimdLevel::avx2, "avx2", "INFO: Fat binary: selected SIMD level avx2 (AVX2 with 256-bit width)" },
    { SimdLevel::sse4, "sse4", "INFO: Fat binary: selected SIMD level sse4 (SSE4.2 with 128-bit width)" },
    { SimdLevel::none, "none", "INFO: Fat binary: selected SIMD level none (no SIMD)" },
  };

  // Can the current CPU execute code built for the given tier?
  const auto cpuCanRun = []( const SimdLevel level ) -> bool
  {
#if defined( __x86_64__ ) || defined( __i386__ )
    switch( level )
    {
      case SimdLevel::avx512z: return __builtin_cpu_supports( "avx512vl" );
      case SimdLevel::avx512y: return __builtin_cpu_supports( "avx512f" );
      case SimdLevel::avx2: return __builtin_cpu_supports( "avx2" );
      case SimdLevel::sse4: return __builtin_cpu_supports( "sse4.2" );
      case SimdLevel::none: return true;
    }
    return false; // not a valid enumerator
#else
    // Non-x86: only sse4 (NEON/VSX) and none are meaningful overrides.
    return level == SimdLevel::sse4 || level == SimdLevel::none;
#endif
  };

  // --- optional user override via MGONGPU_SIMD_LEVEL ---
  const char* const envValue = getenv( "MGONGPU_SIMD_LEVEL" );
  if( envValue != nullptr )
  {
    const std::string requested( envValue );
    const Tier* match = nullptr;
    for( const Tier& tier : tiers )
    {
      if( requested == tier.envName )
      {
        match = &tier;
        break;
      }
    }
    if( match == nullptr )
    {
      std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
                << "' is not recognised (valid values: avx512z avx512y avx2 sse4 none)."
                << " Falling back to auto-detection." << std::endl;
    }
    else if( cpuCanRun( match->level ) )
    {
      if( verbose )
        std::cout << "INFO: Fat binary: MGONGPU_SIMD_LEVEL override: selected SIMD level "
                  << requested << std::endl;
      return match->level;
    }
    else
    {
      // Safety net: never hand back a level that this CPU cannot execute
      std::cerr << "WARNING: MGONGPU_SIMD_LEVEL='" << requested
                << "' is not supported by this CPU."
                << " Falling back to auto-detection." << std::endl;
    }
  }

  // --- auto-detection ---
#if defined( __x86_64__ ) || defined( __i386__ )
  for( const Tier& tier : tiers )
  {
    if( cpuCanRun( tier.level ) )
    {
      if( verbose ) std::cout << tier.autoInfo << std::endl;
      return tier.level;
    }
  }
  return SimdLevel::none; // never reached: the 'none' tier is always accepted above
#else
  // Non-x86: always use sse4 (covers ARM NEON/Power VSX)
  if( verbose ) std::cout << "INFO: Fat binary: selected SIMD level sse4 (non-x86 platform)" << std::endl;
  return SimdLevel::sse4;
#endif
}

//--------------------------------------------------------------------------

// Convenience macro to dispatch to the correct per-SIMD namespace
// - CALL is an unqualified function call (name plus argument list): it is pasted after
//   the namespace qualifier selected by m_selectedSimd, which must therefore be in scope.
// - The macro expands to a complete switch statement, so the value returned by CALL
//   (if any) is discarded: calls whose result is needed must spell out the switch.
// - Any value of m_selectedSimd without an explicit case uses mg5amcCpu_none.
#define MG5AMC_CPPFAT_DISPATCH( CALL ) \
switch( m_selectedSimd ) \
{ \
case SimdLevel::avx512z: mg5amcCpu_512z::CALL; break; \
case SimdLevel::avx512y: mg5amcCpu_512y::CALL; break; \
case SimdLevel::avx2: mg5amcCpu_avx2::CALL; break; \
case SimdLevel::sse4: mg5amcCpu_sse4::CALL; break; \
default: mg5amcCpu_none::CALL; break; \
}

// Compute good helicities using the SIMD implementation selected at construction time.
// Returns the value of sigmaKin_setGoodHel (nGoodHel, the number of good helicity
// combinations out of ncomb).
int MatrixElementKernelHostFat::computeGoodHelicities()
{
  // Temporary mask with CPPProcess::ncomb entries, passed as 'isGoodHel' to both sigmaKin helpers
  HostBufferHelicityMask helicityMask( CPPProcess::ncomb );

  // Step 1: event-by-event couplings from the gs buffer
  MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )

  // Step 2: sigmaKin_getGoodHel gets the mask as its (non-const) isGoodHel argument
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(),
                                               m_couplings.data(),
                                               m_matrixElements.data(),
                                               m_numerators.data(),
                                               m_denominators.data(),
                                               helicityMask.data(),
                                               nevt() ) )
#else
  MG5AMC_CPPFAT_DISPATCH( sigmaKin_getGoodHel( m_momenta.data(),
                                               m_couplings.data(),
                                               m_matrixElements.data(),
                                               helicityMask.data(),
                                               nevt() ) )
#endif

  // Step 3: hand the mask to sigmaKin_setGoodHel of the same namespace and return its result
  // (the dispatch macro cannot be used here because it discards return values)
  switch( m_selectedSimd )
  {
    case SimdLevel::avx512z: return mg5amcCpu_512z::sigmaKin_setGoodHel( helicityMask.data() );
    case SimdLevel::avx512y: return mg5amcCpu_512y::sigmaKin_setGoodHel( helicityMask.data() );
    case SimdLevel::avx2: return mg5amcCpu_avx2::sigmaKin_setGoodHel( helicityMask.data() );
    case SimdLevel::sse4: return mg5amcCpu_sse4::sigmaKin_setGoodHel( helicityMask.data() );
    default: return mg5amcCpu_none::sigmaKin_setGoodHel( helicityMask.data() );
  }
}

//--------------------------------------------------------------------------

// Compute matrix elements using the SIMD implementation selected at construction time.
// In multichannel builds, useChannelIds decides whether the channel ids buffer or a null
// pointer is passed to sigmaKin; in other builds useChannelIds must be false (asserted).
void MatrixElementKernelHostFat::computeMatrixElements( const bool useChannelIds )
{
  // Event-by-event couplings first: sigmaKin reads them from m_couplings
  MG5AMC_CPPFAT_DISPATCH( computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ) )
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
  // A null pointer is passed to sigmaKin when channel ids are not to be used
  const unsigned int* channelIdsOrNull = nullptr;
  if( useChannelIds ) channelIdsOrNull = m_channelIds.data();
  MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(),
                                    m_couplings.data(),
                                    m_rndhel.data(),
                                    m_rndcol.data(),
                                    channelIdsOrNull,
                                    m_matrixElements.data(),
                                    m_selhel.data(),
                                    m_selcol.data(),
                                    m_numerators.data(),
                                    m_denominators.data(),
                                    nevt() ) )
#else
  assert( useChannelIds == false );
  MG5AMC_CPPFAT_DISPATCH( sigmaKin( m_momenta.data(),
                                    m_couplings.data(),
                                    m_rndhel.data(),
                                    m_matrixElements.data(),
                                    m_selhel.data(),
                                    nevt() ) )
#endif
#ifdef MGONGPU_CHANNELID_DEBUG
  // Debug bookkeeping; this uses the pointer declared in the multichannel branch above
  MatrixElementKernelBase::updateNevtProcessedByChannel( channelIdsOrNull, nevt() );
#endif
}

#undef MG5AMC_CPPFAT_DISPATCH

//--------------------------------------------------------------------------

} // namespace mg5amcCpu

#endif // !MGONGPUCPP_GPUIMPL

//============================================================================
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,66 @@ namespace mg5amcCpu

//--------------------------------------------------------------------------

#ifndef MGONGPUCPP_GPUIMPL
// Enum for the SIMD level selected at runtime in a fat binary
// Each enumerator corresponds to one per-SIMD namespace of the fat binary:
// none -> mg5amcCpu_none, sse4 -> mg5amcCpu_sse4, avx2 -> mg5amcCpu_avx2,
// avx512y -> mg5amcCpu_512y, avx512z -> mg5amcCpu_512z.
enum class SimdLevel { none, sse4, avx2, avx512y, avx512z };

// A class encapsulating matrix element calculations on a CPU host, with runtime
// dispatch to the best available SIMD implementation (for use in a fat binary).
// The fat binary contains per-SIMD versions of CPPProcess/color_sum compiled into
// separate versioned namespaces (mg5amcCpu_none, mg5amcCpu_sse4, mg5amcCpu_avx2,
// mg5amcCpu_512y, mg5amcCpu_512z). At construction time, this class detects the
// host CPU capabilities and delegates all ME computations to the best version.
// The SIMD level is chosen once, in the constructor, and is not changed afterwards.
class MatrixElementKernelHostFat final : public MatrixElementKernelBase, public NumberOfEvents
{
public:

// Constructor from existing input and output buffers
// (throws std::runtime_error if momenta, matrixElements or channelIds are device buffers,
// if their number of events differs from nevt, or if nevt is not a multiple of neppM)
MatrixElementKernelHostFat( const BufferMomenta& momenta, // input: momenta
const BufferGs& gs, // input: gs for alphaS
const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection
const BufferRndNumColor& rndcol, // input: random numbers for color selection
const BufferChannelIds& channelIds, // input: channel ids for single-diagram enhancement
BufferMatrixElements& matrixElements, // output: matrix elements
BufferSelectedHelicity& selhel, // output: helicity selection
BufferSelectedColor& selcol, // output: color selection
const size_t nevt );

// Destructor
virtual ~MatrixElementKernelHostFat();

// Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb)
int computeGoodHelicities() override final;

// Compute matrix elements
// (useChannelIds must be false in builds without MGONGPU_SUPPORTS_MULTICHANNEL)
void computeMatrixElements( const bool useChannelIds ) override final;

// Is this a host or device kernel?
// (always a host kernel: returns false)
bool isOnDevice() const override final { return false; }

// Detect the best available SIMD level on the current CPU
// (the environment variable MGONGPU_SIMD_LEVEL can request a specific level, which is
// honoured only if the CPU supports it; if verbose is true the choice is printed to stdout)
static SimdLevel detectBestSimd( const bool verbose = false );

private:

// The selected SIMD level (detected once at construction time)
SimdLevel m_selectedSimd;

// The buffer for the event-by-event couplings that depends on alphas QCD
HostBufferCouplings m_couplings;

#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
// The buffer for the event-by-event numerators of multichannel factors
HostBufferNumerators m_numerators;

// The buffer for the event-by-event denominators of multichannel factors
HostBufferDenominators m_denominators;
#endif
};
#endif

//--------------------------------------------------------------------------

#ifdef MGONGPUCPP_GPUIMPL
// A class encapsulating matrix element calculations on a GPU device
class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents
Expand Down
Loading