update xbyak

This commit is contained in:
SimoneN64
2024-09-02 23:47:06 +02:00
parent 5340b77f40
commit 5e93f1dd32
3 changed files with 2216 additions and 1445 deletions

856
external/xbyak/xbyak.h vendored

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -9,6 +9,13 @@
#define XBYAK_THROW(x) ; #define XBYAK_THROW(x) ;
#define XBYAK_THROW_RET(x, y) return y; #define XBYAK_THROW_RET(x, y) return y;
#endif #endif
#ifndef XBYAK_CONSTEXPR
#if ((__cplusplus >= 201402L) && !(!defined(__clang__) && defined(__GNUC__) && (__GNUC__ <= 5))) || (defined(_MSC_VER) && _MSC_VER >= 1910)
#define XBYAK_CONSTEXPR constexpr
#else
#define XBYAK_CONSTEXPR
#endif
#endif
#else #else
#include <string.h> #include <string.h>
@@ -20,7 +27,7 @@
#include "xbyak.h" #include "xbyak.h"
#endif // XBYAK_ONLY_CLASS_CPU #endif // XBYAK_ONLY_CLASS_CPU
#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64) #if defined(__i386__) || (defined(__x86_64__) && !defined(__arm64ec__)) || defined(_M_IX86) || (defined(_M_X64) && !defined(_M_ARM64EC))
#define XBYAK_INTEL_CPU_SPECIFIC #define XBYAK_INTEL_CPU_SPECIFIC
#endif #endif
@@ -84,7 +91,8 @@ namespace Xbyak { namespace util {
typedef enum { typedef enum {
SmtLevel = 1, SmtLevel = 1,
CoreLevel = 2 CoreLevel = 2
} IntelCpuTopologyLevel; } CpuTopologyLevel;
typedef CpuTopologyLevel IntelCpuTopologyLevel; // for backward compatibility
namespace local { namespace local {
@@ -93,7 +101,7 @@ struct TypeT {
}; };
template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2> template<uint64_t L1, uint64_t H1, uint64_t L2, uint64_t H2>
TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); } XBYAK_CONSTEXPR TypeT<L1 | L2, H1 | H2> operator|(TypeT<L1, H1>, TypeT<L2, H2>) { return TypeT<L1 | L2, H1 | H2>(); }
template<typename T> template<typename T>
inline T max_(T x, T y) { return x >= y ? x : y; } inline T max_(T x, T y) { return x >= y ? x : y; }
@@ -129,14 +137,14 @@ public:
private: private:
Type type_; Type type_;
//system topology //system topology
bool x2APIC_supported_;
static const size_t maxTopologyLevels = 2; static const size_t maxTopologyLevels = 2;
uint32_t numCores_[maxTopologyLevels]; uint32_t numCores_[maxTopologyLevels];
static const uint32_t maxNumberCacheLevels = 10; static const uint32_t maxNumberCacheLevels = 10;
uint32_t dataCacheSize_[maxNumberCacheLevels]; uint32_t dataCacheSize_[maxNumberCacheLevels];
uint32_t coresSharignDataCache_[maxNumberCacheLevels]; uint32_t coresSharingDataCache_[maxNumberCacheLevels];
uint32_t dataCacheLevels_; uint32_t dataCacheLevels_;
uint32_t avx10version_;
uint32_t get32bitAsBE(const char *x) const uint32_t get32bitAsBE(const char *x) const
{ {
@@ -146,84 +154,200 @@ private:
{ {
return (1U << n) - 1; return (1U << n) - 1;
} }
// [EBX:ECX:EDX] == s?
bool isEqualStr(uint32_t EBX, uint32_t ECX, uint32_t EDX, const char s[12]) const
{
return get32bitAsBE(&s[0]) == EBX && get32bitAsBE(&s[4]) == EDX && get32bitAsBE(&s[8]) == ECX;
}
uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end) const
{
return (val >> base) & ((1u << (end + 1 - base)) - 1);
}
void setFamily() void setFamily()
{ {
uint32_t data[4] = {}; uint32_t data[4] = {};
getCpuid(1, data); getCpuid(1, data);
stepping = data[0] & mask(4); stepping = extractBit(data[0], 0, 3);
model = (data[0] >> 4) & mask(4); model = extractBit(data[0], 4, 7);
family = (data[0] >> 8) & mask(4); family = extractBit(data[0], 8, 11);
// type = (data[0] >> 12) & mask(2); //type = extractBit(data[0], 12, 13);
extModel = (data[0] >> 16) & mask(4); extModel = extractBit(data[0], 16, 19);
extFamily = (data[0] >> 20) & mask(8); extFamily = extractBit(data[0], 20, 27);
if (family == 0x0f) { if (family == 0x0f) {
displayFamily = family + extFamily; displayFamily = family + extFamily;
} else { } else {
displayFamily = family; displayFamily = family;
} }
if (family == 6 || family == 0x0f) { if ((has(tINTEL) && family == 6) || family == 0x0f) {
displayModel = (extModel << 4) + model; displayModel = (extModel << 4) + model;
} else { } else {
displayModel = model; displayModel = model;
} }
} }
uint32_t extractBit(uint32_t val, uint32_t base, uint32_t end)
{
return (val >> base) & ((1u << (end - base)) - 1);
}
void setNumCores() void setNumCores()
{ {
if (!has(tINTEL)) return; if (!has(tINTEL) && !has(tAMD)) return;
uint32_t data[4] = {}; uint32_t data[4] = {};
getCpuid(0x0, data);
/* CAUTION: These numbers are configuration as shipped by Intel. */
getCpuidEx(0x0, 0, data);
if (data[0] >= 0xB) { if (data[0] >= 0xB) {
// Check if "Extended Topology Enumeration" is implemented.
getCpuidEx(0xB, 0, data);
if (data[0] != 0 || data[1] != 0) {
/* /*
if leaf 11 exists(x2APIC is supported), if leaf 11 exists(x2APIC is supported),
we use it to get the number of smt cores and cores on socket we use it to get the number of smt cores and cores on socket
leaf 0xB can be zeroed-out by a hypervisor leaf 0xB can be zeroed-out by a hypervisor
*/ */
x2APIC_supported_ = true;
for (uint32_t i = 0; i < maxTopologyLevels; i++) { for (uint32_t i = 0; i < maxTopologyLevels; i++) {
getCpuidEx(0xB, i, data); getCpuidEx(0xB, i, data);
IntelCpuTopologyLevel level = (IntelCpuTopologyLevel)extractBit(data[2], 8, 15); CpuTopologyLevel level = (CpuTopologyLevel)extractBit(data[2], 8, 15);
if (level == SmtLevel || level == CoreLevel) { if (level == SmtLevel || level == CoreLevel) {
numCores_[level - 1] = extractBit(data[1], 0, 15); numCores_[level - 1] = extractBit(data[1], 0, 15);
} }
} }
/* /*
Fallback values in case a hypervisor has 0xB leaf zeroed-out. Fallback values in case a hypervisor has the leaf zeroed-out.
*/ */
numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]); numCores_[SmtLevel - 1] = local::max_(1u, numCores_[SmtLevel - 1]);
numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]); numCores_[CoreLevel - 1] = local::max_(numCores_[SmtLevel - 1], numCores_[CoreLevel - 1]);
return;
}
}
// "Extended Topology Enumeration" is not supported.
if (has(tAMD)) {
/*
AMD - Legacy Method
*/
int physicalThreadCount = 0;
getCpuid(0x1, data);
int logicalProcessorCount = extractBit(data[1], 16, 23);
int htt = extractBit(data[3], 28, 28); // Hyper-threading technology.
getCpuid(0x80000000, data);
uint32_t highestExtendedLeaf = data[0];
if (highestExtendedLeaf >= 0x80000008) {
getCpuid(0x80000008, data);
physicalThreadCount = extractBit(data[2], 0, 7) + 1;
}
if (htt == 0) {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = 1;
} else if (physicalThreadCount > 1) {
if ((displayFamily >= 0x17) && (highestExtendedLeaf >= 0x8000001E)) {
// Zen overreports its core count by a factor of two.
getCpuid(0x8000001E, data);
int threadsPerComputeUnit = extractBit(data[1], 8, 15) + 1;
physicalThreadCount /= threadsPerComputeUnit;
}
numCores_[SmtLevel - 1] = logicalProcessorCount / physicalThreadCount;
numCores_[CoreLevel - 1] = logicalProcessorCount;
} else {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = logicalProcessorCount > 1 ? logicalProcessorCount : 2;
}
} else { } else {
/* /*
Failed to deremine num of cores without x2APIC support. Intel - Legacy Method
TODO: USE initial APIC ID to determine ncores.
*/ */
numCores_[SmtLevel - 1] = 0; int physicalThreadCount = 0;
numCores_[CoreLevel - 1] = 0; getCpuid(0x1, data);
int logicalProcessorCount = extractBit(data[1], 16, 23);
int htt = extractBit(data[3], 28, 28); // Hyper-threading technology.
getCpuid(0, data);
if (data[0] >= 0x4) {
getCpuid(0x4, data);
physicalThreadCount = extractBit(data[0], 26, 31) + 1;
}
if (htt == 0) {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = 1;
} else if (physicalThreadCount > 1) {
numCores_[SmtLevel - 1] = logicalProcessorCount / physicalThreadCount;
numCores_[CoreLevel - 1] = logicalProcessorCount;
} else {
numCores_[SmtLevel - 1] = 1;
numCores_[CoreLevel - 1] = logicalProcessorCount > 0 ? logicalProcessorCount : 1;
}
} }
} }
void setCacheHierarchy() void setCacheHierarchy()
{ {
if (!has(tINTEL)) return; uint32_t data[4] = {};
if (has(tAMD)) {
getCpuid(0x80000000, data);
if (data[0] >= 0x8000001D) {
// For modern AMD CPUs.
dataCacheLevels_ = 0;
for (uint32_t subLeaf = 0; dataCacheLevels_ < maxNumberCacheLevels; subLeaf++) {
getCpuidEx(0x8000001D, subLeaf, data);
int cacheType = extractBit(data[0], 0, 4);
/*
cacheType
00h - Null; no more caches
01h - Data cache
02h - Instrution cache
03h - Unified cache
04h-1Fh - Reserved
*/
if (cacheType == 0) break; // No more caches.
if (cacheType == 0x2) continue; // Skip instruction cache.
int fullyAssociative = extractBit(data[0], 9, 9);
int numSharingCache = extractBit(data[0], 14, 25) + 1;
int cacheNumWays = extractBit(data[1], 22, 31) + 1;
int cachePhysPartitions = extractBit(data[1], 12, 21) + 1;
int cacheLineSize = extractBit(data[1], 0, 11) + 1;
int cacheNumSets = data[2] + 1;
dataCacheSize_[dataCacheLevels_] =
cacheLineSize * cachePhysPartitions * cacheNumWays;
if (fullyAssociative == 0) {
dataCacheSize_[dataCacheLevels_] *= cacheNumSets;
}
if (subLeaf > 0) {
numSharingCache = local::min_(numSharingCache, (int)numCores_[1]);
numSharingCache /= local::max_(1u, coresSharingDataCache_[0]);
}
coresSharingDataCache_[dataCacheLevels_] = numSharingCache;
dataCacheLevels_ += 1;
}
coresSharingDataCache_[0] = local::min_(1u, coresSharingDataCache_[0]);
} else if (data[0] >= 0x80000006) {
// For legacy AMD CPUs, use leaf 0x80000005 for L1 cache
// and 0x80000006 for L2 and L3 cache.
dataCacheLevels_ = 1;
getCpuid(0x80000005, data);
int l1dc_size = extractBit(data[2], 24, 31);
dataCacheSize_[0] = l1dc_size * 1024;
coresSharingDataCache_[0] = 1;
getCpuid(0x80000006, data);
// L2 cache
int l2_assoc = extractBit(data[2], 12, 15);
if (l2_assoc > 0) {
dataCacheLevels_ = 2;
int l2_size = extractBit(data[2], 16, 31);
dataCacheSize_[1] = l2_size * 1024;
coresSharingDataCache_[1] = 1;
}
// L3 cache
int l3_assoc = extractBit(data[3], 12, 15);
if (l3_assoc > 0) {
dataCacheLevels_ = 3;
int l3_size = extractBit(data[3], 18, 31);
dataCacheSize_[2] = l3_size * 512 * 1024;
coresSharingDataCache_[2] = numCores_[1];
}
}
} else if (has(tINTEL)) {
// Use the "Deterministic Cache Parameters" leaf is supported.
const uint32_t NO_CACHE = 0; const uint32_t NO_CACHE = 0;
const uint32_t DATA_CACHE = 1; const uint32_t DATA_CACHE = 1;
//const uint32_t INSTRUCTION_CACHE = 2; //const uint32_t INSTRUCTION_CACHE = 2;
const uint32_t UNIFIED_CACHE = 3; const uint32_t UNIFIED_CACHE = 3;
uint32_t smt_width = 0; uint32_t smt_width = 0;
uint32_t logical_cores = 0; uint32_t logical_cores = 0;
uint32_t data[4] = {};
if (x2APIC_supported_) {
smt_width = numCores_[0]; smt_width = numCores_[0];
logical_cores = numCores_[1]; logical_cores = numCores_[1];
}
/* /*
Assumptions: Assumptions:
@@ -251,11 +375,12 @@ private:
* (data[2] + 1); * (data[2] + 1);
if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores; if (cacheType == DATA_CACHE && smt_width == 0) smt_width = actual_logical_cores;
assert(smt_width != 0); assert(smt_width != 0);
coresSharignDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u); coresSharingDataCache_[dataCacheLevels_] = local::max_(actual_logical_cores / smt_width, 1u);
dataCacheLevels_++; dataCacheLevels_++;
} }
} }
} }
}
public: public:
int model; int model;
@@ -266,8 +391,7 @@ public:
int displayFamily; // family + extFamily int displayFamily; // family + extFamily
int displayModel; // model + extModel int displayModel; // model + extModel
uint32_t getNumCores(IntelCpuTopologyLevel level) const { uint32_t getNumCores(CpuTopologyLevel level) const {
if (!x2APIC_supported_) XBYAK_THROW_RET(ERR_X2APIC_IS_NOT_SUPPORTED, 0)
switch (level) { switch (level) {
case SmtLevel: return numCores_[level - 1]; case SmtLevel: return numCores_[level - 1];
case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1]; case CoreLevel: return numCores_[level - 1] / numCores_[SmtLevel - 1];
@@ -279,7 +403,7 @@ public:
uint32_t getCoresSharingDataCache(uint32_t i) const uint32_t getCoresSharingDataCache(uint32_t i) const
{ {
if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0) if (i >= dataCacheLevels_) XBYAK_THROW_RET(ERR_BAD_PARAMETER, 0)
return coresSharignDataCache_[i]; return coresSharingDataCache_[i];
} }
uint32_t getDataCacheSize(uint32_t i) const uint32_t getDataCacheSize(uint32_t i) const
{ {
@@ -290,19 +414,6 @@ public:
/* /*
data[] = { eax, ebx, ecx, edx } data[] = { eax, ebx, ecx, edx }
*/ */
static inline void getCpuid(uint32_t eaxIn, uint32_t data[4])
{
#ifdef XBYAK_INTEL_CPU_SPECIFIC
#ifdef _WIN32
__cpuid(reinterpret_cast<int*>(data), eaxIn);
#else
__cpuid(eaxIn, data[0], data[1], data[2], data[3]);
#endif
#else
(void)eaxIn;
(void)data;
#endif
}
static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4]) static inline void getCpuidEx(uint32_t eaxIn, uint32_t ecxIn, uint32_t data[4])
{ {
#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef XBYAK_INTEL_CPU_SPECIFIC
@@ -317,6 +428,10 @@ public:
(void)data; (void)data;
#endif #endif
} }
static inline void getCpuid(uint32_t eaxIn, uint32_t data[4])
{
getCpuidEx(eaxIn, 0, data);
}
static inline uint64_t getXfeature() static inline uint64_t getXfeature()
{ {
#ifdef XBYAK_INTEL_CPU_SPECIFIC #ifdef XBYAK_INTEL_CPU_SPECIFIC
@@ -414,17 +529,36 @@ public:
XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8); XBYAK_DEFINE_TYPE(69, tAVX_VNNI_INT8);
XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT); XBYAK_DEFINE_TYPE(70, tAVX_NE_CONVERT);
XBYAK_DEFINE_TYPE(71, tAVX_IFMA); XBYAK_DEFINE_TYPE(71, tAVX_IFMA);
XBYAK_DEFINE_TYPE(72, tRAO_INT);
XBYAK_DEFINE_TYPE(73, tCMPCCXADD);
XBYAK_DEFINE_TYPE(74, tPREFETCHITI);
XBYAK_DEFINE_TYPE(75, tSERIALIZE);
XBYAK_DEFINE_TYPE(76, tUINTR);
XBYAK_DEFINE_TYPE(77, tXSAVE);
XBYAK_DEFINE_TYPE(78, tSHA512);
XBYAK_DEFINE_TYPE(79, tSM3);
XBYAK_DEFINE_TYPE(80, tSM4);
XBYAK_DEFINE_TYPE(81, tAVX_VNNI_INT16);
XBYAK_DEFINE_TYPE(82, tAPX_F);
XBYAK_DEFINE_TYPE(83, tAVX10);
XBYAK_DEFINE_TYPE(84, tAESKLE);
XBYAK_DEFINE_TYPE(85, tWIDE_KL);
XBYAK_DEFINE_TYPE(86, tKEYLOCKER);
XBYAK_DEFINE_TYPE(87, tKEYLOCKER_WIDE);
XBYAK_DEFINE_TYPE(88, tSSE4a);
XBYAK_DEFINE_TYPE(89, tCLWB);
XBYAK_DEFINE_TYPE(90, tTSXLDTRK);
#undef XBYAK_SPLIT_ID #undef XBYAK_SPLIT_ID
#undef XBYAK_DEFINE_TYPE #undef XBYAK_DEFINE_TYPE
Cpu() Cpu()
: type_() : type_()
, x2APIC_supported_(false)
, numCores_() , numCores_()
, dataCacheSize_() , dataCacheSize_()
, coresSharignDataCache_() , coresSharingDataCache_()
, dataCacheLevels_(0) , dataCacheLevels_(0)
, avx10version_(0)
{ {
uint32_t data[4] = {}; uint32_t data[4] = {};
const uint32_t& EAX = data[0]; const uint32_t& EAX = data[0];
@@ -433,9 +567,7 @@ public:
const uint32_t& EDX = data[3]; const uint32_t& EDX = data[3];
getCpuid(0, data); getCpuid(0, data);
const uint32_t maxNum = EAX; const uint32_t maxNum = EAX;
static const char intel[] = "ntel"; if (isEqualStr(EBX, ECX, EDX, "AuthenticAMD")) {
static const char amd[] = "cAMD";
if (ECX == get32bitAsBE(amd)) {
type_ |= tAMD; type_ |= tAMD;
getCpuid(0x80000001, data); getCpuid(0x80000001, data);
if (EDX & (1U << 31)) { if (EDX & (1U << 31)) {
@@ -448,8 +580,7 @@ public:
// Long mode implies support for PREFETCHW on AMD // Long mode implies support for PREFETCHW on AMD
type_ |= tPREFETCHW; type_ |= tPREFETCHW;
} }
} } else if (isEqualStr(EBX, ECX, EDX, "GenuineIntel")) {
if (ECX == get32bitAsBE(intel)) {
type_ |= tINTEL; type_ |= tINTEL;
} }
@@ -459,13 +590,14 @@ public:
if (maxExtendedNum >= 0x80000001) { if (maxExtendedNum >= 0x80000001) {
getCpuid(0x80000001, data); getCpuid(0x80000001, data);
if (EDX & (1U << 31)) type_ |= t3DN;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (ECX & (1U << 5)) type_ |= tLZCNT; if (ECX & (1U << 5)) type_ |= tLZCNT;
if (ECX & (1U << 6)) type_ |= tSSE4a;
if (ECX & (1U << 8)) type_ |= tPREFETCHW; if (ECX & (1U << 8)) type_ |= tPREFETCHW;
if (EDX & (1U << 15)) type_ |= tCMOV;
if (EDX & (1U << 22)) type_ |= tMMX2;
if (EDX & (1U << 27)) type_ |= tRDTSCP;
if (EDX & (1U << 30)) type_ |= tE3DN;
if (EDX & (1U << 31)) type_ |= t3DN;
} }
if (maxExtendedNum >= 0x80000008) { if (maxExtendedNum >= 0x80000008) {
@@ -475,16 +607,17 @@ public:
getCpuid(1, data); getCpuid(1, data);
if (ECX & (1U << 0)) type_ |= tSSE3; if (ECX & (1U << 0)) type_ |= tSSE3;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ;
if (ECX & (1U << 9)) type_ |= tSSSE3; if (ECX & (1U << 9)) type_ |= tSSSE3;
if (ECX & (1U << 19)) type_ |= tSSE41; if (ECX & (1U << 19)) type_ |= tSSE41;
if (ECX & (1U << 20)) type_ |= tSSE42; if (ECX & (1U << 20)) type_ |= tSSE42;
if (ECX & (1U << 22)) type_ |= tMOVBE; if (ECX & (1U << 22)) type_ |= tMOVBE;
if (ECX & (1U << 23)) type_ |= tPOPCNT; if (ECX & (1U << 23)) type_ |= tPOPCNT;
if (ECX & (1U << 25)) type_ |= tAESNI; if (ECX & (1U << 25)) type_ |= tAESNI;
if (ECX & (1U << 1)) type_ |= tPCLMULQDQ; if (ECX & (1U << 26)) type_ |= tXSAVE;
if (ECX & (1U << 27)) type_ |= tOSXSAVE; if (ECX & (1U << 27)) type_ |= tOSXSAVE;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (ECX & (1U << 29)) type_ |= tF16C; if (ECX & (1U << 29)) type_ |= tF16C;
if (ECX & (1U << 30)) type_ |= tRDRAND;
if (EDX & (1U << 15)) type_ |= tCMOV; if (EDX & (1U << 15)) type_ |= tCMOV;
if (EDX & (1U << 23)) type_ |= tMMX; if (EDX & (1U << 23)) type_ |= tMMX;
@@ -495,8 +628,8 @@ public:
// check XFEATURE_ENABLED_MASK[2:1] = '11b' // check XFEATURE_ENABLED_MASK[2:1] = '11b'
uint64_t bv = getXfeature(); uint64_t bv = getXfeature();
if ((bv & 6) == 6) { if ((bv & 6) == 6) {
if (ECX & (1U << 28)) type_ |= tAVX;
if (ECX & (1U << 12)) type_ |= tFMA; if (ECX & (1U << 12)) type_ |= tFMA;
if (ECX & (1U << 28)) type_ |= tAVX;
// do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support // do *not* check AVX-512 state on macOS because it has on-demand AVX-512 support
#if !defined(__APPLE__) #if !defined(__APPLE__)
if (((bv >> 5) & 7) == 7) if (((bv >> 5) & 7) == 7)
@@ -530,39 +663,63 @@ public:
const uint32_t maxNumSubLeaves = EAX; const uint32_t maxNumSubLeaves = EAX;
if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2; if (type_ & tAVX && (EBX & (1U << 5))) type_ |= tAVX2;
if (EBX & (1U << 3)) type_ |= tBMI1; if (EBX & (1U << 3)) type_ |= tBMI1;
if (EBX & (1U << 4)) type_ |= tHLE;
if (EBX & (1U << 8)) type_ |= tBMI2; if (EBX & (1U << 8)) type_ |= tBMI2;
if (EBX & (1U << 9)) type_ |= tENHANCED_REP; if (EBX & (1U << 9)) type_ |= tENHANCED_REP;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 18)) type_ |= tRDSEED; if (EBX & (1U << 18)) type_ |= tRDSEED;
if (EBX & (1U << 19)) type_ |= tADX; if (EBX & (1U << 19)) type_ |= tADX;
if (EBX & (1U << 20)) type_ |= tSMAP; if (EBX & (1U << 20)) type_ |= tSMAP;
if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT; if (EBX & (1U << 23)) type_ |= tCLFLUSHOPT;
if (EBX & (1U << 4)) type_ |= tHLE; if (EBX & (1U << 24)) type_ |= tCLWB;
if (EBX & (1U << 11)) type_ |= tRTM;
if (EBX & (1U << 14)) type_ |= tMPX;
if (EBX & (1U << 29)) type_ |= tSHA; if (EBX & (1U << 29)) type_ |= tSHA;
if (ECX & (1U << 0)) type_ |= tPREFETCHWT1; if (ECX & (1U << 0)) type_ |= tPREFETCHWT1;
if (ECX & (1U << 5)) type_ |= tWAITPKG; if (ECX & (1U << 5)) type_ |= tWAITPKG;
if (ECX & (1U << 8)) type_ |= tGFNI; if (ECX & (1U << 8)) type_ |= tGFNI;
if (ECX & (1U << 9)) type_ |= tVAES; if (ECX & (1U << 9)) type_ |= tVAES;
if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ; if (ECX & (1U << 10)) type_ |= tVPCLMULQDQ;
if (ECX & (1U << 23)) type_ |= tKEYLOCKER;
if (ECX & (1U << 25)) type_ |= tCLDEMOTE; if (ECX & (1U << 25)) type_ |= tCLDEMOTE;
if (ECX & (1U << 27)) type_ |= tMOVDIRI; if (ECX & (1U << 27)) type_ |= tMOVDIRI;
if (ECX & (1U << 28)) type_ |= tMOVDIR64B; if (ECX & (1U << 28)) type_ |= tMOVDIR64B;
if (EDX & (1U << 5)) type_ |= tUINTR;
if (EDX & (1U << 14)) type_ |= tSERIALIZE;
if (EDX & (1U << 16)) type_ |= tTSXLDTRK;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (EDX & (1U << 24)) type_ |= tAMX_TILE; if (EDX & (1U << 24)) type_ |= tAMX_TILE;
if (EDX & (1U << 25)) type_ |= tAMX_INT8; if (EDX & (1U << 25)) type_ |= tAMX_INT8;
if (EDX & (1U << 22)) type_ |= tAMX_BF16;
if (maxNumSubLeaves >= 1) { if (maxNumSubLeaves >= 1) {
getCpuidEx(7, 1, data); getCpuidEx(7, 1, data);
if (EAX & (1U << 0)) type_ |= tSHA512;
if (EAX & (1U << 1)) type_ |= tSM3;
if (EAX & (1U << 2)) type_ |= tSM4;
if (EAX & (1U << 3)) type_ |= tRAO_INT;
if (EAX & (1U << 4)) type_ |= tAVX_VNNI; if (EAX & (1U << 4)) type_ |= tAVX_VNNI;
if (type_ & tAVX512F) { if (type_ & tAVX512F) {
if (EAX & (1U << 5)) type_ |= tAVX512_BF16; if (EAX & (1U << 5)) type_ |= tAVX512_BF16;
} }
if (EAX & (1U << 7)) type_ |= tCMPCCXADD;
if (EAX & (1U << 21)) type_ |= tAMX_FP16; if (EAX & (1U << 21)) type_ |= tAMX_FP16;
if (EAX & (1U << 23)) type_ |= tAVX_IFMA; if (EAX & (1U << 23)) type_ |= tAVX_IFMA;
if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8; if (EDX & (1U << 4)) type_ |= tAVX_VNNI_INT8;
if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT; if (EDX & (1U << 5)) type_ |= tAVX_NE_CONVERT;
if (EDX & (1U << 10)) type_ |= tAVX_VNNI_INT16;
if (EDX & (1U << 14)) type_ |= tPREFETCHITI;
if (EDX & (1U << 19)) type_ |= tAVX10;
if (EDX & (1U << 21)) type_ |= tAPX_F;
} }
} }
if (maxNum >= 0x19) {
getCpuidEx(0x19, 0, data);
if (EBX & (1U << 0)) type_ |= tAESKLE;
if (EBX & (1U << 2)) type_ |= tWIDE_KL;
if (type_ & (tKEYLOCKER|tAESKLE|tWIDE_KL)) type_ |= tKEYLOCKER_WIDE;
}
if (has(tAVX10) && maxNum >= 0x24) {
getCpuidEx(0x24, 0, data);
avx10version_ = EBX & mask(7);
}
setFamily(); setFamily();
setNumCores(); setNumCores();
setCacheHierarchy(); setCacheHierarchy();
@@ -579,6 +736,7 @@ public:
{ {
return (type & type_) == type; return (type & type_) == type;
} }
int getAVX10version() const { return avx10version_; }
}; };
#ifndef XBYAK_ONLY_CLASS_CPU #ifndef XBYAK_ONLY_CLASS_CPU