This commit is contained in:
2026-03-23 12:13:12 +01:00
commit a615280259
153 changed files with 33843 additions and 0 deletions
+94
View File
@@ -0,0 +1,94 @@
// Tile configuration, tile load/store, and tile dot-product calls.
// Memory-operand calls are repeated with r30/r29 as base/index in addition
// to the rax..r15-range registers.
ldtilecfg(ptr[rax + rcx * 4 + 64]);
ldtilecfg(ptr [r30+r29*4+0x12]);
ldtilecfg(ptr [rax]);
sttilecfg(ptr[rsp + rax * 8 + 128]);
sttilecfg(ptr [r30+r29*4+0x12]);
sttilecfg(ptr [r30]);
// Tile loads (plain and t1 variants), release, store, and zero.
tileloadd(tmm3, ptr[rdi + rdx * 2 + 8]);
tileloadd(tmm2, ptr [r30+r29*4+0x12]);
tileloaddt1(tmm4, ptr[r8 + r9 + 32]);
tileloaddt1(tmm7, ptr [r30+r29*4+0x12]);
tilerelease();
tilestored(ptr[r10 + r11 * 2 + 32], tmm2);
tilestored(ptr [r30+r29*4+0x12], tmm1);
tilezero(tmm7);
// Three-tile-operand dot-product calls.
tdpbssd(tmm1, tmm2, tmm3);
tdpbsud(tmm2, tmm3, tmm4);
tdpbusd(tmm3, tmm4, tmm5);
tdpbuud(tmm4, tmm5, tmm6);
tdpfp16ps(tmm5, tmm6, tmm7);
tdpbf16ps(tmm5, tmm6, tmm7);
// tileloadd with further addressing shapes: the same register as base and
// index, base+index*scale without displacement, an explicit *1 scale, and
// r30/r29 with a displacement.
tileloadd(tmm1, ptr[r8+r8]);
tileloadd(tmm1, ptr[rax+rcx*4]);
tileloadd(tmm1, ptr[r8+r9*1+0x40]);
tileloadd(tmm1, ptr[r30+r29*1+0x80]);
// tileloaddrs / tileloaddrst1: each called once with an rdi/r8 base and once
// with an r31/r25 base.
tileloaddrs(tmm3, ptr[rdi + rdx * 2 + 8]);
tileloaddrs(tmm7, ptr[r31 + rdx * 2 + 8]);
tileloaddrst1(tmm4, ptr[r8 + r9 + 32]);
tileloaddrst1(tmm4, ptr[r25 + r9 + 32]);
// Three-tile-operand calls, all on tmm1, tmm2, tmm3.
tdpbf8ps(tmm1, tmm2, tmm3);
tdpbhf8ps(tmm1, tmm2, tmm3);
tdphbf8ps(tmm1, tmm2, tmm3);
tdphf8ps(tmm1, tmm2, tmm3);
tmmultf32ps(tmm1, tmm2, tmm3);
// t2rpntlvwz{0,1} and their t1 / rs / rst1 variants.
// Every variant is called twice: tmm1 with [rax+r8*2+0x80] and tmm7 with
// [r30+r8*2+0x80].
t2rpntlvwz0(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz0(tmm7, ptr[r30+r8*2+0x80]);
t2rpntlvwz0t1(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz0t1(tmm7, ptr[r30+r8*2+0x80]);
t2rpntlvwz1(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz1(tmm7, ptr[r30+r8*2+0x80]);
t2rpntlvwz1t1(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz1t1(tmm7, ptr[r30+r8*2+0x80]);
// "rs" variants, same operand pairs as above.
t2rpntlvwz0rs(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz0rs(tmm7, ptr[r30+r8*2+0x80]);
t2rpntlvwz0rst1(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz0rst1(tmm7, ptr[r30+r8*2+0x80]);
t2rpntlvwz1rs(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz1rs(tmm7, ptr[r30+r8*2+0x80]);
t2rpntlvwz1rst1(tmm1, ptr[rax+r8*2+0x80]);
t2rpntlvwz1rst1(tmm7, ptr[r30+r8*2+0x80]);
// Tile-only calls taking three (or, for tconjtfp16, two) tmm operands.
tcmmimfp16ps(tmm1, tmm2, tmm3);
tcmmrlfp16ps(tmm1, tmm2, tmm3);
tconjtcmmimfp16ps(tmm1, tmm2, tmm3);
tconjtfp16(tmm1, tmm2);
// Tile-row to zmm calls. Each mnemonic is called with the third operand as a
// 32-bit register (r30d) and as an immediate (0x12).
tcvtrowps2bf16h(zmm1, tmm2, r30d);
tcvtrowps2bf16h(zmm29, tmm2, 0x12);
tcvtrowps2bf16l(zmm1, tmm2, r30d);
tcvtrowps2bf16l(zmm29, tmm2, 0x12);
tcvtrowps2phh(zmm1, tmm2, r30d);
tcvtrowps2phh(zmm29, tmm2, 0x12);
tcvtrowps2phl(zmm1, tmm2, r30d);
tcvtrowps2phl(zmm29, tmm2, 0x12);
tilemovrow(zmm1, tmm2, r30d);
tilemovrow(zmm29, tmm2, 0x12);
// "tt"-prefixed tile calls and ttransposed.
ttcmmimfp16ps(tmm1, tmm2, tmm3);
ttcmmrlfp16ps(tmm1, tmm2, tmm3);
ttdpbf16ps(tmm1, tmm2, tmm3);
ttdpfp16ps(tmm1, tmm2, tmm3);
ttmmultf32ps(tmm1, tmm2, tmm3);
ttransposed(tmm1, tmm2);
// tcvtrowd2ps: register and immediate third operand, as for the row calls above.
tcvtrowd2ps(zmm20, tmm1, r30d);
tcvtrowd2ps(zmm20, tmm1, 0x12);
+21
View File
@@ -0,0 +1,21 @@
// https://github.com/herumi/xbyak/pull/202
// Three-operand shift/rotate calls with an immediate count.
// Register-source forms: one call per mnemonic (sal, sar, shl, shr, rcl, rcr, rol, ror).
sal(rax, r8, 1);
sar(rax, r9, 4);
shl(rax, rdi, 8);
shr(rax, rsi, 12);
rcl(rax, r10, 16);
rcr(rax, r11, 20);
rol(rax, r14, 24);
ror(rax, r15, 28);
// Memory-source forms: same mnemonic order and base registers as the
// register-source calls above, so every mnemonic is covered in both forms.
sal(rcx, qword[r8], 32);
sar(rcx, qword[r9], 36);
shl(rcx, qword[rdi], 40);
shr(rcx, qword[rsi], 44);
rcl(rcx, qword[r10], 48);
rcr(rcx, qword[r11], 52);
rol(rcx, qword[r14], 56);
ror(rcx, qword[r15], 60);
// Three-operand imul with a register source and with a memory source.
imul(rax, rdx, r10);
imul(rcx, r15, qword[rdi]);
+210
View File
@@ -0,0 +1,210 @@
// bf16 arithmetic calls (add, div, max, min, mul, scalef, sub).
// Each mnemonic is called in four forms:
//   xmm registers only,
//   ymm with k1 mask and a ptr memory source,
//   ymm with k1 mask and a ptr_b (broadcast) memory source,
//   zmm with k2 mask, T_z, and a ptr_b memory source.
vaddbf16(xm1, xm2, xm3);
vaddbf16(ym1|k1, ym2, ptr[rax+128]);
vaddbf16(ym1|k1, ym2, ptr_b[rax+128]);
vaddbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vdivbf16(xm1, xm2, xm3);
vdivbf16(ym1|k1, ym2, ptr[rax+128]);
vdivbf16(ym1|k1, ym2, ptr_b[rax+128]);
vdivbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vmaxbf16(xm1, xm2, xm3);
vmaxbf16(ym1|k1, ym2, ptr[rax+128]);
vmaxbf16(ym1|k1, ym2, ptr_b[rax+128]);
vmaxbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vminbf16(xm1, xm2, xm3);
vminbf16(ym1|k1, ym2, ptr[rax+128]);
vminbf16(ym1|k1, ym2, ptr_b[rax+128]);
vminbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vmulbf16(xm1, xm2, xm3);
vmulbf16(ym1|k1, ym2, ptr[rax+128]);
vmulbf16(ym1|k1, ym2, ptr_b[rax+128]);
vmulbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vscalefbf16(xm1, xm2, xm3);
vscalefbf16(ym1|k1, ym2, ptr[rax+128]);
vscalefbf16(ym1|k1, ym2, ptr_b[rax+128]);
vscalefbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vsubbf16(xm1, xm2, xm3);
vsubbf16(ym1|k1, ym2, ptr[rax+128]);
vsubbf16(ym1|k1, ym2, ptr_b[rax+128]);
vsubbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// bf16 fused multiply-add family. For each of madd / nmadd / msub / nmsub the
// 132, 213 and 231 operand orders are called in the same four forms:
// xmm registers, ymm|k1 with ptr, ymm|k1 with ptr_b, zmm|k2|T_z with ptr_b.
// madd
vfmadd132bf16(xm1, xm2, xm3);
vfmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmadd213bf16(xm1, xm2, xm3);
vfmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmadd231bf16(xm1, xm2, xm3);
vfmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
vfmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmadd
vfnmadd132bf16(xm1, xm2, xm3);
vfnmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmadd213bf16(xm1, xm2, xm3);
vfnmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmadd231bf16(xm1, xm2, xm3);
vfnmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// msub
vfmsub132bf16(xm1, xm2, xm3);
vfmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmsub213bf16(xm1, xm2, xm3);
vfmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfmsub231bf16(xm1, xm2, xm3);
vfmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
vfmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// nmsub
vfnmsub132bf16(xm1, xm2, xm3);
vfnmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmsub213bf16(xm1, xm2, xm3);
vfnmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
vfnmsub231bf16(xm1, xm2, xm3);
vfnmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
vfnmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
vfnmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
// vcmpbf16: mask-register destination; register, ptr and ptr_b sources at
// xmm/ymm/zmm width; immediates 5 through 10.
vcmpbf16(k1, xm5, xm4, 5);
vcmpbf16(k2, ym5, ym4, 6);
vcmpbf16(k3, ym15, ptr_b[rax+128], 7);
vcmpbf16(k4, zm30, zm20, 8);
vcmpbf16(k5, zm1, ptr[rax+128], 9);
vcmpbf16(k6, zm10, ptr_b[rax+128], 10);
// vfpclassbf16: mask-register destination, mostly combined with k5 as the
// write mask. Memory sources carry an explicit size (xword/yword/zword and
// their _b broadcast forms) because the destination does not imply one.
vfpclassbf16(k1, xm4, 5);
vfpclassbf16(k2|k5, ym4, 6);
vfpclassbf16(k3|k5, zm20, 7);
vfpclassbf16(k3|k5, xword[rax+128], 8);
vfpclassbf16(k3, xword_b[rax+128], 9);
vfpclassbf16(k5|k5, yword[rax+128], 10);
vfpclassbf16(k6|k5, yword_b[rax+128], 11);
vfpclassbf16(k7|k5, zword[rax+128], 12);
vfpclassbf16(k7|k5, zword_b[rax+128], 13);
// vcomisbf16: xmm register and memory source.
vcomisbf16(xm2, xm3);
vcomisbf16(xm2, ptr[rax+128]);
// vgetexpbf16: xmm/ymm/zmm destinations with k3 mask; register, ptr and
// ptr_b sources for each width.
vgetexpbf16(xm1|k3, xmm2);
vgetexpbf16(xm1|k3, ptr[rax+128]);
vgetexpbf16(xm1|k3, ptr_b[rax+128]);
vgetexpbf16(ym1|k3, ymm2);
vgetexpbf16(ym1|k3, ptr[rax+128]);
vgetexpbf16(ym1|k3, ptr_b[rax+128]);
vgetexpbf16(zm1|k3, zmm2);
vgetexpbf16(zm1|k3, ptr[rax+128]);
vgetexpbf16(zm1|k3, ptr_b[rax+128]);
// vgetmantbf16: same operand matrix; immediate 3 for register, 5 for ptr,
// 9 for ptr_b sources.
vgetmantbf16(xm1|k3, xmm2, 3);
vgetmantbf16(xm1|k3, ptr[rax+128], 5);
vgetmantbf16(xm1|k3, ptr_b[rax+128], 9);
vgetmantbf16(ym1|k3, ymm2, 3);
vgetmantbf16(ym1|k3, ptr[rax+128], 5);
vgetmantbf16(ym1|k3, ptr_b[rax+128], 9);
vgetmantbf16(zm1|k3, zmm2, 3);
vgetmantbf16(zm1|k3, ptr[rax+128], 5);
vgetmantbf16(zm1|k3, ptr_b[rax+128], 9);
// vrcpbf16: xmm/ymm/zmm destinations with k5 mask; register, ptr and ptr_b
// sources (rcx-based addresses).
vrcpbf16(xm1|k5, xm2);
vrcpbf16(xm1|k5, ptr[rcx+128]);
vrcpbf16(xm1|k5, ptr_b[rcx+128]);
vrcpbf16(ym1|k5, ym2);
vrcpbf16(ym1|k5, ptr[rcx+128]);
vrcpbf16(ym1|k5, ptr_b[rcx+128]);
vrcpbf16(zm1|k5, zm2);
vrcpbf16(zm1|k5, ptr[rcx+128]);
vrcpbf16(zm1|k5, ptr_b[rcx+128]);
// vreducebf16: same operand matrix with k4 mask and immediate 1.
vreducebf16(xm1|k4, xm2, 1);
vreducebf16(xm1|k4, ptr[rax+128], 1);
vreducebf16(xm1|k4, ptr_b[rax+128], 1);
vreducebf16(ym1|k4, ym2, 1);
vreducebf16(ym1|k4, ptr[rax+128], 1);
vreducebf16(ym1|k4, ptr_b[rax+128], 1);
vreducebf16(zm1|k4, zm2, 1);
vreducebf16(zm1|k4, ptr[rax+128], 1);
vreducebf16(zm1|k4, ptr_b[rax+128], 1);
// vrndscalebf16: same operand matrix with k4 mask and immediate 1.
vrndscalebf16(xm1|k4, xm2, 1);
vrndscalebf16(xm1|k4, ptr[rax+128], 1);
vrndscalebf16(xm1|k4, ptr_b[rax+128], 1);
vrndscalebf16(ym1|k4, ym2, 1);
vrndscalebf16(ym1|k4, ptr[rax+128], 1);
vrndscalebf16(ym1|k4, ptr_b[rax+128], 1);
vrndscalebf16(zm1|k4, zm2, 1);
vrndscalebf16(zm1|k4, ptr[rax+128], 1);
vrndscalebf16(zm1|k4, ptr_b[rax+128], 1);
// vrsqrtbf16: xmm/ymm/zmm destinations with k5 mask; register, ptr and ptr_b
// sources.
vrsqrtbf16(xm1|k5, xm2);
vrsqrtbf16(xm1|k5, ptr[rcx+128]);
vrsqrtbf16(xm1|k5, ptr_b[rcx+128]);
vrsqrtbf16(ym1|k5, ym2);
vrsqrtbf16(ym1|k5, ptr[rcx+128]);
vrsqrtbf16(ym1|k5, ptr_b[rcx+128]);
vrsqrtbf16(zm1|k5, zm2);
vrsqrtbf16(zm1|k5, ptr[rcx+128]);
vrsqrtbf16(zm1|k5, ptr_b[rcx+128]);
// vscalefbf16 with k5 mask: three-operand form for each width, using
// xm5 / ym9 / zm30 as the second operand.
vscalefbf16(xm1|k5, xm5, xm2);
vscalefbf16(xm1|k5, xm5, ptr[rcx+128]);
vscalefbf16(xm1|k5, xm5, ptr_b[rcx+128]);
vscalefbf16(ym1|k5, ym9, ym2);
vscalefbf16(ym1|k5, ym9, ptr[rcx+128]);
vscalefbf16(ym1|k5, ym9, ptr_b[rcx+128]);
vscalefbf16(zm1|k5, zm30, zm2);
vscalefbf16(zm1|k5, zm30, ptr[rcx+128]);
vscalefbf16(zm1|k5, zm30, ptr_b[rcx+128]);
// vsqrtbf16: destination register 5 with k3 mask; register, ptr and ptr_b
// sources for each width.
vsqrtbf16(xm5|k3, xmm4);
vsqrtbf16(xm5|k3, ptr[rax+128]);
vsqrtbf16(xm5|k3, ptr_b[rax+128]);
vsqrtbf16(ym5|k3, ymm4);
vsqrtbf16(ym5|k3, ptr[rax+128]);
vsqrtbf16(ym5|k3, ptr_b[rax+128]);
vsqrtbf16(zm5|k3, zmm4);
vsqrtbf16(zm5|k3, ptr[rax+128]);
vsqrtbf16(zm5|k3, ptr_b[rax+128]);
+17
View File
@@ -0,0 +1,17 @@
// vcomx* / vucomx* scalar compares for sd, sh and ss.
// Each mnemonic is called with a register source carrying T_sae and with a
// plain memory source.
vcomxsd(xm1, xm2|T_sae);
vcomxsd(xm1, ptr[rax+128]);
vcomxsh(xm1, xm2|T_sae);
vcomxsh(xm1, ptr[rax+128]);
vcomxss(xm1, xm2|T_sae);
vcomxss(xm1, ptr[rax+128]);
// Same pattern for the vucomx* mnemonics.
vucomxsd(xm1, xm2|T_sae);
vucomxsd(xm1, ptr[rax+128]);
vucomxsh(xm1, xm2|T_sae);
vucomxsh(xm1, ptr[rax+128]);
vucomxss(xm1, xm2|T_sae);
vucomxss(xm1, ptr[rax+128]);
+200
View File
@@ -0,0 +1,200 @@
// vcvt2ps2phx: xmm/ymm/zmm destinations with k5 mask; register, ptr and
// ptr_b third operand for each width.
vcvt2ps2phx(xm1|k5, xm2, xm3);
vcvt2ps2phx(xm1|k5, xm2, ptr[rax+128]);
vcvt2ps2phx(xm1|k5, xm2, ptr_b[rax+128]);
vcvt2ps2phx(ym1|k5, ym2, ym3);
vcvt2ps2phx(ym1|k5, ym2, ptr[rax+128]);
vcvt2ps2phx(ym1|k5, ym2, ptr_b[rax+128]);
vcvt2ps2phx(zm1|k5, zm2, zm3);
vcvt2ps2phx(zm1|k5, zm2, ptr[rax+128]);
vcvt2ps2phx(zm1|k5, zm2, ptr_b[rax+128]);
// vcvtbiasph2{bf8,bf8s,hf8,hf8s}: in every group the destination is xmm for
// xmm and ymm sources and ymm for zmm sources, always with k2 mask; the third
// operand is a register, ptr, or ptr_b.
// vcvtbiasph2bf8
vcvtbiasph2bf8(xm1|k2, xm3, xm5);
vcvtbiasph2bf8(xm1|k2, xm3, ptr[rax+128]);
vcvtbiasph2bf8(xm1|k2, xm3, ptr_b[rax+128]);
vcvtbiasph2bf8(xm1|k2, ym3, ym5);
vcvtbiasph2bf8(xm1|k2, ym3, ptr[rax+128]);
vcvtbiasph2bf8(xm1|k2, ym3, ptr_b[rax+128]);
vcvtbiasph2bf8(ym1|k2, zm3, zm5);
vcvtbiasph2bf8(ym1|k2, zm3, ptr[rax+128]);
vcvtbiasph2bf8(ym1|k2, zm3, ptr_b[rax+128]);
// vcvtbiasph2bf8s
vcvtbiasph2bf8s(xm1|k2, xm3, xm5);
vcvtbiasph2bf8s(xm1|k2, xm3, ptr[rax+128]);
vcvtbiasph2bf8s(xm1|k2, xm3, ptr_b[rax+128]);
vcvtbiasph2bf8s(xm1|k2, ym3, ym5);
vcvtbiasph2bf8s(xm1|k2, ym3, ptr[rax+128]);
vcvtbiasph2bf8s(xm1|k2, ym3, ptr_b[rax+128]);
vcvtbiasph2bf8s(ym1|k2, zm3, zm5);
vcvtbiasph2bf8s(ym1|k2, zm3, ptr[rax+128]);
vcvtbiasph2bf8s(ym1|k2, zm3, ptr_b[rax+128]);
// vcvtbiasph2hf8
vcvtbiasph2hf8(xm1|k2, xm3, xm5);
vcvtbiasph2hf8(xm1|k2, xm3, ptr[rax+128]);
vcvtbiasph2hf8(xm1|k2, xm3, ptr_b[rax+128]);
vcvtbiasph2hf8(xm1|k2, ym3, ym5);
vcvtbiasph2hf8(xm1|k2, ym3, ptr[rax+128]);
vcvtbiasph2hf8(xm1|k2, ym3, ptr_b[rax+128]);
vcvtbiasph2hf8(ym1|k2, zm3, zm5);
vcvtbiasph2hf8(ym1|k2, zm3, ptr[rax+128]);
vcvtbiasph2hf8(ym1|k2, zm3, ptr_b[rax+128]);
// vcvtbiasph2hf8s
vcvtbiasph2hf8s(xm1|k2, xm3, xm5);
vcvtbiasph2hf8s(xm1|k2, xm3, ptr[rax+128]);
vcvtbiasph2hf8s(xm1|k2, xm3, ptr_b[rax+128]);
vcvtbiasph2hf8s(xm1|k2, ym3, ym5);
vcvtbiasph2hf8s(xm1|k2, ym3, ptr[rax+128]);
vcvtbiasph2hf8s(xm1|k2, ym3, ptr_b[rax+128]);
vcvtbiasph2hf8s(ym1|k2, zm3, zm5);
vcvtbiasph2hf8s(ym1|k2, zm3, ptr[rax+128]);
vcvtbiasph2hf8s(ym1|k2, zm3, ptr_b[rax+128]);
// vcvthf82ph: xmm/ymm/zmm destinations with k5 mask and T_z. The register
// source is xmm for xmm and ymm destinations and ymm for the zmm destination;
// each width also has a ptr source.
vcvthf82ph(xm1|k5|T_z, xm2);
vcvthf82ph(xm1|k5|T_z, ptr[rax+128]);
vcvthf82ph(ym1|k5|T_z, xm2);
vcvthf82ph(ym1|k5|T_z, ptr[rax+128]);
vcvthf82ph(zm1|k5|T_z, ym2);
vcvthf82ph(zm1|k5|T_z, ptr[rax+128]);
// vcvt2ph2{bf8,bf8s,hf8,hf8s}: for each width the three calls use, in order,
// k4|T_z with a register source, k4 alone with ptr, and T_z alone (no mask
// register) with ptr_b.
// vcvt2ph2bf8
vcvt2ph2bf8(xm1|k4|T_z, xm2, xm3);
vcvt2ph2bf8(xm1|k4, xm2, ptr[rax+128]);
vcvt2ph2bf8(xm1|T_z, xm2, ptr_b[rax+128]);
vcvt2ph2bf8(ym1|k4|T_z, ym2, ym3);
vcvt2ph2bf8(ym1|k4, ym2, ptr[rax+128]);
vcvt2ph2bf8(ym1|T_z, ym2, ptr_b[rax+128]);
vcvt2ph2bf8(zm1|k4|T_z, zm2, zm3);
vcvt2ph2bf8(zm1|k4, zm2, ptr[rax+128]);
vcvt2ph2bf8(zm1|T_z, zm2, ptr_b[rax+128]);
// vcvt2ph2bf8s
vcvt2ph2bf8s(xm1|k4|T_z, xm2, xm3);
vcvt2ph2bf8s(xm1|k4, xm2, ptr[rax+128]);
vcvt2ph2bf8s(xm1|T_z, xm2, ptr_b[rax+128]);
vcvt2ph2bf8s(ym1|k4|T_z, ym2, ym3);
vcvt2ph2bf8s(ym1|k4, ym2, ptr[rax+128]);
vcvt2ph2bf8s(ym1|T_z, ym2, ptr_b[rax+128]);
vcvt2ph2bf8s(zm1|k4|T_z, zm2, zm3);
vcvt2ph2bf8s(zm1|k4, zm2, ptr[rax+128]);
vcvt2ph2bf8s(zm1|T_z, zm2, ptr_b[rax+128]);
// vcvt2ph2hf8
vcvt2ph2hf8(xm1|k4|T_z, xm2, xm3);
vcvt2ph2hf8(xm1|k4, xm2, ptr[rax+128]);
vcvt2ph2hf8(xm1|T_z, xm2, ptr_b[rax+128]);
vcvt2ph2hf8(ym1|k4|T_z, ym2, ym3);
vcvt2ph2hf8(ym1|k4, ym2, ptr[rax+128]);
vcvt2ph2hf8(ym1|T_z, ym2, ptr_b[rax+128]);
vcvt2ph2hf8(zm1|k4|T_z, zm2, zm3);
vcvt2ph2hf8(zm1|k4, zm2, ptr[rax+128]);
vcvt2ph2hf8(zm1|T_z, zm2, ptr_b[rax+128]);
// vcvt2ph2hf8s
vcvt2ph2hf8s(xm1|k4|T_z, xm2, xm3);
vcvt2ph2hf8s(xm1|k4, xm2, ptr[rax+128]);
vcvt2ph2hf8s(xm1|T_z, xm2, ptr_b[rax+128]);
vcvt2ph2hf8s(ym1|k4|T_z, ym2, ym3);
vcvt2ph2hf8s(ym1|k4, ym2, ptr[rax+128]);
vcvt2ph2hf8s(ym1|T_z, ym2, ptr_b[rax+128]);
vcvt2ph2hf8s(zm1|k4|T_z, zm2, zm3);
vcvt2ph2hf8s(zm1|k4, zm2, ptr[rax+128]);
vcvt2ph2hf8s(zm1|T_z, zm2, ptr_b[rax+128]);
// vcvtph2{bf8,bf8s,hf8,hf8s}: the destination is xmm for 128- and 256-bit
// sources and ymm for 512-bit sources, always with k2|T_z. Memory sources
// spell out their size (xword/yword/zword, plus _b broadcast forms) since the
// xmm destination is shared by two source widths.
// vcvtph2bf8
vcvtph2bf8(xmm1|k2|T_z, xmm2);
vcvtph2bf8(xmm1|k2|T_z, xword [rax+128]);
vcvtph2bf8(xmm1|k2|T_z, xword_b[rax+128]);
vcvtph2bf8(xmm1|k2|T_z, ymm2);
vcvtph2bf8(xmm1|k2|T_z, yword[rax+128]);
vcvtph2bf8(xmm1|k2|T_z, yword_b[rax+128]);
vcvtph2bf8(ymm1|k2|T_z, zmm2);
vcvtph2bf8(ymm1|k2|T_z, zword[rax+128]);
vcvtph2bf8(ymm1|k2|T_z, zword_b[rax+128]);
// vcvtph2bf8s
vcvtph2bf8s(xmm1|k2|T_z, xmm2);
vcvtph2bf8s(xmm1|k2|T_z, xword [rax+128]);
vcvtph2bf8s(xmm1|k2|T_z, xword_b[rax+128]);
vcvtph2bf8s(xmm1|k2|T_z, ymm2);
vcvtph2bf8s(xmm1|k2|T_z, yword[rax+128]);
vcvtph2bf8s(xmm1|k2|T_z, yword_b[rax+128]);
vcvtph2bf8s(ymm1|k2|T_z, zmm2);
vcvtph2bf8s(ymm1|k2|T_z, zword[rax+128]);
vcvtph2bf8s(ymm1|k2|T_z, zword_b[rax+128]);
// vcvtph2hf8
vcvtph2hf8(xmm1|k2|T_z, xmm2);
vcvtph2hf8(xmm1|k2|T_z, xword [rax+128]);
vcvtph2hf8(xmm1|k2|T_z, xword_b[rax+128]);
vcvtph2hf8(xmm1|k2|T_z, ymm2);
vcvtph2hf8(xmm1|k2|T_z, yword[rax+128]);
vcvtph2hf8(xmm1|k2|T_z, yword_b[rax+128]);
vcvtph2hf8(ymm1|k2|T_z, zmm2);
vcvtph2hf8(ymm1|k2|T_z, zword[rax+128]);
vcvtph2hf8(ymm1|k2|T_z, zword_b[rax+128]);
// vcvtph2hf8s
vcvtph2hf8s(xmm1|k2|T_z, xmm2);
vcvtph2hf8s(xmm1|k2|T_z, xword [rax+128]);
vcvtph2hf8s(xmm1|k2|T_z, xword_b[rax+128]);
vcvtph2hf8s(xmm1|k2|T_z, ymm2);
vcvtph2hf8s(xmm1|k2|T_z, yword[rax+128]);
vcvtph2hf8s(xmm1|k2|T_z, yword_b[rax+128]);
vcvtph2hf8s(ymm1|k2|T_z, zmm2);
vcvtph2hf8s(ymm1|k2|T_z, zword[rax+128]);
vcvtph2hf8s(ymm1|k2|T_z, zword_b[rax+128]);
// AVX-NE-CONVERT
// Every memory-source mnemonic is called once with an xmm destination and
// once with a ymm destination, so both vector widths are emitted.
vbcstnebf162ps(xmm15, ptr[rax+128]);
vbcstnebf162ps(ymm15, ptr[rax+128]);
vbcstnesh2ps(xmm15, ptr[rax+128]);
vbcstnesh2ps(ymm15, ptr[rax+128]);
vcvtneebf162ps(xmm15, ptr[rax+128]);
vcvtneebf162ps(ymm15, ptr[rax+128]);
vcvtneeph2ps(xmm15, ptr[rax+128]);
vcvtneeph2ps(ymm15, ptr[rax+128]);
vcvtneobf162ps(xmm15, ptr[rax+128]);
vcvtneobf162ps(ymm15, ptr[rax+128]);
vcvtneoph2ps(xmm15, ptr[rax+128]);
vcvtneoph2ps(ymm15, ptr[rax+128]);
// vcvtneps2bf16 with VexEncoding requested explicitly. The destination is
// xmm for both source widths, so the 256-bit memory source is written as
// yword to distinguish it from the unsized ptr call that follows the xmm3 form.
vcvtneps2bf16(xmm15, xmm3, VexEncoding);
vcvtneps2bf16(xmm15, ptr[rax+128], VexEncoding);
vcvtneps2bf16(xmm15, ymm3, VexEncoding);
vcvtneps2bf16(xmm15, yword[rax+128], VexEncoding);
+63
View File
@@ -0,0 +1,63 @@
// Packed vminmax* calls. All use k3|T_z on the destination and immediate 5;
// each width is called with a register, ptr, and ptr_b third operand.
// vminmaxbf16 (no T_sae form in this list)
vminmaxbf16(xm1|k3|T_z, xm2, xm3, 5);
vminmaxbf16(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxbf16(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxbf16(ym1|k3|T_z, ym2, ym3, 5);
vminmaxbf16(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxbf16(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxbf16(zm1|k3|T_z, zm2, zm3, 5);
vminmaxbf16(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxbf16(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
// vminmaxpd: adds a zmm register form with T_sae
vminmaxpd(xm1|k3|T_z, xm2, xm3, 5);
vminmaxpd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxpd(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxpd(ym1|k3|T_z, ym2, ym3, 5);
vminmaxpd(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxpd(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxpd(zm1|k3|T_z, zm2, zm3, 5);
vminmaxpd(zm1|k3|T_z, zm2, zm3|T_sae, 5);
vminmaxpd(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxpd(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
// vminmaxph
// NOTE(review): the xmm ptr[rax+128] call appears twice in a row below; the
// second copy looks redundant - confirm whether another form was intended.
vminmaxph(xm1|k3|T_z, xm2, xm3, 5);
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxph(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxph(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxph(ym1|k3|T_z, ym2, ym3, 5);
vminmaxph(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxph(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxph(zm1|k3|T_z, zm2, zm3, 5);
vminmaxph(zm1|k3|T_z, zm2, zm3|T_sae, 5);
vminmaxph(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxph(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
// vminmaxps
vminmaxps(xm1|k3|T_z, xm2, xm3, 5);
vminmaxps(xm1|k3|T_z, xm2, ptr[rax+128], 5);
vminmaxps(xm1|k3|T_z, xm2, ptr_b[rax+128], 5);
vminmaxps(ym1|k3|T_z, ym2, ym3, 5);
vminmaxps(ym1|k3|T_z, ym2, ptr[rax+128], 5);
vminmaxps(ym1|k3|T_z, ym2, ptr_b[rax+128], 5);
vminmaxps(zm1|k3|T_z, zm2, zm3, 5);
vminmaxps(zm1|k3|T_z, zm2, zm3|T_sae, 5);
vminmaxps(zm1|k3|T_z, zm2, ptr[rax+128], 5);
vminmaxps(zm1|k3|T_z, zm2, ptr_b[rax+128], 5);
// Scalar vminmax* calls (sd, sh, ss): register form, register form with
// T_sae, and memory form; all with k3|T_z and immediate 5.
// vminmaxsd
vminmaxsd(xm1|k3|T_z, xm2, xm3, 5);
vminmaxsd(xm1|k3|T_z, xm2, xm3|T_sae, 5);
vminmaxsd(xm1|k3|T_z, xm2, ptr[rax+128], 5);
// vminmaxsh
vminmaxsh(xm1|k3|T_z, xm2, xm3, 5);
vminmaxsh(xm1|k3|T_z, xm2, xm3|T_sae, 5);
vminmaxsh(xm1|k3|T_z, xm2, ptr[rax+128], 5);
// vminmaxss
vminmaxss(xm1|k3|T_z, xm2, xm3, 5);
vminmaxss(xm1|k3|T_z, xm2, xm3|T_sae, 5);
vminmaxss(xm1|k3|T_z, xm2, ptr[rax+128], 5);
+265
View File
@@ -0,0 +1,265 @@
// AVX10 integer and FP16 VNNI, media and zero-extending
// vdpphps: unmasked xmm/ymm/zmm calls with register, ptr and ptr_b third
// operand.
vdpphps(xm1, xm2, xm3);
vdpphps(xm1, xm2, ptr[rax+128]);
vdpphps(xm1, xm2, ptr_b[rax+128]);
vdpphps(ym1, ym2, ym3);
vdpphps(ym1, ym2, ptr[rax+128]);
vdpphps(ym1, ym2, ptr_b[rax+128]);
vdpphps(zm1, zm2, zm3);
vdpphps(zm1, zm2, ptr[rax+128]);
vdpphps(zm1, zm2, ptr_b[rax+128]);
// vmpsadbw: register (immediate 3) and memory (immediate 5) third operand at
// each width. Destination decoration varies per call: none, T_z without a
// mask register, or a k4 mask.
vmpsadbw(xm1, xm3, xm15, 3);
vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5);
vmpsadbw(ym1|k4, ym3, ym15, 3);
vmpsadbw(ym1, ym4, ptr[rax+128], 5);
vmpsadbw(zm1|k4, zm3, zm15, 3);
vmpsadbw(zm1, zm4, ptr[rax+128], 5);
// vpdpbssd: unmasked xmm/ymm/zmm with register, ptr and ptr_b third operand.
vpdpbssd(xm1, xm2, xm3);
vpdpbssd(xm1, xm2, ptr[rax+128]);
vpdpbssd(xm1, xm2, ptr_b[rax+128]);
vpdpbssd(ym1, ym2, ym3);
vpdpbssd(ym1, ym2, ptr[rax+128]);
vpdpbssd(ym1, ym2, ptr_b[rax+128]);
vpdpbssd(zm1, zm2, zm3);
vpdpbssd(zm1, zm2, ptr[rax+128]);
vpdpbssd(zm1, zm2, ptr_b[rax+128]);
// vpdpbssds: same operand matrix.
vpdpbssds(xm1, xm2, xm3);
vpdpbssds(xm1, xm2, ptr[rax+128]);
vpdpbssds(xm1, xm2, ptr_b[rax+128]);
vpdpbssds(ym1, ym2, ym3);
vpdpbssds(ym1, ym2, ptr[rax+128]);
vpdpbssds(ym1, ym2, ptr_b[rax+128]);
vpdpbssds(zm1, zm2, zm3);
vpdpbssds(zm1, zm2, ptr[rax+128]);
vpdpbssds(zm1, zm2, ptr_b[rax+128]);
// vpdpbsud: unmasked xmm/ymm/zmm with register, ptr and ptr_b third operand.
vpdpbsud(xm1, xm2, xm3);
vpdpbsud(xm1, xm2, ptr[rax+128]);
vpdpbsud(xm1, xm2, ptr_b[rax+128]);
vpdpbsud(ym1, ym2, ym3);
vpdpbsud(ym1, ym2, ptr[rax+128]);
vpdpbsud(ym1, ym2, ptr_b[rax+128]);
vpdpbsud(zm1, zm2, zm3);
vpdpbsud(zm1, zm2, ptr[rax+128]);
vpdpbsud(zm1, zm2, ptr_b[rax+128]);
// vpdpbsuds: same operand matrix.
vpdpbsuds(xm1, xm2, xm3);
vpdpbsuds(xm1, xm2, ptr[rax+128]);
vpdpbsuds(xm1, xm2, ptr_b[rax+128]);
vpdpbsuds(ym1, ym2, ym3);
vpdpbsuds(ym1, ym2, ptr[rax+128]);
vpdpbsuds(ym1, ym2, ptr_b[rax+128]);
vpdpbsuds(zm1, zm2, zm3);
vpdpbsuds(zm1, zm2, ptr[rax+128]);
vpdpbsuds(zm1, zm2, ptr_b[rax+128]);
// vpdpbuud: unmasked xmm/ymm/zmm with register, ptr and ptr_b third operand.
vpdpbuud(xm1, xm2, xm3);
vpdpbuud(xm1, xm2, ptr[rax+128]);
vpdpbuud(xm1, xm2, ptr_b[rax+128]);
vpdpbuud(ym1, ym2, ym3);
vpdpbuud(ym1, ym2, ptr[rax+128]);
vpdpbuud(ym1, ym2, ptr_b[rax+128]);
vpdpbuud(zm1, zm2, zm3);
vpdpbuud(zm1, zm2, ptr[rax+128]);
vpdpbuud(zm1, zm2, ptr_b[rax+128]);
// vpdpbuuds: same operand matrix.
vpdpbuuds(xm1, xm2, xm3);
vpdpbuuds(xm1, xm2, ptr[rax+128]);
vpdpbuuds(xm1, xm2, ptr_b[rax+128]);
vpdpbuuds(ym1, ym2, ym3);
vpdpbuuds(ym1, ym2, ptr[rax+128]);
vpdpbuuds(ym1, ym2, ptr_b[rax+128]);
vpdpbuuds(zm1, zm2, zm3);
vpdpbuuds(zm1, zm2, ptr[rax+128]);
vpdpbuuds(zm1, zm2, ptr_b[rax+128]);
// vpdpwsud: unmasked xmm/ymm/zmm with register, ptr and ptr_b third operand.
vpdpwsud(xm1, xm2, xm3);
vpdpwsud(xm1, xm2, ptr[rax+128]);
vpdpwsud(xm1, xm2, ptr_b[rax+128]);
vpdpwsud(ym1, ym2, ym3);
vpdpwsud(ym1, ym2, ptr[rax+128]);
vpdpwsud(ym1, ym2, ptr_b[rax+128]);
vpdpwsud(zm1, zm2, zm3);
vpdpwsud(zm1, zm2, ptr[rax+128]);
vpdpwsud(zm1, zm2, ptr_b[rax+128]);
// vpdpwsuds: same operand matrix.
vpdpwsuds(xm1, xm2, xm3);
vpdpwsuds(xm1, xm2, ptr[rax+128]);
vpdpwsuds(xm1, xm2, ptr_b[rax+128]);
vpdpwsuds(ym1, ym2, ym3);
vpdpwsuds(ym1, ym2, ptr[rax+128]);
vpdpwsuds(ym1, ym2, ptr_b[rax+128]);
vpdpwsuds(zm1, zm2, zm3);
vpdpwsuds(zm1, zm2, ptr[rax+128]);
vpdpwsuds(zm1, zm2, ptr_b[rax+128]);
// vpdpwusd: unmasked xmm/ymm/zmm with register, ptr and ptr_b third operand.
// (vpdpwsud / vpdpwsuds are already covered by the two groups directly above;
// this group and the next cover the "usd" operand-order mnemonics.)
vpdpwusd(xm1, xm2, xm3);
vpdpwusd(xm1, xm2, ptr[rax+128]);
vpdpwusd(xm1, xm2, ptr_b[rax+128]);
vpdpwusd(ym1, ym2, ym3);
vpdpwusd(ym1, ym2, ptr[rax+128]);
vpdpwusd(ym1, ym2, ptr_b[rax+128]);
vpdpwusd(zm1, zm2, zm3);
vpdpwusd(zm1, zm2, ptr[rax+128]);
vpdpwusd(zm1, zm2, ptr_b[rax+128]);
// vpdpwusds: same operand matrix.
vpdpwusds(xm1, xm2, xm3);
vpdpwusds(xm1, xm2, ptr[rax+128]);
vpdpwusds(xm1, xm2, ptr_b[rax+128]);
vpdpwusds(ym1, ym2, ym3);
vpdpwusds(ym1, ym2, ptr[rax+128]);
vpdpwusds(ym1, ym2, ptr_b[rax+128]);
vpdpwusds(zm1, zm2, zm3);
vpdpwusds(zm1, zm2, ptr[rax+128]);
vpdpwusds(zm1, zm2, ptr_b[rax+128]);
// vpdpwuud: unmasked xmm/ymm/zmm with register, ptr and ptr_b third operand.
vpdpwuud(xm1, xm2, xm3);
vpdpwuud(xm1, xm2, ptr[rax+128]);
vpdpwuud(xm1, xm2, ptr_b[rax+128]);
vpdpwuud(ym1, ym2, ym3);
vpdpwuud(ym1, ym2, ptr[rax+128]);
vpdpwuud(ym1, ym2, ptr_b[rax+128]);
vpdpwuud(zm1, zm2, zm3);
vpdpwuud(zm1, zm2, ptr[rax+128]);
vpdpwuud(zm1, zm2, ptr_b[rax+128]);
// vpdpwuuds: same operand matrix.
vpdpwuuds(xm1, xm2, xm3);
vpdpwuuds(xm1, xm2, ptr[rax+128]);
vpdpwuuds(xm1, xm2, ptr_b[rax+128]);
vpdpwuuds(ym1, ym2, ym3);
vpdpwuuds(ym1, ym2, ptr[rax+128]);
vpdpwuuds(ym1, ym2, ptr_b[rax+128]);
vpdpwuuds(zm1, zm2, zm3);
vpdpwuuds(zm1, zm2, ptr[rax+128]);
vpdpwuuds(zm1, zm2, ptr_b[rax+128]);
// vmovd: xmm-to-xmm (once with register index 20, once with low indices
// only), load from memory, and store to memory.
vmovd(xm10, xm20);
vmovd(xm1, xm2);
vmovd(xm10, ptr[rax+128]);
vmovd(ptr[rax+128], xm30);
// vmovw: same four shapes.
vmovw(xm1, xm20);
vmovw(xm1, xm2);
vmovw(xm3, ptr [rax+0x40]);
vmovw(ptr [rax+0x40], xm7);
// push of every 64-bit general-purpose register: the eight legacy names,
// then r8..r15, then r16..r31.
push(rax);
push(rcx);
push(rdx);
push(rbx);
push(rsp);
push(rbp);
push(rsi);
push(rdi);
push(r8);
push(r9);
push(r10);
push(r11);
push(r12);
push(r13);
push(r14);
push(r15);
push(r16);
push(r17);
push(r18);
push(r19);
push(r20);
push(r21);
push(r22);
push(r23);
push(r24);
push(r25);
push(r26);
push(r27);
push(r28);
push(r29);
push(r30);
push(r31);
// pop of the same registers, in the same (not reversed) order. These are
// independent encoding vectors, not a balanced save/restore sequence.
pop(rax);
pop(rcx);
pop(rdx);
pop(rbx);
pop(rsp);
pop(rbp);
pop(rsi);
pop(rdi);
pop(r8);
pop(r9);
pop(r10);
pop(r11);
pop(r12);
pop(r13);
pop(r14);
pop(r15);
pop(r16);
pop(r17);
pop(r18);
pop(r19);
pop(r20);
pop(r21);
pop(r22);
pop(r23);
pop(r24);
pop(r25);
pop(r26);
pop(r27);
pop(r28);
pop(r29);
pop(r30);
pop(r31);
// movrs: 64-, 32-, 16- and 8-bit destination registers loading from memory.
movrs(rcx, ptr[rax]);
movrs(ecx, ptr[rax]);
movrs(cx, ptr[rax]);
movrs(cl, ptr[rax+rdx*4]);
// vmovrs{b,d,q,w}: xmm/ymm/zmm destinations with k1|T_z, memory source only.
vmovrsb(xm1|k1|T_z, ptr[rax+128]);
vmovrsb(ym1|k1|T_z, ptr[rax+128]);
vmovrsb(zm1|k1|T_z, ptr[rax+128]);
vmovrsd(xm1|k1|T_z, ptr[rax+128]);
vmovrsd(ym1|k1|T_z, ptr[rax+128]);
vmovrsd(zm1|k1|T_z, ptr[rax+128]);
vmovrsq(xm1|k1|T_z, ptr[rax+128]);
vmovrsq(ym1|k1|T_z, ptr[rax+128]);
vmovrsq(zm1|k1|T_z, ptr[rax+128]);
vmovrsw(xm1|k1|T_z, ptr[rax+128]);
vmovrsw(ym1|k1|T_z, ptr[rax+128]);
vmovrsw(zm1|k1|T_z, ptr[rax+128]);
+638
View File
@@ -0,0 +1,638 @@
// v4f(n)madd{ps,ss} and vp4dpwssd(s): every call takes a memory third
// operand; destinations vary between unmasked, k-masked, and k|T_z.
v4fmaddps(zmm1, zmm8, ptr [rdx + 64]);
v4fmaddss(xmm15, xmm8, ptr [rax + 64]);
v4fnmaddps(zmm5 | k5, zmm2, ptr [rcx + 0x80]);
v4fnmaddss(xmm31, xmm2, ptr [rsp + 0x80]);
vp4dpwssd(zmm23 | k7 | T_z, zmm1, ptr [rax + 64]);
vp4dpwssds(zmm10 | k4, zmm3, ptr [rsp + rax * 4 + 64]);
// vaesdec / vaesdeclast / vaesenc / vaesenclast at xmm, ymm and zmm width with
// a memory third operand. Some calls use register indices 20/30, others 1/2.
vaesdec(xmm20, xmm30, ptr [rcx + 64]);
vaesdec(ymm1, ymm2, ptr [rcx + 64]);
vaesdec(zmm1, zmm2, ptr [rcx + 64]);
vaesdeclast(xmm20, xmm30, ptr [rax + 64]);
vaesdeclast(ymm20, ymm30, ptr [rax + 64]);
vaesdeclast(zmm20, zmm30, ptr [rax + 64]);
vaesenc(xmm20, xmm30, ptr [rcx + 64]);
vaesenc(ymm1, ymm2, ptr [rcx + 64]);
vaesenc(zmm1, zmm2, ptr [rcx + 64]);
vaesenclast(xmm20, xmm30, ptr [rax + 64]);
vaesenclast(ymm20, ymm30, ptr [rax + 64]);
vaesenclast(zmm20, zmm30, ptr [rax + 64]);
// vpclmulqdq with immediate 3: each width is called with destination index 2
// and again with destination index 20.
vpclmulqdq(xmm2, xmm3, ptr [rax + 64], 3);
vpclmulqdq(ymm2, ymm3, ptr [rax + 64], 3);
vpclmulqdq(zmm2, zmm3, ptr [rax + 64], 3);
vpclmulqdq(xmm20, xmm3, ptr [rax + 64], 3);
vpclmulqdq(ymm20, ymm3, ptr [rax + 64], 3);
vpclmulqdq(zmm20, zmm3, ptr [rax + 64], 3);
// vpcompressb / vpcompressw: for each width, one call with a memory
// destination and one with a masked register destination (k5, k3|T_z, k2|T_z).
vpcompressb(ptr[rax + 64], xmm1);
vpcompressb(xmm30 | k5, xmm1);
vpcompressb(ptr[rax + 64], ymm1);
vpcompressb(ymm30 | k3 |T_z, ymm1);
vpcompressb(ptr[rax + 64], zmm1);
vpcompressb(zmm30 | k2 |T_z, zmm1);
vpcompressw(ptr[rax + 64], xmm1);
vpcompressw(xmm30 | k5, xmm1);
vpcompressw(ptr[rax + 64], ymm1);
vpcompressw(ymm30 | k3 |T_z, ymm1);
vpcompressw(ptr[rax + 64], zmm1);
vpcompressw(zmm30 | k2 |T_z, zmm1);
// vpshld{w,d,q}: immediate-count form (imm 5), xmm/ymm/zmm with k3|T_z and a
// memory third operand.
vpshldw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshldw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshldw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshldd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshldd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshldd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshldq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshldq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshldq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
// vpshldv{w,d,q}: same operands without the immediate.
vpshldvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshldvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshldvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshldvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshldvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshldvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshldvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshldvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshldvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
// vpshrd{w,d,q}: immediate-count form (imm 5), xmm/ymm/zmm with k3|T_z and a
// memory third operand.
vpshrdw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshrdw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshrdw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshrdd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshrdd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshrdd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
vpshrdq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40], 5);
vpshrdq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40], 5);
vpshrdq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40], 5);
// vpshrdv{w,d,q}: same operands without the immediate.
vpshrdvw(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshrdvw(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshrdvw(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshrdvd(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshrdvd(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshrdvd(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
vpshrdvq(xmm5|k3|T_z, xmm2, ptr [rax + 0x40]);
vpshrdvq(ymm5|k3|T_z, ymm2, ptr [rax + 0x40]);
vpshrdvq(zmm5|k3|T_z, zmm2, ptr [rax + 0x40]);
// ptr_b (broadcast) memory operand, d and q element sizes only.
vpshrdd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
vpshrdd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
vpshrdd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
vpshrdq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40], 5);
vpshrdq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40], 5);
vpshrdq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40], 5);
vpshrdvd(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
vpshrdvd(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
vpshrdvd(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
vpshrdvq(xmm5|k3|T_z, xmm2, ptr_b [rax + 0x40]);
vpshrdvq(ymm5|k3|T_z, ymm2, ptr_b [rax + 0x40]);
vpshrdvq(zmm5|k3|T_z, zmm2, ptr_b [rax + 0x40]);
// vpopcnt{b,w,d,q}: xmm/ymm/zmm destinations with k3|T_z and a memory source.
// The d and q element sizes additionally get a ptr_b (broadcast) source.
vpopcntb(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntb(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntb(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntw(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntw(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntw(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntd(xmm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntd(ymm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntd(zmm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntq(xmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntq(ymm5|k3|T_z, ptr [rax + 0x40]);
vpopcntq(zmm5|k3|T_z, ptr [rax + 0x40]);
vpopcntq(xmm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntq(ymm5|k3|T_z, ptr_b [rax + 0x40]);
vpopcntq(zmm5|k3|T_z, ptr_b [rax + 0x40]);
// vpdpbusd / vpdpbusds / vpdpwssd / vpdpwssds: xmm/ymm/zmm destinations with
// k3|T_z, register index 20 as the second operand, and both ptr and ptr_b
// memory third operands.
vpdpbusd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpbusd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpbusd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpbusd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpbusd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpbusd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
vpdpbusds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpbusds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpbusds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpbusds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpbusds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpbusds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
vpdpwssd(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpwssd(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpwssd(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpwssd(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpwssd(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpwssd(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
vpdpwssds(xmm5|k3|T_z, xmm20, ptr [rax + 0x40]);
vpdpwssds(ymm5|k3|T_z, ymm20, ptr [rax + 0x40]);
vpdpwssds(zmm5|k3|T_z, zmm20, ptr [rax + 0x40]);
vpdpwssds(xmm5|k3|T_z, xmm20, ptr_b [rax + 0x40]);
vpdpwssds(ymm5|k3|T_z, ymm20, ptr_b [rax + 0x40]);
vpdpwssds(zmm5|k3|T_z, zmm20, ptr_b [rax + 0x40]);
// vpexpandb / vpexpandw: xmm/ymm/zmm destinations with k3|T_z; register
// source (index 30) and memory source for each width.
vpexpandb(xmm5|k3|T_z, xmm30);
vpexpandb(ymm5|k3|T_z, ymm30);
vpexpandb(zmm5|k3|T_z, zmm30);
vpexpandb(xmm5|k3|T_z, ptr [rax + 0x40]);
vpexpandb(ymm5|k3|T_z, ptr [rax + 0x40]);
vpexpandb(zmm5|k3|T_z, ptr [rax + 0x40]);
vpexpandw(xmm5|k3|T_z, xmm30);
vpexpandw(ymm5|k3|T_z, ymm30);
vpexpandw(zmm5|k3|T_z, zmm30);
vpexpandw(xmm5|k3|T_z, ptr [rax + 0x40]);
vpexpandw(ymm5|k3|T_z, ptr [rax + 0x40]);
vpexpandw(zmm5|k3|T_z, ptr [rax + 0x40]);
// vpshufbitqmb: k1 destination with k2 mask, memory third operand at each
// vector width.
vpshufbitqmb(k1|k2, xmm2, ptr [rax + 0x40]);
vpshufbitqmb(k1|k2, ymm2, ptr [rax + 0x40]);
vpshufbitqmb(k1|k2, zmm2, ptr [rax + 0x40]);
// gf2p8affineinvqb: two-operand (non-v) form with register and memory source.
gf2p8affineinvqb(xmm1, xmm2, 3);
gf2p8affineinvqb(xmm1, ptr [rax + 0x40], 3);
// vgf2p8affineinvqb, low register indices: xmm/ymm with register and memory
// third operand, immediate 3.
vgf2p8affineinvqb(xmm1, xmm5, xmm2, 3);
vgf2p8affineinvqb(ymm1, ymm5, ymm2, 3);
vgf2p8affineinvqb(xmm1, xmm5, ptr [rax + 0x40], 3);
vgf2p8affineinvqb(ymm1, ymm5, ptr [rax + 0x40], 3);
// Register indices 30/31, then k1|T_z masked forms with ptr and ptr_b memory
// operands; xmm/ymm/zmm, immediate 5.
vgf2p8affineinvqb(xmm30, xmm31, xmm4, 5);
vgf2p8affineinvqb(ymm30, ymm31, ymm4, 5);
vgf2p8affineinvqb(zmm30, zmm31, zmm4, 5);
vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
vgf2p8affineinvqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
vgf2p8affineinvqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
vgf2p8affineinvqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
// gf2p8affineqb: two-operand (non-v) form with register and memory source.
gf2p8affineqb(xmm1, xmm2, 3);
gf2p8affineqb(xmm1, ptr [rax + 0x40], 3);
// vgf2p8affineqb, low register indices: xmm/ymm with register and memory
// third operand, immediate 3.
vgf2p8affineqb(xmm1, xmm5, xmm2, 3);
vgf2p8affineqb(ymm1, ymm5, ymm2, 3);
vgf2p8affineqb(xmm1, xmm5, ptr [rax + 0x40], 3);
vgf2p8affineqb(ymm1, ymm5, ptr [rax + 0x40], 3);
// Register indices 30/31, then k1|T_z masked forms with ptr and ptr_b memory
// operands; xmm/ymm/zmm, immediate 5.
vgf2p8affineqb(xmm30, xmm31, xmm4, 5);
vgf2p8affineqb(ymm30, ymm31, ymm4, 5);
vgf2p8affineqb(zmm30, zmm31, zmm4, 5);
vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40], 5);
vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40], 5);
vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40], 5);
vgf2p8affineqb(xmm30|k1|T_z, xmm5, ptr_b [rax + 0x40], 5);
vgf2p8affineqb(ymm30|k1|T_z, ymm5, ptr_b [rax + 0x40], 5);
vgf2p8affineqb(zmm30|k1|T_z, zmm5, ptr_b [rax + 0x40], 5);
// gf2p8mulb: two-operand (non-v) form with register and memory source.
gf2p8mulb(xmm1, xmm2);
gf2p8mulb(xmm1, ptr [rax + 0x40]);
// vgf2p8mulb, low register indices: xmm/ymm with register and memory third
// operand.
vgf2p8mulb(xmm1, xmm5, xmm2);
vgf2p8mulb(ymm1, ymm5, ymm2);
vgf2p8mulb(xmm1, xmm5, ptr [rax + 0x40]);
vgf2p8mulb(ymm1, ymm5, ptr [rax + 0x40]);
// Register indices 30/31, then k1|T_z masked forms with a ptr memory operand
// (no ptr_b calls for this mnemonic in this list).
vgf2p8mulb(xmm30, xmm31, xmm4);
vgf2p8mulb(ymm30, ymm31, ymm4);
vgf2p8mulb(zmm30, zmm31, zmm4);
vgf2p8mulb(xmm30|k1|T_z, xmm5, ptr [rax + 0x40]);
vgf2p8mulb(ymm30|k1|T_z, ymm5, ptr [rax + 0x40]);
vgf2p8mulb(zmm30|k1|T_z, zmm5, ptr [rax + 0x40]);
// vcvtne2ps2bf16: xmm/ymm/zmm with a k1 mask (ymm call also uses T_z) and a
// memory third operand.
vcvtne2ps2bf16(xmm0 | k1, xmm1, ptr [rax + 64]);
vcvtne2ps2bf16(ymm0 | k1 | T_z, ymm0, ptr [rax + 64]);
vcvtne2ps2bf16(zmm0 | k1, zmm1, ptr [rax + 64]);
// vcvtneps2bf16: memory sources with explicit xword/yword/zword sizes, plus
// one unsized ptr call with a ymm destination.
vcvtneps2bf16(xmm0, xword [rax + 64]);
vcvtneps2bf16(xmm0 | k1, yword [rax + 64]);
vcvtneps2bf16(ymm0 | k1, zword [rax + 64]);
vcvtneps2bf16(ymm0 | k1, ptr [rax + 64]);
// vdpbf16ps: xmm/ymm/zmm with k1 mask and a memory third operand.
vdpbf16ps(xmm0 | k1, xmm1, ptr [rax + 64]);
vdpbf16ps(ymm0 | k1, ymm1, ptr [rax + 64]);
vdpbf16ps(zmm0 | k1, zmm1, ptr [rax + 64]);
// vaddph: zmm/ymm/xmm with ptr and with ptr_b memory third operand.
vaddph(zmm0, zmm1, ptr[rax+64]);
vaddph(ymm0, ymm1, ptr[rax+64]);
vaddph(xmm0, xmm1, ptr[rax+64]);
vaddph(zmm0, zmm1, ptr_b[rax+64]);
vaddph(ymm0, ymm1, ptr_b[rax+64]);
vaddph(xmm0, xmm1, ptr_b[rax+64]);
// vaddsh: memory form, and register form with k5|T_z|T_rd_sae on the
// destination.
vaddsh(xmm0, xmm15, ptr[rax+64]);
vaddsh(xmm0|k5|T_z|T_rd_sae, xmm15, xmm3);
// vcmpph: mask-register destination, xmm/ymm/zmm second operand, ptr and
// ptr_b memory third operand, immediates 1..3.
vcmpph(k1, xm15, ptr[rax+64], 1);
vcmpph(k2, ym15, ptr[rax+64], 2);
vcmpph(k3, zm15, ptr[rax+64], 3);
vcmpph(k1, xm15, ptr_b[rax+64], 1);
vcmpph(k2, ym15, ptr_b[rax+64], 2);
vcmpph(k3, zm15, ptr_b[rax+64], 3);
// vcmpsh: memory form, and masked register form with T_sae on the source.
vcmpsh(k1, xm15, ptr[rax+64], 1);
vcmpsh(k3|k5, xmm1, xmm25|T_sae, 4);
// vcomish / vucomish: memory form, and register form with T_sae attached to
// the first operand.
vcomish(xmm1, ptr[rax+64]);
vcomish(xmm1|T_sae, xmm15);
vucomish(xmm1, ptr [rax+0x40]);
vucomish(xmm1|T_sae, xmm15);
// ph/sh fused multiply-add test vectors.
// Packed forms are listed per vector width with ptr and ptr_b (xbyak broadcast)
// memory sources; the zmm register form additionally carries a T_*_sae modifier.
// vfmaddsub213ph also exercises a k3-masked destination at xmm/ymm width.
vfmaddsub213ph(xmm1, xmm2, ptr [rax+0x40]);
vfmaddsub213ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmaddsub213ph(xmm1|k3, xmm2, xmm5);
vfmaddsub213ph(ymm1, ymm2, ptr [rax+0x40]);
vfmaddsub213ph(ymm1, ymm2, ptr_b[rax+0x40]);
vfmaddsub213ph(ymm1|k3, ymm2, ymm5);
vfmaddsub213ph(zmm1, zmm2, ptr [rax+0x40]);
vfmaddsub213ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmaddsub213ph(zmm1|T_ru_sae, zmm2, zmm5);
vfmsubadd132ph(xmm1, xmm2, ptr [rax+0x40]);
vfmsubadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmsubadd132ph(ymm1, ymm2, ptr [rax+0x40]);
vfmsubadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfmsubadd132ph(zmm1, zmm2, ptr [rax+0x40]);
vfmsubadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmsubadd132ph(zmm1|T_ru_sae, zmm2, zmm5);
vfmadd132ph(xmm1, xmm2, ptr [rax+0x40]);
vfmadd132ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmadd132ph(ymm1, ymm2, ptr [rax+0x40]);
vfmadd132ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfmadd132ph(zmm1, zmm2, ptr [rax+0x40]);
vfmadd132ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmadd132ph(zmm1|T_rd_sae, zmm2, zmm5);
vfmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
vfmsub231ph(xmm1, xmm2, ptr_b [rax+0x40]);
vfmsub231ph(ymm1, ymm2, ptr [rax+0x40]);
vfmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfmsub231ph(zmm1, zmm2, ptr [rax+0x40]);
vfmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
// vfnmsub231ph: reduced set (xmm ptr, ymm ptr_b, zmm ptr_b, zmm register with T_rd_sae).
vfnmsub231ph(xmm1, xmm2, ptr [rax+0x40]);
vfnmsub231ph(ymm1, ymm2, ptr_b [rax+0x40]);
vfnmsub231ph(zmm1, zmm2, ptr_b [rax+0x40]);
vfnmsub231ph(zmm1|T_rd_sae, zmm2, zmm5);
// Scalar sh forms: register source with k1|T_z|T_rd_sae, then a plain memory source.
vfmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
vfnmadd132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfnmadd132sh(xmm1, xmm2, ptr [rax+0x40]);
vfmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
vfnmsub132sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vfnmsub132sh(xmm1, xmm2, ptr [rax+0x40]);
// vfcmaddcph / vfmaddcph / vfcmulcph / vfmulcph: masked and unmasked destinations
// with ptr and ptr_b sources; vfcmaddcph also has a zmm register form with T_rd_sae.
vfcmaddcph(xmm1|k1|T_z, xmm2, ptr [rax+0x40]);
vfcmaddcph(ymm1|k1|T_z, ymm2, ptr [rax+0x40]);
vfcmaddcph(zmm1|k1, zmm2, ptr [rax+0x40]);
vfcmaddcph(zmm1|k1|T_rd_sae, zmm2, zmm5);
vfcmaddcph(xmm1|k1|T_z, xmm2, ptr_b [rax+0x40]);
vfcmaddcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
vfcmaddcph(zmm1|k1|T_z, zmm2, ptr_b [rax+0x40]);
vfmaddcph(xm1, xm2, ptr[rax+0x40]);
vfmaddcph(ym1|k1|T_z, ym2, ptr_b[rax+0x40]);
vfmaddcph(zm1, zm2, ptr_b[rax+0x40]);
vfcmulcph(xmm1, xmm2, ptr [rax+0x40]);
vfcmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
vfcmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
vfmulcph(xmm1, xmm2, ptr [rax+0x40]);
vfmulcph(ymm1|k1|T_z, ymm2, ptr_b [rax+0x40]);
vfmulcph(zmm1, zmm2, ptr_b [rax+0x40]);
// ph/sh unary and immediate-controlled test vectors:
// vrcp*, vrsqrt*, vsqrt*, vscalef*, vreduce*, vrndscale*, vfpclass*, vgetexp*, vgetmant*.
// Packed (ph) forms use ptr and ptr_b (xbyak broadcast) memory sources per width;
// scalar (sh) forms take an extra source register.
vrcpph(xmm1, ptr [rax+0x40]);
vrcpph(xmm1, ptr_b [rax+0x40]);
vrcpph(ymm1, ptr [rax+0x40]);
vrcpph(ymm1, ptr_b [rax+0x40]);
vrcpph(zmm1, ptr [rax+0x40]);
vrcpph(zmm1, ptr_b [rax+0x40]);
vrcpsh(xmm1, xmm3, ptr [rax+0x40]);
vrsqrtph(xmm1, ptr [rax+0x40]);
vrsqrtph(xmm1, ptr_b [rax+0x40]);
vrsqrtph(ymm2, ptr [rax+0x40]);
vrsqrtph(ymm2, ptr_b [rax+0x40]);
vrsqrtph(zmm2, ptr [rax+0x40]);
vrsqrtph(zmm2, ptr_b [rax+0x40]);
vrsqrtsh(xmm1|k5|T_z, xmm7, ptr [rax+0x40]);
// vsqrtph: every destination carries k4|T_z.
vsqrtph(xmm1|k4|T_z, ptr [rax+0x40]);
vsqrtph(xmm1|k4|T_z, ptr_b [rax+0x40]);
vsqrtph(ymm1|k4|T_z, ptr_b [rax+0x40]);
vsqrtph(zmm1|k4|T_z, ptr [rax+0x40]);
vsqrtph(zmm1|k4|T_z, ptr_b [rax+0x40]);
vsqrtsh(xmm1|k4|T_z, xmm5, ptr [rax+0x40]);
vsqrtsh(xmm1|k4|T_z|T_rd_sae, xmm5, xmm7);
vscalefph(xmm1, xmm5, ptr [rax+0x40]);
vscalefph(xmm1, xmm5, ptr_b [rax+0x40]);
vscalefph(ymm1, ymm5, ptr [rax+0x40]);
vscalefph(ymm1, ymm5, ptr_b [rax+0x40]);
vscalefph(zmm1, zmm5, ptr [rax+0x40]);
vscalefph(zmm1, zmm5, ptr_b [rax+0x40]);
vscalefph(zmm1|k1|T_z|T_rd_sae, zmm5, zmm7);
vscalefsh(xmm1, xmm5, ptr [rax+0x40]);
vscalefsh(xmm1|k1|T_z|T_rd_sae, xmm5, xmm7);
// vreduce* / vrndscale*: trailing immediate differs on every line (0x1..0x7);
// register forms use T_sae.
vreduceph(xmm1, ptr [rax+0x40], 0x1);
vreduceph(xmm1, ptr_b [rax+0x40], 0x2);
vreduceph(ymm1, ptr [rax+0x40], 0x3);
vreduceph(ymm1, ptr_b [rax+0x40], 0x4);
vreduceph(zmm1, ptr [rax+0x40], 0x5);
vreduceph(zmm1, ptr_b [rax+0x40], 0x6);
vreduceph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
vreducesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
vreducesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
vrndscaleph(xmm1, ptr [rax+0x40], 0x1);
vrndscaleph(xmm1, ptr_b [rax+0x40], 0x2);
vrndscaleph(ymm1, ptr [rax+0x40], 0x3);
vrndscaleph(ymm1, ptr_b [rax+0x40], 0x4);
vrndscaleph(zmm1, ptr [rax+0x40], 0x5);
vrndscaleph(zmm1, ptr_b [rax+0x40], 0x6);
vrndscaleph(zmm1|k1|T_z|T_sae, zmm5, 0x7);
vrndscalesh(xmm1, xmm3, ptr [rax+0x40], 0x1);
vrndscalesh(xmm1|k1|T_z|T_sae, xmm5, xmm4, 0x2);
// vfpclassph: the destination is always k1, so the source width is given
// explicitly via xword/yword/zword and their _b variants.
vfpclassph(k1, xword [rax+0x40], 0x1);
vfpclassph(k1, xword_b[rax+0x40], 0x2);
vfpclassph(k1, yword [rax+0x40], 0x3);
vfpclassph(k1, yword_b[rax+0x40], 0x4);
vfpclassph(k1, zword [rax+0x40], 0x5);
vfpclassph(k1, zword_b[rax+0x40], 0x6);
vfpclasssh(k1|k2, xmm3, 0x5);
vfpclasssh(k1|k2, ptr [rax+0x40], 0x5);
vgetexpph(xmm1, ptr [rax+0x40]);
vgetexpph(ymm1, ptr_b [rax+0x40]);
vgetexpph(zmm1, ptr [rax+0x40]);
vgetexpph(zmm1|k1|T_z|T_sae, zmm5);
vgetexpsh(xmm1, xmm5, ptr [rax+0x40]);
vgetexpsh(xmm1|k1|T_z|T_sae, xmm3, xmm5);
vgetmantph(xmm1, ptr [rax+0x40], 0x1);
vgetmantph(ymm1, ptr_b [rax+0x40], 0x2);
vgetmantph(zmm1, ptr [rax+0x40], 0x3);
vgetmantph(zmm1|k1|T_z|T_sae, zmm5, 0x4);
vgetmantsh(xmm1, xmm5, ptr [rax+0x40], 0x5);
vgetmantsh(xmm1|k1|T_z|T_sae, xmm3, xmm5, 0x6);
// vmovsh and scalar sh conversion test vectors.
// vmovsh: masked load, masked store (mask attached to the memory operand),
// and the three-register form.
vmovsh(xmm1|k1|T_z, ptr [rax+0x40]);
vmovsh(ptr [rax+0x40]|k1, xmm1);
vmovsh(xmm1|k2|T_z, xmm3, xmm5);
// sd/ss <-> sh: register form with mask plus T_rd_sae or T_sae, then memory form.
vcvtsd2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vcvtsd2sh(xmm1, xmm2, ptr [rax+0x40]);
vcvtsh2sd(xmm1|k1|T_z|T_sae, xmm2, xmm3);
vcvtsh2sd(xmm1, xmm2, ptr [rax+0x40]);
vcvtsh2ss(xmm1|k1|T_z|T_sae, xmm2, xmm3);
vcvtsh2ss(xmm1, xmm2, ptr [rax+0x40]);
vcvtss2sh(xmm1|k1|T_z|T_rd_sae, xmm2, xmm3);
vcvtss2sh(xmm1, xmm2, ptr [rax+0x40]);
// vcvtsh2si: 32-bit and 64-bit general-purpose destinations; T_rd_sae is
// attached to the destination register in the register-source forms.
vcvtsh2si(edx|T_rd_sae, xmm1);
vcvtsh2si(edx, ptr [rax+0x40]);
vcvtsh2si(rdx|T_rd_sae, xmm1);
vcvtsh2si(r8, ptr [rax+0x40]);
// Packed conversions from ph: vcvtph2dq, vcvtph2psx, vcvtph2udq, vcvttph2dq,
// vcvttph2udq, vcvtph2pd, vcvtph2qq, vcvtph2uqq, vcvttph2uqq.
// Every mnemonic follows the same nine-line shape: xmm, ymm and zmm destinations,
// each with a register, a ptr and a ptr_b (xbyak broadcast) source. The register
// source is narrower than or equal to the destination (xmm for ymm; ymm or xmm for zmm).
// The zmm register form carries T_rd_sae for the vcvtph2{dq,udq,qq,uqq} lines and
// T_sae for the vcvtt* lines as well as vcvtph2psx and vcvtph2pd.
vcvtph2dq(xmm1, xmm5);
vcvtph2dq(xmm1, ptr [rax+0x40]);
vcvtph2dq(xmm1, ptr_b [rax+0x40]);
vcvtph2dq(ymm1|k2|T_z, xmm5);
vcvtph2dq(ymm1, ptr [rax+0x40]);
vcvtph2dq(ymm1, ptr_b [rax+0x40]);
vcvtph2dq(zmm1|k5|T_z|T_rd_sae, ymm3);
vcvtph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2psx(xmm1, xmm5);
vcvtph2psx(xmm1, ptr [rax+0x40]);
vcvtph2psx(xmm1, ptr_b [rax+0x40]);
vcvtph2psx(ymm1|k2|T_z, xmm5);
vcvtph2psx(ymm1, ptr [rax+0x40]);
vcvtph2psx(ymm1, ptr_b [rax+0x40]);
vcvtph2psx(zmm1|k5|T_z|T_sae, ymm3);
vcvtph2psx(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2psx(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2udq(xmm1, xmm5);
vcvtph2udq(xmm1, ptr [rax+0x40]);
vcvtph2udq(xmm1, ptr_b [rax+0x40]);
vcvtph2udq(ymm1|k2|T_z, xmm5);
vcvtph2udq(ymm1, ptr [rax+0x40]);
vcvtph2udq(ymm1, ptr_b [rax+0x40]);
vcvtph2udq(zmm1|k5|T_z|T_rd_sae, ymm3);
vcvtph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvttph2dq(xmm1, xmm5);
vcvttph2dq(xmm1, ptr [rax+0x40]);
vcvttph2dq(xmm1, ptr_b [rax+0x40]);
vcvttph2dq(ymm1|k2|T_z, xmm5);
vcvttph2dq(ymm1, ptr [rax+0x40]);
vcvttph2dq(ymm1, ptr_b [rax+0x40]);
vcvttph2dq(zmm1|k5|T_z|T_sae, ymm3);
vcvttph2dq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2dq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvttph2udq(xmm1, xmm5);
vcvttph2udq(xmm1, ptr [rax+0x40]);
vcvttph2udq(xmm1, ptr_b [rax+0x40]);
vcvttph2udq(ymm1|k2|T_z, xmm5);
vcvttph2udq(ymm1, ptr [rax+0x40]);
vcvttph2udq(ymm1, ptr_b [rax+0x40]);
vcvttph2udq(zmm1|k5|T_z|T_sae, ymm3);
vcvttph2udq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2udq(zmm1|k5|T_z, ptr_b [rax+0x40]);
// From here on the zmm register form takes an xmm source instead of ymm.
vcvtph2pd(xmm1, xmm5);
vcvtph2pd(xmm1, ptr [rax+0x40]);
vcvtph2pd(xmm1, ptr_b [rax+0x40]);
vcvtph2pd(ymm1|k2|T_z, xmm5);
vcvtph2pd(ymm1, ptr [rax+0x40]);
vcvtph2pd(ymm1, ptr_b [rax+0x40]);
vcvtph2pd(zmm1|k5|T_z|T_sae, xmm3);
vcvtph2pd(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2pd(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2qq(xmm1, xmm5);
vcvtph2qq(xmm1, ptr [rax+0x40]);
vcvtph2qq(xmm1, ptr_b [rax+0x40]);
vcvtph2qq(ymm1|k2|T_z, xmm5);
vcvtph2qq(ymm1, ptr [rax+0x40]);
vcvtph2qq(ymm1, ptr_b [rax+0x40]);
vcvtph2qq(zmm1|k5|T_z|T_rd_sae, xmm3);
vcvtph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvtph2uqq(xmm1, xmm5);
vcvtph2uqq(xmm1, ptr [rax+0x40]);
vcvtph2uqq(xmm1, ptr_b [rax+0x40]);
vcvtph2uqq(ymm1|k2|T_z, xmm5);
vcvtph2uqq(ymm1, ptr [rax+0x40]);
vcvtph2uqq(ymm1, ptr_b [rax+0x40]);
vcvtph2uqq(zmm1|k5|T_z|T_rd_sae, xmm3);
vcvtph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvtph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
vcvttph2uqq(xmm1, xmm5);
vcvttph2uqq(xmm1, ptr [rax+0x40]);
vcvttph2uqq(xmm1, ptr_b [rax+0x40]);
vcvttph2uqq(ymm1|k2|T_z, xmm5);
vcvttph2uqq(ymm1, ptr [rax+0x40]);
vcvttph2uqq(ymm1, ptr_b [rax+0x40]);
vcvttph2uqq(zmm1|k5|T_z|T_sae, xmm3);
vcvttph2uqq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2uqq(zmm1|k5|T_z, ptr_b [rax+0x40]);
// Packed conversions into ph: vcvtdq2ph, vcvtps2phx, vcvtudq2ph, vcvtpd2ph,
// vcvtqq2ph, vcvtuqq2ph.
// The same xmm1 destination is paired with memory sources of different widths,
// so those lines spell the size out with xword/yword/zword (and the _b variants)
// instead of plain ptr. The zmm register source form carries mask, T_z and T_rd_sae.
// dq/ps/udq sources: xmm destination for xword/yword, ymm destination for zmm/ptr.
vcvtdq2ph(xmm1, xmm5);
vcvtdq2ph(xmm1, xword [rax+0x40]);
vcvtdq2ph(xmm1, xword_b [rax+0x40]);
vcvtdq2ph(xmm1, yword [rax+0x40]);
vcvtdq2ph(xmm1, yword_b [rax+0x40]);
vcvtdq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
vcvtdq2ph(ymm1, ptr [rax+0x40]);
vcvtdq2ph(ymm1, ptr_b [rax+0x40]);
vcvtps2phx(xmm1, xmm5);
vcvtps2phx(xmm1, xword [rax+0x40]);
vcvtps2phx(xmm1, xword_b [rax+0x40]);
vcvtps2phx(xmm1, yword [rax+0x40]);
vcvtps2phx(xmm1, yword_b [rax+0x40]);
vcvtps2phx(ymm1|k2|T_z|T_rd_sae, zmm5);
vcvtps2phx(ymm1, ptr [rax+0x40]);
vcvtps2phx(ymm1, ptr_b [rax+0x40]);
vcvtudq2ph(xmm1, xmm5);
vcvtudq2ph(xmm1, xword [rax+0x40]);
vcvtudq2ph(xmm1, xword_b [rax+0x40]);
vcvtudq2ph(xmm1, yword [rax+0x40]);
vcvtudq2ph(xmm1, yword_b [rax+0x40]);
vcvtudq2ph(ymm1|k2|T_z|T_rd_sae, zmm5);
vcvtudq2ph(ymm1, ptr [rax+0x40]);
vcvtudq2ph(ymm1, ptr_b [rax+0x40]);
// pd/qq/uqq sources: the destination is xmm1 for every source width, including zmm/zword.
vcvtpd2ph(xmm1, xmm5);
vcvtpd2ph(xmm1, ymm5);
vcvtpd2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
vcvtpd2ph(xmm1, xword [rax+0x40]);
vcvtpd2ph(xmm1, xword_b [rax+0x40]);
vcvtpd2ph(xmm1, yword [rax+0x40]);
vcvtpd2ph(xmm1, yword_b [rax+0x40]);
vcvtpd2ph(xmm1, zword [rax+0x40]);
vcvtpd2ph(xmm1, zword_b [rax+0x40]);
vcvtqq2ph(xmm1, xmm5);
vcvtqq2ph(xmm1, ymm5);
vcvtqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
vcvtqq2ph(xmm1, xword [rax+0x40]);
vcvtqq2ph(xmm1, xword_b [rax+0x40]);
vcvtqq2ph(xmm1, yword [rax+0x40]);
vcvtqq2ph(xmm1, yword_b [rax+0x40]);
vcvtqq2ph(xmm1, zword [rax+0x40]);
vcvtqq2ph(xmm1, zword_b [rax+0x40]);
vcvtuqq2ph(xmm1, xmm5);
vcvtuqq2ph(xmm1, ymm5);
vcvtuqq2ph(xmm1|k2|T_z|T_rd_sae, zmm5);
vcvtuqq2ph(xmm1, xword [rax+0x40]);
vcvtuqq2ph(xmm1, xword_b [rax+0x40]);
vcvtuqq2ph(xmm1, yword [rax+0x40]);
vcvtuqq2ph(xmm1, yword_b [rax+0x40]);
vcvtuqq2ph(xmm1, zword [rax+0x40]);
vcvtuqq2ph(xmm1, zword_b [rax+0x40]);
// ph <-> word conversions: vcvtph2uw, vcvtph2w, vcvttph2uw, vcvttph2w, vcvtuw2ph, vcvtw2ph.
// Source and destination have the same vector width on every line. Each mnemonic
// has an xmm register form, ptr/ptr_b (xbyak broadcast) memory forms per width,
// and a masked zmm register form with T_rd_sae (T_sae for the vcvtt* mnemonics).
vcvtph2uw(xmm1, xmm5);
vcvtph2uw(xmm1, ptr [rax+0x40]);
vcvtph2uw(xmm1, ptr_b [rax+0x40]);
vcvtph2uw(ymm1, ptr [rax+0x40]);
vcvtph2uw(ymm1, ptr_b [rax+0x40]);
vcvtph2uw(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtph2uw(zmm1, ptr [rax+0x40]);
vcvtph2uw(zmm1, ptr_b [rax+0x40]);
vcvtph2w(xmm1, xmm5);
vcvtph2w(xmm1, ptr [rax+0x40]);
vcvtph2w(xmm1, ptr_b [rax+0x40]);
vcvtph2w(ymm1, ptr [rax+0x40]);
vcvtph2w(ymm1, ptr_b [rax+0x40]);
vcvtph2w(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtph2w(zmm1, ptr [rax+0x40]);
vcvtph2w(zmm1, ptr_b [rax+0x40]);
vcvttph2uw(xmm1, xmm5);
vcvttph2uw(xmm1, ptr [rax+0x40]);
vcvttph2uw(xmm1, ptr_b [rax+0x40]);
vcvttph2uw(ymm1, ptr [rax+0x40]);
vcvttph2uw(ymm1, ptr_b [rax+0x40]);
vcvttph2uw(zmm1|k2|T_z|T_sae, zmm5);
vcvttph2uw(zmm1, ptr [rax+0x40]);
vcvttph2uw(zmm1, ptr_b [rax+0x40]);
vcvttph2w(xmm1, xmm5);
vcvttph2w(xmm1, ptr [rax+0x40]);
vcvttph2w(xmm1, ptr_b [rax+0x40]);
vcvttph2w(ymm1, ptr [rax+0x40]);
vcvttph2w(ymm1, ptr_b [rax+0x40]);
vcvttph2w(zmm1|k2|T_z|T_sae, zmm5);
vcvttph2w(zmm1, ptr [rax+0x40]);
vcvttph2w(zmm1, ptr_b [rax+0x40]);
vcvtuw2ph(xmm1, xmm5);
vcvtuw2ph(xmm1, ptr [rax+0x40]);
vcvtuw2ph(xmm1, ptr_b [rax+0x40]);
vcvtuw2ph(ymm1, ptr [rax+0x40]);
vcvtuw2ph(ymm1, ptr_b [rax+0x40]);
vcvtuw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtuw2ph(zmm1, ptr [rax+0x40]);
vcvtuw2ph(zmm1, ptr_b [rax+0x40]);
vcvtw2ph(xmm1, xmm5);
vcvtw2ph(xmm1, ptr [rax+0x40]);
vcvtw2ph(xmm1, ptr_b [rax+0x40]);
vcvtw2ph(ymm1, ptr [rax+0x40]);
vcvtw2ph(ymm1, ptr_b [rax+0x40]);
vcvtw2ph(zmm1|k2|T_z|T_rd_sae, zmm5);
vcvtw2ph(zmm1, ptr [rax+0x40]);
vcvtw2ph(zmm1, ptr_b [rax+0x40]);
// vcvtps2ph, scalar sh <-> integer conversions, and vcvttph2qq.
// vcvtps2ph: destination is a register or memory, the source is a register, and the
// trailing immediate differs on every line (0x1..0xa). Unmasked xmm/ymm sources
// come first, then masked variants, then zmm sources.
vcvtps2ph(xmm1, xmm2, 0x1);
vcvtps2ph(ptr [rax+0x40], xmm2, 0x2);
vcvtps2ph(xmm1, ymm2, 0x3);
vcvtps2ph(ptr [rax+0x40], ymm2, 0x4);
vcvtps2ph(xmm1|k1|T_z, xmm2, 0x5);
vcvtps2ph(ptr [rax+0x40]|k1, xmm3, 0x6);
vcvtps2ph(xmm1|k2, ymm4, 0x7);
vcvtps2ph(ptr [rax+0x40]|k2, ymm5, 0x8);
vcvtps2ph(ymm1|k2|T_sae, zmm5, 0x9);
vcvtps2ph(ptr [rax+0x40]|k5, zmm4, 0xa);
// sh -> integer: 32-bit and 64-bit general-purpose destinations; the register-source
// forms attach T_rd_sae (T_sae for the vcvtt* mnemonics) to the destination.
vcvtsh2usi(ecx|T_rd_sae, xmm1);
vcvtsh2usi(eax, ptr [rax+0x40]);
vcvtsh2usi(r9|T_rd_sae, xmm1);
vcvtsh2usi(r13, ptr [rax+0x40]);
vcvttsh2si(ecx|T_sae, xmm1);
vcvttsh2si(eax, ptr [rax+0x40]);
vcvttsh2si(r9|T_sae, xmm1);
vcvttsh2si(r13, ptr [rax+0x40]);
vcvttsh2usi(ecx|T_sae, xmm1);
vcvttsh2usi(eax, ptr [rax+0x40]);
vcvttsh2usi(r9|T_sae, xmm1);
vcvttsh2usi(r13, ptr [rax+0x40]);
// vcvttph2qq: xmm/ymm/zmm destinations with xmm register, ptr and ptr_b sources.
vcvttph2qq(xmm1, xmm5);
vcvttph2qq(xmm1, ptr [rax+0x40]);
vcvttph2qq(xmm1, ptr_b [rax+0x40]);
vcvttph2qq(ymm1|k2|T_z, xmm5);
vcvttph2qq(ymm1, ptr [rax+0x40]);
vcvttph2qq(ymm1, ptr_b [rax+0x40]);
vcvttph2qq(zmm1|k5|T_z|T_sae, xmm3);
vcvttph2qq(zmm1|k5|T_z, ptr [rax+0x40]);
vcvttph2qq(zmm1|k5|T_z, ptr_b [rax+0x40]);
// integer -> sh: 32-bit/64-bit register sources with T_rd_sae, and dword/qword
// memory sources whose size is written explicitly.
vcvtsi2sh(xmm1|T_rd_sae, xmm2, eax);
vcvtsi2sh(xmm1, xmm2, dword [rax+0x40]);
vcvtsi2sh(xmm1|T_rd_sae, xmm2, r9);
vcvtsi2sh(xmm1, xmm2, qword [rax+0x40]);
vcvtusi2sh(xmm1|T_rd_sae, xmm2, eax);
vcvtusi2sh(xmm1, xmm2, dword [rax+0x40]);
vcvtusi2sh(xmm1|T_rd_sae, xmm2, r9);
vcvtusi2sh(xmm1, xmm2, qword [rax+0x40]);
// aadd / aand / aor / axor and cmp*xadd test vectors.
// Each of aadd/aand/aor/axor takes a memory destination and a register source and is
// listed three times: 64-bit base with a 32-bit source, 32-bit base (eax) with a
// 32-bit source, and 64-bit base with a 64-bit source (r10).
aadd(ptr[rax], ecx);
aadd(ptr[eax], ecx);
aadd(ptr[rax], r10);
aand(ptr[rax], ecx);
aand(ptr[eax], ecx);
aand(ptr[rax], r10);
aor(ptr[rax], ecx);
aor(ptr[eax], ecx);
aor(ptr[rax], r10);
axor(ptr[rax], ecx);
axor(ptr[eax], ecx);
axor(ptr[rax], r10);
// cmp*xadd: one line per mnemonic, all with the same operands
// (base + scaled-index memory destination, then rcx and rdx).
cmpbexadd(ptr[rax+r10*4], rcx, rdx);
cmpbxadd(ptr[rax+r10*4], rcx, rdx);
cmplexadd(ptr[rax+r10*4], rcx, rdx);
cmplxadd(ptr[rax+r10*4], rcx, rdx);
cmpnbexadd(ptr[rax+r10*4], rcx, rdx);
cmpnbxadd(ptr[rax+r10*4], rcx, rdx);
cmpnlexadd(ptr[rax+r10*4], rcx, rdx);
cmpnlxadd(ptr[rax+r10*4], rcx, rdx);
cmpnoxadd(ptr[rax+r10*4], rcx, rdx);
cmpnpxadd(ptr[rax+r10*4], rcx, rdx);
cmpnsxadd(ptr[rax+r10*4], rcx, rdx);
cmpnzxadd(ptr[rax+r10*4], rcx, rdx);
cmpoxadd(ptr[rax+r10*4], rcx, rdx);
cmppxadd(ptr[rax+r10*4], rcx, rdx);
cmpsxadd(ptr[rax+r10*4], rcx, rdx);
cmpzxadd(ptr[rax+r10*4], rcx, rdx);
// vsha512* / vsm3* / vsm4* test vectors.
// vsha512*: register-only forms; vsha512msg1 and vsha512rnds2 mix ymm destinations
// with an xmm last operand.
vsha512msg1(ymm3, xmm5);
vsha512msg2(ymm9, ymm10);
vsha512rnds2(ymm1, ymm3, xmm2);
// vsm3*: xmm operands with a register or memory last source; vsm3rnds2 adds an immediate.
vsm3msg1(xmm1, xmm2, xmm3);
vsm3msg1(xmm1, xmm2, ptr [rax]);
vsm3msg2(xmm5, xmm7, xmm3);
vsm3msg2(xmm5, xmm6, ptr [rax]);
vsm3rnds2(xmm5, xmm7, xmm3, 0x12);
vsm3rnds2(xmm5, xmm7, ptr [rcx], 0x34);
// vsm4*: xmm operands with a register or memory last source.
vsm4key4(xmm1, xmm2, xmm3);
vsm4key4(xmm1, xmm2, ptr [rdx]);
vsm4rnds4(xmm1, xmm2, xmm3);
vsm4rnds4(xmm5, xmm6, ptr [rcx+rax*4]);
// vpdpb* / vpdpw* test vectors.
// Every mnemonic is exercised twice: an xmm form with a register source and a
// ymm form with a memory source. No masks or zmm registers are used here.
vpdpbssd(xmm1, xmm2, xmm3);
vpdpbssd(ymm1, ymm2, ptr [rax]);
vpdpbssds(xmm1, xmm2, xmm3);
vpdpbssds(ymm1, ymm2, ptr [rax]);
vpdpbsud(xmm1, xmm2, xmm3);
vpdpbsud(ymm1, ymm2, ptr [rax]);
vpdpbsuds(xmm1, xmm2, xmm3);
vpdpbsuds(ymm1, ymm2, ptr [rax]);
vpdpbuud(xmm1, xmm2, xmm3);
vpdpbuud(ymm1, ymm2, ptr [rax]);
vpdpbuuds(xmm1, xmm2, xmm3);
vpdpbuuds(ymm1, ymm2, ptr [rax]);
vpdpwsud(xmm1, xmm2, xmm3);
vpdpwsud(ymm1, ymm2, ptr [rax]);
vpdpwsuds(xmm1, xmm2, xmm3);
vpdpwsuds(ymm1, ymm2, ptr [rax]);
vpdpwusd(xmm1, xmm2, xmm3);
vpdpwusd(ymm1, ymm2, ptr [rax]);
vpdpwusds(xmm1, xmm2, xmm3);
vpdpwusds(ymm1, ymm2, ptr [rax]);
vpdpwuud(xmm1, xmm2, xmm3);
vpdpwuud(ymm1, ymm2, ptr [rax]);
vpdpwuuds(xmm1, xmm2, xmm3);
vpdpwuuds(ymm1, ymm2, ptr [rax]);
+294
View File
@@ -0,0 +1,294 @@
// vcvtbf162ibs: xmm/ymm/zmm, each with register, ptr and ptr_b (xbyak broadcast)
// sources. None of the four bf16 groups uses a T_*sae modifier.
vcvtbf162ibs(xm1, xm2);
vcvtbf162ibs(xm1, ptr[rax+128]);
vcvtbf162ibs(xm1, ptr_b[rax+128]);
vcvtbf162ibs(ym1, ym2);
vcvtbf162ibs(ym1, ptr[rax+128]);
vcvtbf162ibs(ym1, ptr_b[rax+128]);
vcvtbf162ibs(zm1, zm2);
vcvtbf162ibs(zm1, ptr[rax+128]);
vcvtbf162ibs(zm1, ptr_b[rax+128]);
// vcvtbf162iubs: same nine operand forms.
vcvtbf162iubs(xm1, xm2);
vcvtbf162iubs(xm1, ptr[rax+128]);
vcvtbf162iubs(xm1, ptr_b[rax+128]);
vcvtbf162iubs(ym1, ym2);
vcvtbf162iubs(ym1, ptr[rax+128]);
vcvtbf162iubs(ym1, ptr_b[rax+128]);
vcvtbf162iubs(zm1, zm2);
vcvtbf162iubs(zm1, ptr[rax+128]);
vcvtbf162iubs(zm1, ptr_b[rax+128]);
// vcvttbf162ibs: same nine operand forms.
vcvttbf162ibs(xm1, xm2);
vcvttbf162ibs(xm1, ptr[rax+128]);
vcvttbf162ibs(xm1, ptr_b[rax+128]);
vcvttbf162ibs(ym1, ym2);
vcvttbf162ibs(ym1, ptr[rax+128]);
vcvttbf162ibs(ym1, ptr_b[rax+128]);
vcvttbf162ibs(zm1, zm2);
vcvttbf162ibs(zm1, ptr[rax+128]);
vcvttbf162ibs(zm1, ptr_b[rax+128]);
// vcvttbf162iubs: same nine operand forms.
vcvttbf162iubs(xm1, xm2);
vcvttbf162iubs(xm1, ptr[rax+128]);
vcvttbf162iubs(xm1, ptr_b[rax+128]);
vcvttbf162iubs(ym1, ym2);
vcvttbf162iubs(ym1, ptr[rax+128]);
vcvttbf162iubs(ym1, ptr_b[rax+128]);
vcvttbf162iubs(zm1, zm2);
vcvttbf162iubs(zm1, ptr[rax+128]);
vcvttbf162iubs(zm1, ptr_b[rax+128]);
// vcvttpd2qqs: xmm/ymm/zmm with register, ptr and ptr_b (xbyak broadcast) sources,
// plus a zmm register form with T_sae attached to the source operand.
vcvttpd2qqs(xm1, xm2);
vcvttpd2qqs(xm1, ptr[rax+128]);
vcvttpd2qqs(xm1, ptr_b[rax+128]);
vcvttpd2qqs(ym1, ym2);
vcvttpd2qqs(ym1, ptr[rax+128]);
vcvttpd2qqs(ym1, ptr_b[rax+128]);
vcvttpd2qqs(zm1, zm2);
vcvttpd2qqs(zm1, zm2|T_sae);
vcvttpd2qqs(zm1, ptr[rax+128]);
vcvttpd2qqs(zm1, ptr_b[rax+128]);
// vcvttpd2uqqs: same ten operand forms.
vcvttpd2uqqs(xm1, xm2);
vcvttpd2uqqs(xm1, ptr[rax+128]);
vcvttpd2uqqs(xm1, ptr_b[rax+128]);
vcvttpd2uqqs(ym1, ym2);
vcvttpd2uqqs(ym1, ptr[rax+128]);
vcvttpd2uqqs(ym1, ptr_b[rax+128]);
vcvttpd2uqqs(zm1, zm2);
vcvttpd2uqqs(zm1, zm2|T_sae);
vcvttpd2uqqs(zm1, ptr[rax+128]);
vcvttpd2uqqs(zm1, ptr_b[rax+128]);
// vcvtph2ibs: xmm/ymm/zmm with register, ptr and ptr_b (xbyak broadcast) sources,
// plus a zmm register form with T_ru_sae attached to the source operand.
vcvtph2ibs(xm1, xm2);
vcvtph2ibs(xm1, ptr[rax+128]);
vcvtph2ibs(xm1, ptr_b[rax+128]);
vcvtph2ibs(ym1, ym2);
vcvtph2ibs(ym1, ptr[rax+128]);
vcvtph2ibs(ym1, ptr_b[rax+128]);
vcvtph2ibs(zm1, zm2);
vcvtph2ibs(zm1, zm2|T_ru_sae);
vcvtph2ibs(zm1, ptr[rax+128]);
vcvtph2ibs(zm1, ptr_b[rax+128]);
// vcvtph2iubs: same ten operand forms.
vcvtph2iubs(xm1, xm2);
vcvtph2iubs(xm1, ptr[rax+128]);
vcvtph2iubs(xm1, ptr_b[rax+128]);
vcvtph2iubs(ym1, ym2);
vcvtph2iubs(ym1, ptr[rax+128]);
vcvtph2iubs(ym1, ptr_b[rax+128]);
vcvtph2iubs(zm1, zm2);
vcvtph2iubs(zm1, zm2|T_ru_sae);
vcvtph2iubs(zm1, ptr[rax+128]);
vcvtph2iubs(zm1, ptr_b[rax+128]);
// vcvttph2ibs: same ten operand forms.
// NOTE(review): this vcvtt* mnemonic uses T_ru_sae, whereas vcvttpd2qqs and
// vcvttps2dqs elsewhere in this list use T_sae - confirm which modifier is intended.
vcvttph2ibs(xm1, xm2);
vcvttph2ibs(xm1, ptr[rax+128]);
vcvttph2ibs(xm1, ptr_b[rax+128]);
vcvttph2ibs(ym1, ym2);
vcvttph2ibs(ym1, ptr[rax+128]);
vcvttph2ibs(ym1, ptr_b[rax+128]);
vcvttph2ibs(zm1, zm2);
vcvttph2ibs(zm1, zm2|T_ru_sae);
vcvttph2ibs(zm1, ptr[rax+128]);
vcvttph2ibs(zm1, ptr_b[rax+128]);
// vcvttph2iubs: same ten operand forms (same T_ru_sae question as vcvttph2ibs).
vcvttph2iubs(xm1, xm2);
vcvttph2iubs(xm1, ptr[rax+128]);
vcvttph2iubs(xm1, ptr_b[rax+128]);
vcvttph2iubs(ym1, ym2);
vcvttph2iubs(ym1, ptr[rax+128]);
vcvttph2iubs(ym1, ptr_b[rax+128]);
vcvttph2iubs(zm1, zm2);
vcvttph2iubs(zm1, zm2|T_ru_sae);
vcvttph2iubs(zm1, ptr[rax+128]);
vcvttph2iubs(zm1, ptr_b[rax+128]);
// vcvttps2dqs: xmm/ymm/zmm with register, ptr and ptr_b (xbyak broadcast) sources,
// plus a zmm register form with T_sae attached to the source operand.
vcvttps2dqs(xm1, xm2);
vcvttps2dqs(xm1, ptr[rax+128]);
vcvttps2dqs(xm1, ptr_b[rax+128]);
vcvttps2dqs(ym1, ym2);
vcvttps2dqs(ym1, ptr[rax+128]);
vcvttps2dqs(ym1, ptr_b[rax+128]);
vcvttps2dqs(zm1, zm2);
vcvttps2dqs(zm1, zm2|T_sae);
vcvttps2dqs(zm1, ptr[rax+128]);
vcvttps2dqs(zm1, ptr_b[rax+128]);
// vcvtps2ibs: same ten operand forms, with T_ru_sae on the zmm register form.
vcvtps2ibs(xm1, xm2);
vcvtps2ibs(xm1, ptr[rax+128]);
vcvtps2ibs(xm1, ptr_b[rax+128]);
vcvtps2ibs(ym1, ym2);
vcvtps2ibs(ym1, ptr[rax+128]);
vcvtps2ibs(ym1, ptr_b[rax+128]);
vcvtps2ibs(zm1, zm2);
vcvtps2ibs(zm1, zm2|T_ru_sae);
vcvtps2ibs(zm1, ptr[rax+128]);
vcvtps2ibs(zm1, ptr_b[rax+128]);
// vcvtps2iubs: same ten operand forms, with T_ru_sae on the zmm register form.
vcvtps2iubs(xm1, xm2);
vcvtps2iubs(xm1, ptr[rax+128]);
vcvtps2iubs(xm1, ptr_b[rax+128]);
vcvtps2iubs(ym1, ym2);
vcvtps2iubs(ym1, ptr[rax+128]);
vcvtps2iubs(ym1, ptr_b[rax+128]);
vcvtps2iubs(zm1, zm2);
vcvtps2iubs(zm1, zm2|T_ru_sae);
vcvtps2iubs(zm1, ptr[rax+128]);
vcvtps2iubs(zm1, ptr_b[rax+128]);
// vcvttps2ibs: same ten operand forms.
// NOTE(review): this vcvtt* mnemonic uses T_ru_sae, whereas vcvttps2dqs and
// vcvttps2udqs in this same group use T_sae - confirm which modifier is intended.
vcvttps2ibs(xm1, xm2);
vcvttps2ibs(xm1, ptr[rax+128]);
vcvttps2ibs(xm1, ptr_b[rax+128]);
vcvttps2ibs(ym1, ym2);
vcvttps2ibs(ym1, ptr[rax+128]);
vcvttps2ibs(ym1, ptr_b[rax+128]);
vcvttps2ibs(zm1, zm2);
vcvttps2ibs(zm1, zm2|T_ru_sae);
vcvttps2ibs(zm1, ptr[rax+128]);
vcvttps2ibs(zm1, ptr_b[rax+128]);
// vcvttps2iubs: same ten operand forms (same T_ru_sae question as vcvttps2ibs).
vcvttps2iubs(xm1, xm2);
vcvttps2iubs(xm1, ptr[rax+128]);
vcvttps2iubs(xm1, ptr_b[rax+128]);
vcvttps2iubs(ym1, ym2);
vcvttps2iubs(ym1, ptr[rax+128]);
vcvttps2iubs(ym1, ptr_b[rax+128]);
vcvttps2iubs(zm1, zm2);
vcvttps2iubs(zm1, zm2|T_ru_sae);
vcvttps2iubs(zm1, ptr[rax+128]);
vcvttps2iubs(zm1, ptr_b[rax+128]);
// vcvttps2udqs: same ten operand forms, with T_sae on the zmm register form.
vcvttps2udqs(xm1, xm2);
vcvttps2udqs(xm1, ptr[rax+128]);
vcvttps2udqs(xm1, ptr_b[rax+128]);
vcvttps2udqs(ym1, ym2);
vcvttps2udqs(ym1, ptr[rax+128]);
vcvttps2udqs(ym1, ptr_b[rax+128]);
vcvttps2udqs(zm1, zm2);
vcvttps2udqs(zm1, zm2|T_sae);
vcvttps2udqs(zm1, ptr[rax+128]);
vcvttps2udqs(zm1, ptr_b[rax+128]);
// vcvttpd2dqs: every destination carries k1|T_z. The xm1 destination is paired with
// both xmm- and ymm-sized sources, so memory operands spell the size out with
// xword/yword/zword (and the _b variants); zmm-sized sources use a ym1 destination.
vcvttpd2dqs(xm1|k1|T_z, xm2);
vcvttpd2dqs(xm1|k1|T_z, xword [rax+128]);
vcvttpd2dqs(xm1|k1|T_z, xword_b[rax+128]);
vcvttpd2dqs(xm1|k1|T_z, ym2);
vcvttpd2dqs(xm1|k1|T_z, yword [rax+128]);
vcvttpd2dqs(xm1|k1|T_z, yword_b[rax+128]);
vcvttpd2dqs(ym1|k1|T_z, zm2);
vcvttpd2dqs(ym1|k1|T_z, zm2|T_sae);
vcvttpd2dqs(ym1|k1|T_z, zword [rax+128]);
vcvttpd2dqs(ym1|k1|T_z, zword_b[rax+128]);
// vcvttpd2udqs: same ten operand forms.
vcvttpd2udqs(xm1|k1|T_z, xm2);
vcvttpd2udqs(xm1|k1|T_z, xword [rax+128]);
vcvttpd2udqs(xm1|k1|T_z, xword_b[rax+128]);
vcvttpd2udqs(xm1|k1|T_z, ym2);
vcvttpd2udqs(xm1|k1|T_z, yword [rax+128]);
vcvttpd2udqs(xm1|k1|T_z, yword_b[rax+128]);
vcvttpd2udqs(ym1|k1|T_z, zm2);
vcvttpd2udqs(ym1|k1|T_z, zm2|T_sae);
vcvttpd2udqs(ym1|k1|T_z, zword [rax+128]);
vcvttpd2udqs(ym1|k1|T_z, zword_b[rax+128]);
// vcvttps2qqs: the register source is narrower than or equal to the destination
// (xm2 for xm1/ym1, ym2 for zm1). Here T_sae is attached to the destination
// operand, and the zm1 register form appears unmasked, masked, and masked with T_sae.
vcvttps2qqs(xm1|k1|T_z, xm2);
vcvttps2qqs(xm1|k1|T_z, ptr [rax+128]);
vcvttps2qqs(xm1|k1|T_z, ptr_b[rax+128]);
vcvttps2qqs(ym1|k1|T_z, xm2);
vcvttps2qqs(ym1|k1|T_z, ptr [rax+128]);
vcvttps2qqs(ym1|k1|T_z, ptr_b[rax+128]);
vcvttps2qqs(zm1, ym2);
vcvttps2qqs(zm1|k1|T_z, ym2);
vcvttps2qqs(zm1|k1|T_z|T_sae, ym2);
vcvttps2qqs(zm1|k1|T_z, ptr [rax+128]);
vcvttps2qqs(zm1|k1|T_z, ptr_b[rax+128]);
// vcvttps2uqqs: same eleven operand forms.
vcvttps2uqqs(xm1|k1|T_z, xm2);
vcvttps2uqqs(xm1|k1|T_z, ptr [rax+128]);
vcvttps2uqqs(xm1|k1|T_z, ptr_b[rax+128]);
vcvttps2uqqs(ym1|k1|T_z, xm2);
vcvttps2uqqs(ym1|k1|T_z, ptr [rax+128]);
vcvttps2uqqs(ym1|k1|T_z, ptr_b[rax+128]);
vcvttps2uqqs(zm1, ym2);
vcvttps2uqqs(zm1|k1|T_z, ym2);
vcvttps2uqqs(zm1|k1|T_z|T_sae, ym2);
vcvttps2uqqs(zm1|k1|T_z, ptr [rax+128]);
vcvttps2uqqs(zm1|k1|T_z, ptr_b[rax+128]);
// vcvttsd2sis: general-purpose destination (eax and r30), each with a register
// source, a register source carrying T_sae, and a memory source.
vcvttsd2sis(eax, xm1);
vcvttsd2sis(eax, xm1|T_sae);
vcvttsd2sis(eax, ptr[rax+128]);
vcvttsd2sis(r30, xm1);
vcvttsd2sis(r30, xm1|T_sae);
vcvttsd2sis(r30, ptr[rax+128]);
// vcvttsd2usis: same six operand forms.
vcvttsd2usis(eax, xm1);
vcvttsd2usis(eax, xm1|T_sae);
vcvttsd2usis(eax, ptr[rax+128]);
vcvttsd2usis(r30, xm1);
vcvttsd2usis(r30, xm1|T_sae);
vcvttsd2usis(r30, ptr[rax+128]);
// vcvttss2sis: same six operand forms.
vcvttss2sis(eax, xm1);
vcvttss2sis(eax, xm1|T_sae);
vcvttss2sis(eax, ptr[rax+128]);
vcvttss2sis(r30, xm1);
vcvttss2sis(r30, xm1|T_sae);
vcvttss2sis(r30, ptr[rax+128]);
// vcvttss2usis: same six operand forms.
vcvttss2usis(eax, xm1);
vcvttss2usis(eax, xm1|T_sae);
vcvttss2usis(eax, ptr[rax+128]);
vcvttss2usis(r30, xm1);
vcvttss2usis(r30, xm1|T_sae);
vcvttss2usis(r30, ptr[rax+128]);