211 lines
6.0 KiB
Plaintext
211 lines
6.0 KiB
Plaintext
// vaddbf16: four operand patterns.
//   1. xmm destination and two xmm sources, no mask
//   2. ymm destination with k1, memory source written as ptr[rax+128]
//   3. ymm destination with k1, memory source written as ptr_b[rax+128]
//   4. zmm destination with k2|T_z, memory source written as ptr_b[rax+128]
vaddbf16(xm1, xm2, xm3);
|
|
vaddbf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vaddbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vaddbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
// vdivbf16: xmm register-only form, ymm|k1 with ptr[] and with ptr_b[],
// and zmm|k2|T_z with ptr_b[]; every memory operand is [rax+128].
vdivbf16(xm1, xm2, xm3);
|
|
vdivbf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vdivbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vdivbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
// vmaxbf16: xmm register-only form, ymm|k1 with ptr[] and with ptr_b[],
// and zmm|k2|T_z with ptr_b[]; every memory operand is [rax+128].
vmaxbf16(xm1, xm2, xm3);
|
|
vmaxbf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vmaxbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vmaxbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
// vminbf16: xmm register-only form, ymm|k1 with ptr[] and with ptr_b[],
// and zmm|k2|T_z with ptr_b[]; every memory operand is [rax+128].
vminbf16(xm1, xm2, xm3);
|
|
vminbf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vminbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vminbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
// vmulbf16: xmm register-only form, ymm|k1 with ptr[] and with ptr_b[],
// and zmm|k2|T_z with ptr_b[]; every memory operand is [rax+128].
vmulbf16(xm1, xm2, xm3);
|
|
vmulbf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vmulbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vmulbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
// vscalefbf16, first group: xmm register-only form, ymm|k1 with ptr[] and
// with ptr_b[], and zmm|k2|T_z with ptr_b[]; memory operands are [rax+128].
// A second vscalefbf16 group with k5 masks and [rcx+128] operands appears
// later in this listing.
vscalefbf16(xm1, xm2, xm3);
|
|
vscalefbf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vscalefbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vscalefbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
// vsubbf16: xmm register-only form, ymm|k1 with ptr[] and with ptr_b[],
// and zmm|k2|T_z with ptr_b[]; every memory operand is [rax+128].
vsubbf16(xm1, xm2, xm3);
|
|
vsubbf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vsubbf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vsubbf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
// madd
// Covers vfmadd132bf16, vfmadd213bf16 and vfmadd231bf16. Each mnemonic is
// listed with the same four operand patterns:
//   xmm register-only; ymm|k1 with ptr[rax+128]; ymm|k1 with ptr_b[rax+128];
//   zmm|k2|T_z with ptr_b[rax+128].
|
|
vfmadd132bf16(xm1, xm2, xm3);
|
|
vfmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfmadd213bf16(xm1, xm2, xm3);
|
|
vfmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfmadd231bf16(xm1, xm2, xm3);
|
|
vfmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
// nmadd
// Covers vfnmadd132bf16, vfnmadd213bf16 and vfnmadd231bf16. Each mnemonic is
// listed with the same four operand patterns:
//   xmm register-only; ymm|k1 with ptr[rax+128]; ymm|k1 with ptr_b[rax+128];
//   zmm|k2|T_z with ptr_b[rax+128].
|
|
vfnmadd132bf16(xm1, xm2, xm3);
|
|
vfnmadd132bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfnmadd132bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfnmadd132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfnmadd213bf16(xm1, xm2, xm3);
|
|
vfnmadd213bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfnmadd213bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfnmadd213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfnmadd231bf16(xm1, xm2, xm3);
|
|
vfnmadd231bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfnmadd231bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfnmadd231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
// msub
// Covers vfmsub132bf16, vfmsub213bf16 and vfmsub231bf16. Each mnemonic is
// listed with the same four operand patterns:
//   xmm register-only; ymm|k1 with ptr[rax+128]; ymm|k1 with ptr_b[rax+128];
//   zmm|k2|T_z with ptr_b[rax+128].
|
|
vfmsub132bf16(xm1, xm2, xm3);
|
|
vfmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfmsub213bf16(xm1, xm2, xm3);
|
|
vfmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfmsub231bf16(xm1, xm2, xm3);
|
|
vfmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
// nmsub
// Covers vfnmsub132bf16, vfnmsub213bf16 and vfnmsub231bf16. Each mnemonic is
// listed with the same four operand patterns:
//   xmm register-only; ymm|k1 with ptr[rax+128]; ymm|k1 with ptr_b[rax+128];
//   zmm|k2|T_z with ptr_b[rax+128].
|
|
vfnmsub132bf16(xm1, xm2, xm3);
|
|
vfnmsub132bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfnmsub132bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfnmsub132bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfnmsub213bf16(xm1, xm2, xm3);
|
|
vfnmsub213bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfnmsub213bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfnmsub213bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
vfnmsub231bf16(xm1, xm2, xm3);
|
|
vfnmsub231bf16(ym1|k1, ym2, ptr[rax+128]);
|
|
vfnmsub231bf16(ym1|k1, ym2, ptr_b[rax+128]);
|
|
vfnmsub231bf16(zm1|k2|T_z, zm2, ptr_b[rax+128]);
|
|
|
|
// vcmpbf16: the destination is an opmask register (k1 through k6, one per
// case). Sources cover xmm/ymm/zmm register pairs plus ptr[] and ptr_b[]
// memory operands at [rax+128]. The trailing immediate steps from 5 to 10.
// The ymm and zmm cases include high-numbered registers (ym15, zm30, zm20).
vcmpbf16(k1, xm5, xm4, 5);
|
|
vcmpbf16(k2, ym5, ym4, 6);
|
|
vcmpbf16(k3, ym15, ptr_b[rax+128], 7);
|
|
vcmpbf16(k4, zm30, zm20, 8);
|
|
vcmpbf16(k5, zm1, ptr[rax+128], 9);
|
|
vcmpbf16(k6, zm10, ptr_b[rax+128], 10);
|
|
|
|
// vfpclassbf16: the destination is an opmask register, mostly combined with
// k5 via `|`. Register sources are xm4, ym4 and zm20. Memory sources use the
// explicitly sized spellings xword/yword/zword and their _b variants rather
// than ptr/ptr_b. The trailing immediate steps from 5 to 13.
// NOTE(review): the yword[] case uses k5|k5, i.e. the same register on both
// sides of `|`; presumably intentional - confirm.
vfpclassbf16(k1, xm4, 5);
|
|
vfpclassbf16(k2|k5, ym4, 6);
|
|
vfpclassbf16(k3|k5, zm20, 7);
|
|
vfpclassbf16(k3|k5, xword[rax+128], 8);
|
|
vfpclassbf16(k3, xword_b[rax+128], 9);
|
|
vfpclassbf16(k5|k5, yword[rax+128], 10);
|
|
vfpclassbf16(k6|k5, yword_b[rax+128], 11);
|
|
vfpclassbf16(k7|k5, zword[rax+128], 12);
|
|
vfpclassbf16(k7|k5, zword_b[rax+128], 13);
|
|
|
|
// vcomisbf16: two cases only, both with an xmm first operand and no mask:
// an xmm second operand, and a ptr[rax+128] memory second operand.
vcomisbf16(xm2, xm3);
|
|
vcomisbf16(xm2, ptr[rax+128]);
|
|
|
|
// vgetexpbf16: three sub-groups (xmm, ymm, zmm destination), all masked
// with k3. Each sub-group lists a register source, a ptr[rax+128] source and
// a ptr_b[rax+128] source.
// The register sources are written xmm2/ymm2/zmm2, while the destinations
// use the short xm1/ym1/zm1 spelling.
vgetexpbf16(xm1|k3, xmm2);
|
|
vgetexpbf16(xm1|k3, ptr[rax+128]);
|
|
vgetexpbf16(xm1|k3, ptr_b[rax+128]);
|
|
|
|
vgetexpbf16(ym1|k3, ymm2);
|
|
vgetexpbf16(ym1|k3, ptr[rax+128]);
|
|
vgetexpbf16(ym1|k3, ptr_b[rax+128]);
|
|
|
|
vgetexpbf16(zm1|k3, zmm2);
|
|
vgetexpbf16(zm1|k3, ptr[rax+128]);
|
|
vgetexpbf16(zm1|k3, ptr_b[rax+128]);
|
|
|
|
// vgetmantbf16: three sub-groups (xmm, ymm, zmm destination), all masked
// with k3. Within each sub-group the source and trailing immediate are:
// register with 3, ptr[rax+128] with 5, ptr_b[rax+128] with 9.
// The register sources are written xmm2/ymm2/zmm2, while the destinations
// use the short xm1/ym1/zm1 spelling.
vgetmantbf16(xm1|k3, xmm2, 3);
|
|
vgetmantbf16(xm1|k3, ptr[rax+128], 5);
|
|
vgetmantbf16(xm1|k3, ptr_b[rax+128], 9);
|
|
|
|
vgetmantbf16(ym1|k3, ymm2, 3);
|
|
vgetmantbf16(ym1|k3, ptr[rax+128], 5);
|
|
vgetmantbf16(ym1|k3, ptr_b[rax+128], 9);
|
|
|
|
vgetmantbf16(zm1|k3, zmm2, 3);
|
|
vgetmantbf16(zm1|k3, ptr[rax+128], 5);
|
|
vgetmantbf16(zm1|k3, ptr_b[rax+128], 9);
|
|
|
|
// vrcpbf16: three sub-groups (xmm, ymm, zmm destination), all masked with
// k5. Each sub-group lists a register source, a ptr[rcx+128] source and a
// ptr_b[rcx+128] source. This group addresses memory through rcx, not rax.
vrcpbf16(xm1|k5, xm2);
|
|
vrcpbf16(xm1|k5, ptr[rcx+128]);
|
|
vrcpbf16(xm1|k5, ptr_b[rcx+128]);
|
|
|
|
vrcpbf16(ym1|k5, ym2);
|
|
vrcpbf16(ym1|k5, ptr[rcx+128]);
|
|
vrcpbf16(ym1|k5, ptr_b[rcx+128]);
|
|
|
|
vrcpbf16(zm1|k5, zm2);
|
|
vrcpbf16(zm1|k5, ptr[rcx+128]);
|
|
vrcpbf16(zm1|k5, ptr_b[rcx+128]);
|
|
|
|
// vreducebf16: three sub-groups (xmm, ymm, zmm destination), all masked
// with k4. Each sub-group lists a register source, a ptr[rax+128] source and
// a ptr_b[rax+128] source. The trailing immediate is 1 in every case.
vreducebf16(xm1|k4, xm2, 1);
|
|
vreducebf16(xm1|k4, ptr[rax+128], 1);
|
|
vreducebf16(xm1|k4, ptr_b[rax+128], 1);
|
|
|
|
vreducebf16(ym1|k4, ym2, 1);
|
|
vreducebf16(ym1|k4, ptr[rax+128], 1);
|
|
vreducebf16(ym1|k4, ptr_b[rax+128], 1);
|
|
|
|
vreducebf16(zm1|k4, zm2, 1);
|
|
vreducebf16(zm1|k4, ptr[rax+128], 1);
|
|
vreducebf16(zm1|k4, ptr_b[rax+128], 1);
|
|
|
|
// vrndscalebf16: three sub-groups (xmm, ymm, zmm destination), all masked
// with k4. Each sub-group lists a register source, a ptr[rax+128] source and
// a ptr_b[rax+128] source. The trailing immediate is 1 in every case.
vrndscalebf16(xm1|k4, xm2, 1);
|
|
vrndscalebf16(xm1|k4, ptr[rax+128], 1);
|
|
vrndscalebf16(xm1|k4, ptr_b[rax+128], 1);
|
|
|
|
vrndscalebf16(ym1|k4, ym2, 1);
|
|
vrndscalebf16(ym1|k4, ptr[rax+128], 1);
|
|
vrndscalebf16(ym1|k4, ptr_b[rax+128], 1);
|
|
|
|
vrndscalebf16(zm1|k4, zm2, 1);
|
|
vrndscalebf16(zm1|k4, ptr[rax+128], 1);
|
|
vrndscalebf16(zm1|k4, ptr_b[rax+128], 1);
|
|
|
|
// vrsqrtbf16: three sub-groups (xmm, ymm, zmm destination), all masked
// with k5. Each sub-group lists a register source, a ptr[rcx+128] source and
// a ptr_b[rcx+128] source. This group addresses memory through rcx, not rax.
vrsqrtbf16(xm1|k5, xm2);
|
|
vrsqrtbf16(xm1|k5, ptr[rcx+128]);
|
|
vrsqrtbf16(xm1|k5, ptr_b[rcx+128]);
|
|
|
|
vrsqrtbf16(ym1|k5, ym2);
|
|
vrsqrtbf16(ym1|k5, ptr[rcx+128]);
|
|
vrsqrtbf16(ym1|k5, ptr_b[rcx+128]);
|
|
|
|
vrsqrtbf16(zm1|k5, zm2);
|
|
vrsqrtbf16(zm1|k5, ptr[rcx+128]);
|
|
vrsqrtbf16(zm1|k5, ptr_b[rcx+128]);
|
|
|
|
// vscalefbf16, second group: three sub-groups (xmm, ymm, zmm destination),
// all masked with k5. The middle operand is xm5, ym9 or zm30 depending on
// width. The last operand is a register, ptr[rcx+128] or ptr_b[rcx+128].
// A shorter vscalefbf16 group (k1/k2|T_z masks, [rax+128]) appears earlier
// in this listing.
vscalefbf16(xm1|k5, xm5, xm2);
|
|
vscalefbf16(xm1|k5, xm5, ptr[rcx+128]);
|
|
vscalefbf16(xm1|k5, xm5, ptr_b[rcx+128]);
|
|
|
|
vscalefbf16(ym1|k5, ym9, ym2);
|
|
vscalefbf16(ym1|k5, ym9, ptr[rcx+128]);
|
|
vscalefbf16(ym1|k5, ym9, ptr_b[rcx+128]);
|
|
|
|
vscalefbf16(zm1|k5, zm30, zm2);
|
|
vscalefbf16(zm1|k5, zm30, ptr[rcx+128]);
|
|
vscalefbf16(zm1|k5, zm30, ptr_b[rcx+128]);
|
|
|
|
// vsqrtbf16: three sub-groups (xmm, ymm, zmm destination), all masked with
// k3. Each sub-group lists a register source, a ptr[rax+128] source and a
// ptr_b[rax+128] source.
// The register sources are written xmm4/ymm4/zmm4, while the destinations
// use the short xm5/ym5/zm5 spelling.
vsqrtbf16(xm5|k3, xmm4);
|
|
vsqrtbf16(xm5|k3, ptr[rax+128]);
|
|
vsqrtbf16(xm5|k3, ptr_b[rax+128]);
|
|
|
|
vsqrtbf16(ym5|k3, ymm4);
|
|
vsqrtbf16(ym5|k3, ptr[rax+128]);
|
|
vsqrtbf16(ym5|k3, ptr_b[rax+128]);
|
|
|
|
vsqrtbf16(zm5|k3, zmm4);
|
|
vsqrtbf16(zm5|k3, ptr[rax+128]);
|
|
vsqrtbf16(zm5|k3, ptr_b[rax+128]);
|