git-subtree-dir: external/xbyak git-subtree-split: 9d8ff37306f39c6a71cf998078cbe880ce5dc224
266 lines
5.3 KiB
Plaintext
266 lines
5.3 KiB
Plaintext
// AVX10 integer and FP16 VNNI, media and zero-extending
// vdpphps: xmm/ymm/zmm destinations, each with a register source, a plain
// memory source (ptr) and an xbyak broadcast memory source (ptr_b).
|
|
vdpphps(xm1, xm2, xm3);
|
|
vdpphps(xm1, xm2, ptr[rax+128]);
|
|
vdpphps(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vdpphps(ym1, ym2, ym3);
|
|
vdpphps(ym1, ym2, ptr[rax+128]);
|
|
vdpphps(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vdpphps(zm1, zm2, zm3);
|
|
vdpphps(zm1, zm2, ptr[rax+128]);
|
|
vdpphps(zm1, zm2, ptr_b[rax+128]);
|
|
// vmpsadbw with an immediate operand: xmm/ymm/zmm destinations, register
// and memory sources, with unmasked, k4-masked and T_z-flagged destinations.
// NOTE(review): the xmm memory form uses T_z without a k-mask register;
// presumably this is there to select the EVEX form - confirm it is intended.
|
|
vmpsadbw(xm1, xm3, xm15, 3);
|
|
vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5);
|
|
|
|
vmpsadbw(ym1|k4, ym3, ym15, 3);
|
|
vmpsadbw(ym1, ym4, ptr[rax+128], 5);
|
|
|
|
vmpsadbw(zm1|k4, zm3, zm15, 3);
|
|
vmpsadbw(zm1, zm4, ptr[rax+128], 5);
|
|
// vpdpbssd: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpbssd(xm1, xm2, xm3);
|
|
vpdpbssd(xm1, xm2, ptr[rax+128]);
|
|
vpdpbssd(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpbssd(ym1, ym2, ym3);
|
|
vpdpbssd(ym1, ym2, ptr[rax+128]);
|
|
vpdpbssd(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpbssd(zm1, zm2, zm3);
|
|
vpdpbssd(zm1, zm2, ptr[rax+128]);
|
|
vpdpbssd(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpbssds: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpbssds(xm1, xm2, xm3);
|
|
vpdpbssds(xm1, xm2, ptr[rax+128]);
|
|
vpdpbssds(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpbssds(ym1, ym2, ym3);
|
|
vpdpbssds(ym1, ym2, ptr[rax+128]);
|
|
vpdpbssds(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpbssds(zm1, zm2, zm3);
|
|
vpdpbssds(zm1, zm2, ptr[rax+128]);
|
|
vpdpbssds(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpbsud: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpbsud(xm1, xm2, xm3);
|
|
vpdpbsud(xm1, xm2, ptr[rax+128]);
|
|
vpdpbsud(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpbsud(ym1, ym2, ym3);
|
|
vpdpbsud(ym1, ym2, ptr[rax+128]);
|
|
vpdpbsud(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpbsud(zm1, zm2, zm3);
|
|
vpdpbsud(zm1, zm2, ptr[rax+128]);
|
|
vpdpbsud(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpbsuds: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpbsuds(xm1, xm2, xm3);
|
|
vpdpbsuds(xm1, xm2, ptr[rax+128]);
|
|
vpdpbsuds(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpbsuds(ym1, ym2, ym3);
|
|
vpdpbsuds(ym1, ym2, ptr[rax+128]);
|
|
vpdpbsuds(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpbsuds(zm1, zm2, zm3);
|
|
vpdpbsuds(zm1, zm2, ptr[rax+128]);
|
|
vpdpbsuds(zm1, zm2, ptr_b[rax+128]);
|
|
|
|
// vpdpbuud: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpbuud(xm1, xm2, xm3);
|
|
vpdpbuud(xm1, xm2, ptr[rax+128]);
|
|
vpdpbuud(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpbuud(ym1, ym2, ym3);
|
|
vpdpbuud(ym1, ym2, ptr[rax+128]);
|
|
vpdpbuud(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpbuud(zm1, zm2, zm3);
|
|
vpdpbuud(zm1, zm2, ptr[rax+128]);
|
|
vpdpbuud(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpbuuds: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpbuuds(xm1, xm2, xm3);
|
|
vpdpbuuds(xm1, xm2, ptr[rax+128]);
|
|
vpdpbuuds(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpbuuds(ym1, ym2, ym3);
|
|
vpdpbuuds(ym1, ym2, ptr[rax+128]);
|
|
vpdpbuuds(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpbuuds(zm1, zm2, zm3);
|
|
vpdpbuuds(zm1, zm2, ptr[rax+128]);
|
|
vpdpbuuds(zm1, zm2, ptr_b[rax+128]);
|
|
|
|
// vpdpwsud: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpwsud(xm1, xm2, xm3);
|
|
vpdpwsud(xm1, xm2, ptr[rax+128]);
|
|
vpdpwsud(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpwsud(ym1, ym2, ym3);
|
|
vpdpwsud(ym1, ym2, ptr[rax+128]);
|
|
vpdpwsud(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpwsud(zm1, zm2, zm3);
|
|
vpdpwsud(zm1, zm2, ptr[rax+128]);
|
|
vpdpwsud(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpwsuds: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpwsuds(xm1, xm2, xm3);
|
|
vpdpwsuds(xm1, xm2, ptr[rax+128]);
|
|
vpdpwsuds(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpwsuds(ym1, ym2, ym3);
|
|
vpdpwsuds(ym1, ym2, ptr[rax+128]);
|
|
vpdpwsuds(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpwsuds(zm1, zm2, zm3);
|
|
vpdpwsuds(zm1, zm2, ptr[rax+128]);
|
|
vpdpwsuds(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpwusd: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
// These two groups previously repeated the vpdpwsud/vpdpwsuds cases verbatim,
// which left the vpdpwusd/vpdpwusds mnemonics without any coverage.
|
|
vpdpwusd(xm1, xm2, xm3);
|
|
vpdpwusd(xm1, xm2, ptr[rax+128]);
|
|
vpdpwusd(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpwusd(ym1, ym2, ym3);
|
|
vpdpwusd(ym1, ym2, ptr[rax+128]);
|
|
vpdpwusd(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpwusd(zm1, zm2, zm3);
|
|
vpdpwusd(zm1, zm2, ptr[rax+128]);
|
|
vpdpwusd(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpwusds: same operand matrix as the vpdpwusd group above.
|
|
vpdpwusds(xm1, xm2, xm3);
|
|
vpdpwusds(xm1, xm2, ptr[rax+128]);
|
|
vpdpwusds(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpwusds(ym1, ym2, ym3);
|
|
vpdpwusds(ym1, ym2, ptr[rax+128]);
|
|
vpdpwusds(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpwusds(zm1, zm2, zm3);
|
|
vpdpwusds(zm1, zm2, ptr[rax+128]);
|
|
vpdpwusds(zm1, zm2, ptr_b[rax+128]);
|
|
|
|
// vpdpwuud: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpwuud(xm1, xm2, xm3);
|
|
vpdpwuud(xm1, xm2, ptr[rax+128]);
|
|
vpdpwuud(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpwuud(ym1, ym2, ym3);
|
|
vpdpwuud(ym1, ym2, ptr[rax+128]);
|
|
vpdpwuud(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpwuud(zm1, zm2, zm3);
|
|
vpdpwuud(zm1, zm2, ptr[rax+128]);
|
|
vpdpwuud(zm1, zm2, ptr_b[rax+128]);
|
|
// vpdpwuuds: xmm/ymm/zmm destinations, each with register, ptr and
// ptr_b (xbyak broadcast addressing) sources.
|
|
vpdpwuuds(xm1, xm2, xm3);
|
|
vpdpwuuds(xm1, xm2, ptr[rax+128]);
|
|
vpdpwuuds(xm1, xm2, ptr_b[rax+128]);
|
|
|
|
vpdpwuuds(ym1, ym2, ym3);
|
|
vpdpwuuds(ym1, ym2, ptr[rax+128]);
|
|
vpdpwuuds(ym1, ym2, ptr_b[rax+128]);
|
|
|
|
vpdpwuuds(zm1, zm2, zm3);
|
|
vpdpwuuds(zm1, zm2, ptr[rax+128]);
|
|
vpdpwuuds(zm1, zm2, ptr_b[rax+128]);
|
|
|
|
// vmovd: xmm-to-xmm moves (one pair using xmm10/xmm20, one using xmm1/xmm2),
// a load from memory and a store to memory from xmm30.
|
|
vmovd(xm10, xm20);
|
|
vmovd(xm1, xm2);
|
|
vmovd(xm10, ptr[rax+128]);
|
|
vmovd(ptr[rax+128], xm30);
|
|
// vmovw: xmm-to-xmm moves (one with xmm20 as source, one with xmm1/xmm2),
// a load from memory and a store to memory.
|
|
vmovw(xm1, xm20);
|
|
vmovw(xm1, xm2);
|
|
vmovw(xm3, ptr [rax+0x40]);
|
|
vmovw(ptr [rax+0x40], xm7);
|
|
// push: one case per 64-bit general-purpose register, rax through r15 and
// then r16 through r31.
// NOTE(review): r16-r31 presumably exercise the APX extended-register
// encoding - confirm against the expected output.
|
|
push(rax);
|
|
push(rcx);
|
|
push(rdx);
|
|
push(rbx);
|
|
push(rsp);
|
|
push(rbp);
|
|
push(rsi);
|
|
push(rdi);
|
|
push(r8);
|
|
push(r9);
|
|
push(r10);
|
|
push(r11);
|
|
push(r12);
|
|
push(r13);
|
|
push(r14);
|
|
push(r15);
|
|
push(r16);
|
|
push(r17);
|
|
push(r18);
|
|
push(r19);
|
|
push(r20);
|
|
push(r21);
|
|
push(r22);
|
|
push(r23);
|
|
push(r24);
|
|
push(r25);
|
|
push(r26);
|
|
push(r27);
|
|
push(r28);
|
|
push(r29);
|
|
push(r30);
|
|
push(r31);
|
|
// pop: one case per 64-bit general-purpose register, rax through r15 and
// then r16 through r31, in the same register order as the push cases.
pop(rax);
|
|
pop(rcx);
|
|
pop(rdx);
|
|
pop(rbx);
|
|
pop(rsp);
|
|
pop(rbp);
|
|
pop(rsi);
|
|
pop(rdi);
|
|
pop(r8);
|
|
pop(r9);
|
|
pop(r10);
|
|
pop(r11);
|
|
pop(r12);
|
|
pop(r13);
|
|
pop(r14);
|
|
pop(r15);
|
|
pop(r16);
|
|
pop(r17);
|
|
pop(r18);
|
|
pop(r19);
|
|
pop(r20);
|
|
pop(r21);
|
|
pop(r22);
|
|
pop(r23);
|
|
pop(r24);
|
|
pop(r25);
|
|
pop(r26);
|
|
pop(r27);
|
|
pop(r28);
|
|
pop(r29);
|
|
pop(r30);
|
|
pop(r31);
|
|
|
|
// movrs: memory loads into 64-, 32-, 16- and 8-bit destination registers;
// the 8-bit case also uses a base + scaled-index address.
movrs(rcx, ptr[rax]);
|
|
movrs(ecx, ptr[rax]);
|
|
movrs(cx, ptr[rax]);
|
|
movrs(cl, ptr[rax+rdx*4]);
|
|
|
|
// vmovrsb / vmovrsd / vmovrsq / vmovrsw: memory loads into xmm/ymm/zmm
// destinations, every case using the k1 mask together with T_z.
vmovrsb(xm1|k1|T_z, ptr[rax+128]);
|
|
vmovrsb(ym1|k1|T_z, ptr[rax+128]);
|
|
vmovrsb(zm1|k1|T_z, ptr[rax+128]);
|
|
|
|
vmovrsd(xm1|k1|T_z, ptr[rax+128]);
|
|
vmovrsd(ym1|k1|T_z, ptr[rax+128]);
|
|
vmovrsd(zm1|k1|T_z, ptr[rax+128]);
|
|
|
|
vmovrsq(xm1|k1|T_z, ptr[rax+128]);
|
|
vmovrsq(ym1|k1|T_z, ptr[rax+128]);
|
|
vmovrsq(zm1|k1|T_z, ptr[rax+128]);
|
|
|
|
vmovrsw(xm1|k1|T_z, ptr[rax+128]);
|
|
vmovrsw(ym1|k1|T_z, ptr[rax+128]);
|
|
vmovrsw(zm1|k1|T_z, ptr[rax+128]);
|