// AVX10 integer and FP16 VNNI, media and zero-extending vdpphps(xm1, xm2, xm3); vdpphps(xm1, xm2, ptr[rax+128]); vdpphps(xm1, xm2, ptr_b[rax+128]); vdpphps(ym1, ym2, ym3); vdpphps(ym1, ym2, ptr[rax+128]); vdpphps(ym1, ym2, ptr_b[rax+128]); vdpphps(zm1, zm2, zm3); vdpphps(zm1, zm2, ptr[rax+128]); vdpphps(zm1, zm2, ptr_b[rax+128]); // vmpsadbw(xm1, xm3, xm15, 3); vmpsadbw(xm1|T_z, xm4, ptr[rax+128], 5); vmpsadbw(ym1|k4, ym3, ym15, 3); vmpsadbw(ym1, ym4, ptr[rax+128], 5); vmpsadbw(zm1|k4, zm3, zm15, 3); vmpsadbw(zm1, zm4, ptr[rax+128], 5); // vpdpbssd(xm1, xm2, xm3); vpdpbssd(xm1, xm2, ptr[rax+128]); vpdpbssd(xm1, xm2, ptr_b[rax+128]); vpdpbssd(ym1, ym2, ym3); vpdpbssd(ym1, ym2, ptr[rax+128]); vpdpbssd(ym1, ym2, ptr_b[rax+128]); vpdpbssd(zm1, zm2, zm3); vpdpbssd(zm1, zm2, ptr[rax+128]); vpdpbssd(zm1, zm2, ptr_b[rax+128]); // vpdpbssds(xm1, xm2, xm3); vpdpbssds(xm1, xm2, ptr[rax+128]); vpdpbssds(xm1, xm2, ptr_b[rax+128]); vpdpbssds(ym1, ym2, ym3); vpdpbssds(ym1, ym2, ptr[rax+128]); vpdpbssds(ym1, ym2, ptr_b[rax+128]); vpdpbssds(zm1, zm2, zm3); vpdpbssds(zm1, zm2, ptr[rax+128]); vpdpbssds(zm1, zm2, ptr_b[rax+128]); // vpdpbsud(xm1, xm2, xm3); vpdpbsud(xm1, xm2, ptr[rax+128]); vpdpbsud(xm1, xm2, ptr_b[rax+128]); vpdpbsud(ym1, ym2, ym3); vpdpbsud(ym1, ym2, ptr[rax+128]); vpdpbsud(ym1, ym2, ptr_b[rax+128]); vpdpbsud(zm1, zm2, zm3); vpdpbsud(zm1, zm2, ptr[rax+128]); vpdpbsud(zm1, zm2, ptr_b[rax+128]); // vpdpbsuds(xm1, xm2, xm3); vpdpbsuds(xm1, xm2, ptr[rax+128]); vpdpbsuds(xm1, xm2, ptr_b[rax+128]); vpdpbsuds(ym1, ym2, ym3); vpdpbsuds(ym1, ym2, ptr[rax+128]); vpdpbsuds(ym1, ym2, ptr_b[rax+128]); vpdpbsuds(zm1, zm2, zm3); vpdpbsuds(zm1, zm2, ptr[rax+128]); vpdpbsuds(zm1, zm2, ptr_b[rax+128]); // vpdpbuud(xm1, xm2, xm3); vpdpbuud(xm1, xm2, ptr[rax+128]); vpdpbuud(xm1, xm2, ptr_b[rax+128]); vpdpbuud(ym1, ym2, ym3); vpdpbuud(ym1, ym2, ptr[rax+128]); vpdpbuud(ym1, ym2, ptr_b[rax+128]); vpdpbuud(zm1, zm2, zm3); vpdpbuud(zm1, zm2, ptr[rax+128]); vpdpbuud(zm1, zm2, ptr_b[rax+128]); // vpdpbuuds(xm1, xm2, xm3); vpdpbuuds(xm1, xm2, ptr[rax+128]); vpdpbuuds(xm1, xm2, ptr_b[rax+128]); vpdpbuuds(ym1, ym2, ym3); vpdpbuuds(ym1, ym2, ptr[rax+128]); vpdpbuuds(ym1, ym2, ptr_b[rax+128]); vpdpbuuds(zm1, zm2, zm3); vpdpbuuds(zm1, zm2, ptr[rax+128]); vpdpbuuds(zm1, zm2, ptr_b[rax+128]); // vpdpwsud(xm1, xm2, xm3); vpdpwsud(xm1, xm2, ptr[rax+128]); vpdpwsud(xm1, xm2, ptr_b[rax+128]); vpdpwsud(ym1, ym2, ym3); vpdpwsud(ym1, ym2, ptr[rax+128]); vpdpwsud(ym1, ym2, ptr_b[rax+128]); vpdpwsud(zm1, zm2, zm3); vpdpwsud(zm1, zm2, ptr[rax+128]); vpdpwsud(zm1, zm2, ptr_b[rax+128]); // vpdpwsuds(xm1, xm2, xm3); vpdpwsuds(xm1, xm2, ptr[rax+128]); vpdpwsuds(xm1, xm2, ptr_b[rax+128]); vpdpwsuds(ym1, ym2, ym3); vpdpwsuds(ym1, ym2, ptr[rax+128]); vpdpwsuds(ym1, ym2, ptr_b[rax+128]); vpdpwsuds(zm1, zm2, zm3); vpdpwsuds(zm1, zm2, ptr[rax+128]); vpdpwsuds(zm1, zm2, ptr_b[rax+128]); // vpdpwsud(xm1, xm2, xm3); vpdpwsud(xm1, xm2, ptr[rax+128]); vpdpwsud(xm1, xm2, ptr_b[rax+128]); vpdpwsud(ym1, ym2, ym3); vpdpwsud(ym1, ym2, ptr[rax+128]); vpdpwsud(ym1, ym2, ptr_b[rax+128]); vpdpwsud(zm1, zm2, zm3); vpdpwsud(zm1, zm2, ptr[rax+128]); vpdpwsud(zm1, zm2, ptr_b[rax+128]); // vpdpwsuds(xm1, xm2, xm3); vpdpwsuds(xm1, xm2, ptr[rax+128]); vpdpwsuds(xm1, xm2, ptr_b[rax+128]); vpdpwsuds(ym1, ym2, ym3); vpdpwsuds(ym1, ym2, ptr[rax+128]); vpdpwsuds(ym1, ym2, ptr_b[rax+128]); vpdpwsuds(zm1, zm2, zm3); vpdpwsuds(zm1, zm2, ptr[rax+128]); vpdpwsuds(zm1, zm2, ptr_b[rax+128]); // vpdpwuud(xm1, xm2, xm3); vpdpwuud(xm1, xm2, ptr[rax+128]); vpdpwuud(xm1, xm2, ptr_b[rax+128]); vpdpwuud(ym1, ym2, ym3); vpdpwuud(ym1, ym2, ptr[rax+128]); vpdpwuud(ym1, ym2, ptr_b[rax+128]); vpdpwuud(zm1, zm2, zm3); vpdpwuud(zm1, zm2, ptr[rax+128]); vpdpwuud(zm1, zm2, ptr_b[rax+128]); // vpdpwuuds(xm1, xm2, xm3); vpdpwuuds(xm1, xm2, ptr[rax+128]); vpdpwuuds(xm1, xm2, ptr_b[rax+128]); vpdpwuuds(ym1, ym2, ym3); vpdpwuuds(ym1, ym2, ptr[rax+128]); vpdpwuuds(ym1, ym2, ptr_b[rax+128]); vpdpwuuds(zm1, zm2, zm3); vpdpwuuds(zm1, zm2, ptr[rax+128]); vpdpwuuds(zm1, zm2, ptr_b[rax+128]); // vmovd(xm10, xm20); vmovd(xm1, xm2); vmovd(xm10, ptr[rax+128]); vmovd(ptr[rax+128], xm30); // vmovw(xm1, xm20); vmovw(xm1, xm2); vmovw(xm3, ptr [rax+0x40]); vmovw(ptr [rax+0x40], xm7); // push(rax); push(rcx); push(rdx); push(rbx); push(rsp); push(rbp); push(rsi); push(rdi); push(r8); push(r9); push(r10); push(r11); push(r12); push(r13); push(r14); push(r15); push(r16); push(r17); push(r18); push(r19); push(r20); push(r21); push(r22); push(r23); push(r24); push(r25); push(r26); push(r27); push(r28); push(r29); push(r30); push(r31); pop(rax); pop(rcx); pop(rdx); pop(rbx); pop(rsp); pop(rbp); pop(rsi); pop(rdi); pop(r8); pop(r9); pop(r10); pop(r11); pop(r12); pop(r13); pop(r14); pop(r15); pop(r16); pop(r17); pop(r18); pop(r19); pop(r20); pop(r21); pop(r22); pop(r23); pop(r24); pop(r25); pop(r26); pop(r27); pop(r28); pop(r29); pop(r30); pop(r31); movrs(rcx, ptr[rax]); movrs(ecx, ptr[rax]); movrs(cx, ptr[rax]); movrs(cl, ptr[rax+rdx*4]); vmovrsb(xm1|k1|T_z, ptr[rax+128]); vmovrsb(ym1|k1|T_z, ptr[rax+128]); vmovrsb(zm1|k1|T_z, ptr[rax+128]); vmovrsd(xm1|k1|T_z, ptr[rax+128]); vmovrsd(ym1|k1|T_z, ptr[rax+128]); vmovrsd(zm1|k1|T_z, ptr[rax+128]); vmovrsq(xm1|k1|T_z, ptr[rax+128]); vmovrsq(ym1|k1|T_z, ptr[rax+128]); vmovrsq(zm1|k1|T_z, ptr[rax+128]); vmovrsw(xm1|k1|T_z, ptr[rax+128]); vmovrsw(ym1|k1|T_z, ptr[rax+128]); vmovrsw(zm1|k1|T_z, ptr[rax+128]);