// AVX10 integer and FP16 VNNI, media and zero-extending vdpphps(xm1, xm2, xm3); vdpphps(xm1, xm2, ptr[rax+64]); vdpphps(xm1, xm2, ptr_b[rax+64]); vdpphps(ym1, ym2, ym3); vdpphps(ym1, ym2, ptr[rax+64]); vdpphps(ym1, ym2, ptr_b[rax+64]); vdpphps(zm1, zm2, zm3); vdpphps(zm1, zm2, ptr[rax+64]); vdpphps(zm1, zm2, ptr_b[rax+64]); // vmpsadbw(xm1, xm3, xm15, 3); vmpsadbw(xm1|T_z, xm4, ptr[rax+64], 5); vmpsadbw(ym1|k4, ym3, ym15, 3); vmpsadbw(ym1, ym4, ptr[rax+64], 5); vmpsadbw(zm1|k4, zm3, zm15, 3); vmpsadbw(zm1, zm4, ptr[rax+64], 5); // vpdpbssd(xm1, xm2, xm3); vpdpbssd(xm1, xm2, ptr[rax+64]); vpdpbssd(xm1, xm2, ptr_b[rax+64]); vpdpbssd(ym1, ym2, ym3); vpdpbssd(ym1, ym2, ptr[rax+64]); vpdpbssd(ym1, ym2, ptr_b[rax+64]); vpdpbssd(zm1, zm2, zm3); vpdpbssd(zm1, zm2, ptr[rax+64]); vpdpbssd(zm1, zm2, ptr_b[rax+64]); // vpdpbssds(xm1, xm2, xm3); vpdpbssds(xm1, xm2, ptr[rax+64]); vpdpbssds(xm1, xm2, ptr_b[rax+64]); vpdpbssds(ym1, ym2, ym3); vpdpbssds(ym1, ym2, ptr[rax+64]); vpdpbssds(ym1, ym2, ptr_b[rax+64]); vpdpbssds(zm1, zm2, zm3); vpdpbssds(zm1, zm2, ptr[rax+64]); vpdpbssds(zm1, zm2, ptr_b[rax+64]); // vpdpbsud(xm1, xm2, xm3); vpdpbsud(xm1, xm2, ptr[rax+64]); vpdpbsud(xm1, xm2, ptr_b[rax+64]); vpdpbsud(ym1, ym2, ym3); vpdpbsud(ym1, ym2, ptr[rax+64]); vpdpbsud(ym1, ym2, ptr_b[rax+64]); vpdpbsud(zm1, zm2, zm3); vpdpbsud(zm1, zm2, ptr[rax+64]); vpdpbsud(zm1, zm2, ptr_b[rax+64]); // vpdpbsuds(xm1, xm2, xm3); vpdpbsuds(xm1, xm2, ptr[rax+64]); vpdpbsuds(xm1, xm2, ptr_b[rax+64]); vpdpbsuds(ym1, ym2, ym3); vpdpbsuds(ym1, ym2, ptr[rax+64]); vpdpbsuds(ym1, ym2, ptr_b[rax+64]); vpdpbsuds(zm1, zm2, zm3); vpdpbsuds(zm1, zm2, ptr[rax+64]); vpdpbsuds(zm1, zm2, ptr_b[rax+64]); // vpdpbuud(xm1, xm2, xm3); vpdpbuud(xm1, xm2, ptr[rax+64]); vpdpbuud(xm1, xm2, ptr_b[rax+64]); vpdpbuud(ym1, ym2, ym3); vpdpbuud(ym1, ym2, ptr[rax+64]); vpdpbuud(ym1, ym2, ptr_b[rax+64]); vpdpbuud(zm1, zm2, zm3); vpdpbuud(zm1, zm2, ptr[rax+64]); vpdpbuud(zm1, zm2, ptr_b[rax+64]); // vpdpbuuds(xm1, xm2, xm3); vpdpbuuds(xm1, xm2, ptr[rax+64]); vpdpbuuds(xm1, xm2, ptr_b[rax+64]); vpdpbuuds(ym1, ym2, ym3); vpdpbuuds(ym1, ym2, ptr[rax+64]); vpdpbuuds(ym1, ym2, ptr_b[rax+64]); vpdpbuuds(zm1, zm2, zm3); vpdpbuuds(zm1, zm2, ptr[rax+64]); vpdpbuuds(zm1, zm2, ptr_b[rax+64]); // vpdpwsud(xm1, xm2, xm3); vpdpwsud(xm1, xm2, ptr[rax+64]); vpdpwsud(xm1, xm2, ptr_b[rax+64]); vpdpwsud(ym1, ym2, ym3); vpdpwsud(ym1, ym2, ptr[rax+64]); vpdpwsud(ym1, ym2, ptr_b[rax+64]); vpdpwsud(zm1, zm2, zm3); vpdpwsud(zm1, zm2, ptr[rax+64]); vpdpwsud(zm1, zm2, ptr_b[rax+64]); // vpdpwsuds(xm1, xm2, xm3); vpdpwsuds(xm1, xm2, ptr[rax+64]); vpdpwsuds(xm1, xm2, ptr_b[rax+64]); vpdpwsuds(ym1, ym2, ym3); vpdpwsuds(ym1, ym2, ptr[rax+64]); vpdpwsuds(ym1, ym2, ptr_b[rax+64]); vpdpwsuds(zm1, zm2, zm3); vpdpwsuds(zm1, zm2, ptr[rax+64]); vpdpwsuds(zm1, zm2, ptr_b[rax+64]); // vpdpwsud(xm1, xm2, xm3); vpdpwsud(xm1, xm2, ptr[rax+64]); vpdpwsud(xm1, xm2, ptr_b[rax+64]); vpdpwsud(ym1, ym2, ym3); vpdpwsud(ym1, ym2, ptr[rax+64]); vpdpwsud(ym1, ym2, ptr_b[rax+64]); vpdpwsud(zm1, zm2, zm3); vpdpwsud(zm1, zm2, ptr[rax+64]); vpdpwsud(zm1, zm2, ptr_b[rax+64]); // vpdpwsuds(xm1, xm2, xm3); vpdpwsuds(xm1, xm2, ptr[rax+64]); vpdpwsuds(xm1, xm2, ptr_b[rax+64]); vpdpwsuds(ym1, ym2, ym3); vpdpwsuds(ym1, ym2, ptr[rax+64]); vpdpwsuds(ym1, ym2, ptr_b[rax+64]); vpdpwsuds(zm1, zm2, zm3); vpdpwsuds(zm1, zm2, ptr[rax+64]); vpdpwsuds(zm1, zm2, ptr_b[rax+64]); // vpdpwuud(xm1, xm2, xm3); vpdpwuud(xm1, xm2, ptr[rax+64]); vpdpwuud(xm1, xm2, ptr_b[rax+64]); vpdpwuud(ym1, ym2, ym3); vpdpwuud(ym1, ym2, ptr[rax+64]); vpdpwuud(ym1, ym2, ptr_b[rax+64]); vpdpwuud(zm1, zm2, zm3); vpdpwuud(zm1, zm2, ptr[rax+64]); vpdpwuud(zm1, zm2, ptr_b[rax+64]); // vpdpwuuds(xm1, xm2, xm3); vpdpwuuds(xm1, xm2, ptr[rax+64]); vpdpwuuds(xm1, xm2, ptr_b[rax+64]); vpdpwuuds(ym1, ym2, ym3); vpdpwuuds(ym1, ym2, ptr[rax+64]); vpdpwuuds(ym1, ym2, ptr_b[rax+64]); vpdpwuuds(zm1, zm2, zm3); vpdpwuuds(zm1, zm2, ptr[rax+64]); vpdpwuuds(zm1, zm2, ptr_b[rax+64]); // vmovd(xm10, xm20); vmovd(xm1, xm2); vmovd(xm10, ptr[rax+64]); vmovd(ptr[rax+64], xm30); // vmovw(xm1, xm20); vmovw(xm1, xm2); vmovw(xm3, ptr [rax+0x40]); vmovw(ptr [rax+0x40], xm7); // push(rax); push(rcx); push(rdx); push(rbx); push(rsp); push(rbp); push(rsi); push(rdi); push(r8); push(r9); push(r10); push(r11); push(r12); push(r13); push(r14); push(r15); push(r16); push(r17); push(r18); push(r19); push(r20); push(r21); push(r22); push(r23); push(r24); push(r25); push(r26); push(r27); push(r28); push(r29); push(r30); push(r31); pop(rax); pop(rcx); pop(rdx); pop(rbx); pop(rsp); pop(rbp); pop(rsi); pop(rdi); pop(r8); pop(r9); pop(r10); pop(r11); pop(r12); pop(r13); pop(r14); pop(r15); pop(r16); pop(r17); pop(r18); pop(r19); pop(r20); pop(r21); pop(r22); pop(r23); pop(r24); pop(r25); pop(r26); pop(r27); pop(r28); pop(r29); pop(r30); pop(r31); movrs(rcx, ptr[rax]); movrs(ecx, ptr[rax]); movrs(cx, ptr[rax]); movrs(cl, ptr[rax+rdx*4]); prefetchnta(ptr[rcx]); prefetcht0(ptr[rcx]); prefetcht1(ptr[rcx]); prefetcht2(ptr[rcx]); prefetchit1(ptr[rip+64]); prefetchit0(ptr[rip+64]); prefetchrst2(ptr[rcx]); vmovrsb(xm1|k1|T_z, ptr[rax+64]); vmovrsb(ym1|k1|T_z, ptr[rax+64]); vmovrsb(zm1|k1|T_z, ptr[rax+64]); vmovrsd(xm1|k1|T_z, ptr[rax+64]); vmovrsd(ym1|k1|T_z, ptr[rax+64]); vmovrsd(zm1|k1|T_z, ptr[rax+64]); vmovrsq(xm1|k1|T_z, ptr[rax+64]); vmovrsq(ym1|k1|T_z, ptr[rax+64]); vmovrsq(zm1|k1|T_z, ptr[rax+64]); vmovrsw(xm1|k1|T_z, ptr[rax+64]); vmovrsw(ym1|k1|T_z, ptr[rax+64]); vmovrsw(zm1|k1|T_z, ptr[rax+64]); // moved for bug of nasm 3.x vcvtsd2si(esp, xmm4|T_rd_sae); vcvtsd2si(r8, xmm4|T_rd_sae); vcvtsd2usi(ecx, xmm4|T_rd_sae); vcvtsd2usi(r14, xmm4|T_rd_sae); vcvtss2si(ecx, xmm4|T_rd_sae); vcvtss2si(r13, xmm4|T_rd_sae); vcvtss2usi(esi, xmm4|T_rd_sae); vcvtss2usi(r10, xmm4|T_rd_sae); vcvttsd2si(ecx, xmm25|T_sae); vcvttsd2si(r12, xmm25|T_sae); vcvttsd2usi(edx, xmm25|T_sae); vcvttsd2usi(rbp, xmm25|T_sae); vcvttss2si(esp, xmm25|T_sae); vcvttss2si(r11, xmm25|T_sae); vcvttss2usi(edi, xmm25|T_sae); vcvttss2usi(r14, xmm25|T_sae);