00cc9309cb
de6e324bdseparate emu thread10d3daf86Roms List improvements95d202f37Let's make the rom list process on a separate thread so the emulator doesnt take ages to load.fc306967fWow the ROM Header was just completely busted. Game list view works nowbad1691eefuck this shit2b59e5f46game list in progressd26417b83remappable inputs in progressac4af8106inpute72abc240update readme430139dc9Qt6 frontend3080d4d45Fix this small bug too08cd13b85Cop0 unused functions do not actually pose a threat (as per manual). They don't do anything, so shall we.61bb4fb44make idle loop detection a little more specific with where the load goesb037de4c3SAZDFsdff12e81e73eneed to figure out why n64-systemtest loops indefinitely at some address that appears to be valid (i think it's me not invalidating the cache properly)204f0e13bidle skipping seems to work!cb8bb634asdkfjlasdf58e5c89c1Fix compilation issue on my machine (no idea)24fb2898eattempting more serious idle skipping214719577Place rsp.Step inside cached interpreter. Gains about 3 more fpsbb97dcc23mmmmm920b77d38wjkhasdfjhkasdf430ccdab4it's a start...4f42a673aCached interpreter plays Mario 64. Start looking into RSP as wellc9a030787idle skipping works!5fbda03cenew idea366637abaIdle skipping... maybe?609fa2fb0Cache instructions implemented but broken lmao. Commented out for nowe140a6d12- Stop using inheritance for CPU, instead use composition. - Introduce KAIZEN_JIT_ENABLED optional define instead of relying on __aarch64__ and the like. - More cache work68e613057prep cache impl811b4d809fix clang formatfda755f7didkd5024ebbfsmall MI refactor in preparation of (eventually) implementing the RDRAM interface properly694b45341Merge commit '206dcdedf195fb320913584180edb12c7731e396' as 'external/SDL'206dcdedfSquashed 'external/SDL/' content from commit 4d17b99d0a4d16e1cb4need to update sdl848b19920Fix compilation errordb61b5299Merge commit 'e94a94559f28e49678fbcf72199a5258137b0fe9' as 'external/imgui'e94a94559Squashed 'external/imgui/' content from commit 02e9b8cac52edb3757need to update imguic1a705e86Emulate weird JALR behaviour4b4c32f4bFix exception for "unusable COP1" in 4 instructions i missed accidentally (again)df5828142Bug putting 0s in the log everywheref8b580048Make isviewer a sink to file8241e9735Fix exception for "unusable COP1" in 4 instructions i missed accidentallyb29715f20small changesd9a620bc1make use of my new small utility library0d1aa938eAdd 'external/ircolib/' from commit 'ce3cd726c8df8388d554abf8bb55d55020eb4450'e64eb40b3Fuck git git-subtree-dir: external/ircolib git-subtree-split:de6e324bde
421 lines
11 KiB
C
421 lines
11 KiB
C
/* Bra.c -- Branch converters for RISC code
|
|
2023-04-02 : Igor Pavlov : Public domain */
|
|
|
|
#include "Precomp.h"
|
|
|
|
#include "Bra.h"
|
|
#include "CpuArch.h"
|
|
#include "RotateDefs.h"
|
|
|
|
#if defined(MY_CPU_SIZEOF_POINTER) \
|
|
&& ( MY_CPU_SIZEOF_POINTER == 4 \
|
|
|| MY_CPU_SIZEOF_POINTER == 8)
|
|
#define BR_CONV_USE_OPT_PC_PTR
|
|
#endif
|
|
|
|
#ifdef BR_CONV_USE_OPT_PC_PTR
|
|
#define BR_PC_INIT pc -= (UInt32)(SizeT)p;
|
|
#define BR_PC_GET (pc + (UInt32)(SizeT)p)
|
|
#else
|
|
#define BR_PC_INIT pc += (UInt32)size;
|
|
#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p))
|
|
// #define BR_PC_INIT
|
|
// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data))
|
|
#endif
|
|
|
|
#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
|
|
// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;
|
|
|
|
#define Z7_BRANCH_CONV(name) z7_BranchConv_ ## name
|
|
|
|
#define Z7_BRANCH_FUNC_MAIN(name) \
|
|
static \
|
|
Z7_FORCE_INLINE \
|
|
Z7_ATTRIB_NO_VECTOR \
|
|
Byte *Z7_BRANCH_CONV(name)(Byte *p, SizeT size, UInt32 pc, int encoding)
|
|
|
|
#define Z7_BRANCH_FUNC_IMP(name, m, encoding) \
|
|
Z7_NO_INLINE \
|
|
Z7_ATTRIB_NO_VECTOR \
|
|
Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \
|
|
{ return Z7_BRANCH_CONV(name)(data, size, pc, encoding); } \
|
|
|
|
#ifdef Z7_EXTRACT_ONLY
|
|
#define Z7_BRANCH_FUNCS_IMP(name) \
|
|
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0)
|
|
#else
|
|
#define Z7_BRANCH_FUNCS_IMP(name) \
|
|
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC, 0) \
|
|
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC, 1)
|
|
#endif
|
|
|
|
#if defined(__clang__)
|
|
#define BR_EXTERNAL_FOR
|
|
#define BR_NEXT_ITERATION continue;
|
|
#else
|
|
#define BR_EXTERNAL_FOR for (;;)
|
|
#define BR_NEXT_ITERATION break;
|
|
#endif
|
|
|
|
#if defined(__clang__) && (__clang_major__ >= 8) \
|
|
|| defined(__GNUC__) && (__GNUC__ >= 1000) \
|
|
// GCC is not good for __builtin_expect() here
|
|
/* || defined(_MSC_VER) && (_MSC_VER >= 1920) */
|
|
// #define Z7_unlikely [[unlikely]]
|
|
// #define Z7_LIKELY(x) (__builtin_expect((x), 1))
|
|
#define Z7_UNLIKELY(x) (__builtin_expect((x), 0))
|
|
// #define Z7_likely [[likely]]
|
|
#else
|
|
// #define Z7_LIKELY(x) (x)
|
|
#define Z7_UNLIKELY(x) (x)
|
|
// #define Z7_likely
|
|
#endif
|
|
|
|
|
|
Z7_BRANCH_FUNC_MAIN(ARM64)
|
|
{
|
|
// Byte *p = data;
|
|
const Byte *lim;
|
|
const UInt32 flag = (UInt32)1 << (24 - 4);
|
|
const UInt32 mask = ((UInt32)1 << 24) - (flag << 1);
|
|
size &= ~(SizeT)3;
|
|
// if (size == 0) return p;
|
|
lim = p + size;
|
|
BR_PC_INIT
|
|
pc -= 4; // because (p) will point to next instruction
|
|
|
|
BR_EXTERNAL_FOR
|
|
{
|
|
// Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
|
|
for (;;)
|
|
{
|
|
UInt32 v;
|
|
if Z7_UNLIKELY(p == lim)
|
|
return p;
|
|
v = GetUi32a(p);
|
|
p += 4;
|
|
if Z7_UNLIKELY(((v - 0x94000000) & 0xfc000000) == 0)
|
|
{
|
|
UInt32 c = BR_PC_GET >> 2;
|
|
BR_CONVERT_VAL(v, c)
|
|
v &= 0x03ffffff;
|
|
v |= 0x94000000;
|
|
SetUi32a(p - 4, v)
|
|
BR_NEXT_ITERATION
|
|
}
|
|
// v = rotlFixed(v, 8); v += (flag << 8) - 0x90; if Z7_UNLIKELY((v & ((mask << 8) + 0x9f)) == 0)
|
|
v -= 0x90000000; if Z7_UNLIKELY((v & 0x9f000000) == 0)
|
|
{
|
|
UInt32 z, c;
|
|
// v = rotrFixed(v, 8);
|
|
v += flag; if Z7_UNLIKELY(v & mask) continue;
|
|
z = (v & 0xffffffe0) | (v >> 26);
|
|
c = (BR_PC_GET >> (12 - 3)) & ~(UInt32)7;
|
|
BR_CONVERT_VAL(z, c)
|
|
v &= 0x1f;
|
|
v |= 0x90000000;
|
|
v |= z << 26;
|
|
v |= 0x00ffffe0 & ((z & (((flag << 1) - 1))) - flag);
|
|
SetUi32a(p - 4, v)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
Z7_BRANCH_FUNCS_IMP(ARM64)
|
|
|
|
|
|
Z7_BRANCH_FUNC_MAIN(ARM)
|
|
{
|
|
// Byte *p = data;
|
|
const Byte *lim;
|
|
size &= ~(SizeT)3;
|
|
lim = p + size;
|
|
BR_PC_INIT
|
|
/* in ARM: branch offset is relative to the +2 instructions from current instruction.
|
|
(p) will point to next instruction */
|
|
pc += 8 - 4;
|
|
|
|
for (;;)
|
|
{
|
|
for (;;)
|
|
{
|
|
if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break;
|
|
if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break;
|
|
}
|
|
{
|
|
UInt32 v = GetUi32a(p - 4);
|
|
UInt32 c = BR_PC_GET >> 2;
|
|
BR_CONVERT_VAL(v, c)
|
|
v &= 0x00ffffff;
|
|
v |= 0xeb000000;
|
|
SetUi32a(p - 4, v)
|
|
}
|
|
}
|
|
}
|
|
Z7_BRANCH_FUNCS_IMP(ARM)
|
|
|
|
|
|
Z7_BRANCH_FUNC_MAIN(PPC)
|
|
{
|
|
// Byte *p = data;
|
|
const Byte *lim;
|
|
size &= ~(SizeT)3;
|
|
lim = p + size;
|
|
BR_PC_INIT
|
|
pc -= 4; // because (p) will point to next instruction
|
|
|
|
for (;;)
|
|
{
|
|
UInt32 v;
|
|
for (;;)
|
|
{
|
|
if Z7_UNLIKELY(p == lim)
|
|
return p;
|
|
// v = GetBe32a(p);
|
|
v = *(UInt32 *)(void *)p;
|
|
p += 4;
|
|
// if ((v & 0xfc000003) == 0x48000001) break;
|
|
// if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) break;
|
|
if Z7_UNLIKELY(
|
|
((v - Z7_CONV_BE_TO_NATIVE_CONST32(0x48000001))
|
|
& Z7_CONV_BE_TO_NATIVE_CONST32(0xfc000003)) == 0) break;
|
|
}
|
|
{
|
|
v = Z7_CONV_NATIVE_TO_BE_32(v);
|
|
{
|
|
UInt32 c = BR_PC_GET;
|
|
BR_CONVERT_VAL(v, c)
|
|
}
|
|
v &= 0x03ffffff;
|
|
v |= 0x48000000;
|
|
SetBe32a(p - 4, v)
|
|
}
|
|
}
|
|
}
|
|
Z7_BRANCH_FUNCS_IMP(PPC)
|
|
|
|
|
|
#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED
|
|
#define BR_SPARC_USE_ROTATE
|
|
#endif
|
|
|
|
Z7_BRANCH_FUNC_MAIN(SPARC)
|
|
{
|
|
// Byte *p = data;
|
|
const Byte *lim;
|
|
const UInt32 flag = (UInt32)1 << 22;
|
|
size &= ~(SizeT)3;
|
|
lim = p + size;
|
|
BR_PC_INIT
|
|
pc -= 4; // because (p) will point to next instruction
|
|
for (;;)
|
|
{
|
|
UInt32 v;
|
|
for (;;)
|
|
{
|
|
if Z7_UNLIKELY(p == lim)
|
|
return p;
|
|
/* // the code without GetBe32a():
|
|
{ const UInt32 v = GetUi16a(p) & 0xc0ff; p += 4; if (v == 0x40 || v == 0xc07f) break; }
|
|
*/
|
|
v = GetBe32a(p);
|
|
p += 4;
|
|
#ifdef BR_SPARC_USE_ROTATE
|
|
v = rotlFixed(v, 2);
|
|
v += (flag << 2) - 1;
|
|
if Z7_UNLIKELY((v & (3 - (flag << 3))) == 0)
|
|
#else
|
|
v += (UInt32)5 << 29;
|
|
v ^= (UInt32)7 << 29;
|
|
v += flag;
|
|
if Z7_UNLIKELY((v & (0 - (flag << 1))) == 0)
|
|
#endif
|
|
break;
|
|
}
|
|
{
|
|
// UInt32 v = GetBe32a(p - 4);
|
|
#ifndef BR_SPARC_USE_ROTATE
|
|
v <<= 2;
|
|
#endif
|
|
{
|
|
UInt32 c = BR_PC_GET;
|
|
BR_CONVERT_VAL(v, c)
|
|
}
|
|
v &= (flag << 3) - 1;
|
|
#ifdef BR_SPARC_USE_ROTATE
|
|
v -= (flag << 2) - 1;
|
|
v = rotrFixed(v, 2);
|
|
#else
|
|
v -= (flag << 2);
|
|
v >>= 2;
|
|
v |= (UInt32)1 << 30;
|
|
#endif
|
|
SetBe32a(p - 4, v)
|
|
}
|
|
}
|
|
}
|
|
Z7_BRANCH_FUNCS_IMP(SPARC)
|
|
|
|
|
|
Z7_BRANCH_FUNC_MAIN(ARMT)
|
|
{
|
|
// Byte *p = data;
|
|
Byte *lim;
|
|
size &= ~(SizeT)1;
|
|
// if (size == 0) return p;
|
|
if (size <= 2) return p;
|
|
size -= 2;
|
|
lim = p + size;
|
|
BR_PC_INIT
|
|
/* in ARM: branch offset is relative to the +2 instructions from current instruction.
|
|
(p) will point to the +2 instructions from current instruction */
|
|
// pc += 4 - 4;
|
|
// if (encoding) pc -= 0xf800 << 1; else pc += 0xf800 << 1;
|
|
// #define ARMT_TAIL_PROC { goto armt_tail; }
|
|
#define ARMT_TAIL_PROC { return p; }
|
|
|
|
do
|
|
{
|
|
/* in MSVC 32-bit x86 compilers:
|
|
UInt32 version : it loads value from memory with movzx
|
|
Byte version : it loads value to 8-bit register (AL/CL)
|
|
movzx version is slightly faster in some cpus
|
|
*/
|
|
unsigned b1;
|
|
// Byte / unsigned
|
|
b1 = p[1];
|
|
// optimized version to reduce one (p >= lim) check:
|
|
// unsigned a1 = p[1]; b1 = p[3]; p += 2; if Z7_LIKELY((b1 & (a1 ^ 8)) < 0xf8)
|
|
for (;;)
|
|
{
|
|
unsigned b3; // Byte / UInt32
|
|
/* (Byte)(b3) normalization can use low byte computations in MSVC.
|
|
It gives smaller code, and no loss of speed in some compilers/cpus.
|
|
But new MSVC 32-bit x86 compilers use more slow load
|
|
from memory to low byte register in that case.
|
|
So we try to use full 32-bit computations for faster code.
|
|
*/
|
|
// if (p >= lim) { ARMT_TAIL_PROC } b3 = b1 + 8; b1 = p[3]; p += 2; if ((b3 & b1) >= 0xf8) break;
|
|
if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b3 = p[3]; p += 2; if Z7_UNLIKELY((b3 & (b1 ^ 8)) >= 0xf8) break;
|
|
if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b1 = p[3]; p += 2; if Z7_UNLIKELY((b1 & (b3 ^ 8)) >= 0xf8) break;
|
|
}
|
|
{
|
|
/* we can adjust pc for (0xf800) to rid of (& 0x7FF) operation.
|
|
But gcc/clang for arm64 can use bfi instruction for full code here */
|
|
UInt32 v =
|
|
((UInt32)GetUi16a(p - 2) << 11) |
|
|
((UInt32)GetUi16a(p) & 0x7FF);
|
|
/*
|
|
UInt32 v =
|
|
((UInt32)p[1 - 2] << 19)
|
|
+ (((UInt32)p[1] & 0x7) << 8)
|
|
+ (((UInt32)p[-2] << 11))
|
|
+ (p[0]);
|
|
*/
|
|
p += 2;
|
|
{
|
|
UInt32 c = BR_PC_GET >> 1;
|
|
BR_CONVERT_VAL(v, c)
|
|
}
|
|
SetUi16a(p - 4, (UInt16)(((v >> 11) & 0x7ff) | 0xf000))
|
|
SetUi16a(p - 2, (UInt16)(v | 0xf800))
|
|
/*
|
|
p[-4] = (Byte)(v >> 11);
|
|
p[-3] = (Byte)(0xf0 | ((v >> 19) & 0x7));
|
|
p[-2] = (Byte)v;
|
|
p[-1] = (Byte)(0xf8 | (v >> 8));
|
|
*/
|
|
}
|
|
}
|
|
while (p < lim);
|
|
return p;
|
|
// armt_tail:
|
|
// if ((Byte)((lim[1] & 0xf8)) != 0xf0) { lim += 2; } return lim;
|
|
// return (Byte *)(lim + ((Byte)((lim[1] ^ 0xf0) & 0xf8) == 0 ? 0 : 2));
|
|
// return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2));
|
|
// return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2));
|
|
}
|
|
Z7_BRANCH_FUNCS_IMP(ARMT)
|
|
|
|
|
|
// #define BR_IA64_NO_INLINE
|
|
|
|
Z7_BRANCH_FUNC_MAIN(IA64)
|
|
{
|
|
// Byte *p = data;
|
|
const Byte *lim;
|
|
size &= ~(SizeT)15;
|
|
lim = p + size;
|
|
pc -= 1 << 4;
|
|
pc >>= 4 - 1;
|
|
// pc -= 1 << 1;
|
|
|
|
for (;;)
|
|
{
|
|
unsigned m;
|
|
for (;;)
|
|
{
|
|
if Z7_UNLIKELY(p == lim)
|
|
return p;
|
|
m = (unsigned)((UInt32)0x334b0000 >> (*p & 0x1e));
|
|
p += 16;
|
|
pc += 1 << 1;
|
|
if (m &= 3)
|
|
break;
|
|
}
|
|
{
|
|
p += (ptrdiff_t)m * 5 - 20; // negative value is expected here.
|
|
do
|
|
{
|
|
const UInt32 t =
|
|
#if defined(MY_CPU_X86_OR_AMD64)
|
|
// we use 32-bit load here to reduce code size on x86:
|
|
GetUi32(p);
|
|
#else
|
|
GetUi16(p);
|
|
#endif
|
|
UInt32 z = GetUi32(p + 1) >> m;
|
|
p += 5;
|
|
if (((t >> m) & (0x70 << 1)) == 0
|
|
&& ((z - (0x5000000 << 1)) & (0xf000000 << 1)) == 0)
|
|
{
|
|
UInt32 v = (UInt32)((0x8fffff << 1) | 1) & z;
|
|
z ^= v;
|
|
#ifdef BR_IA64_NO_INLINE
|
|
v |= (v & ((UInt32)1 << (23 + 1))) >> 3;
|
|
{
|
|
UInt32 c = pc;
|
|
BR_CONVERT_VAL(v, c)
|
|
}
|
|
v &= (0x1fffff << 1) | 1;
|
|
#else
|
|
{
|
|
if (encoding)
|
|
{
|
|
// pc &= ~(0xc00000 << 1); // we just need to clear at least 2 bits
|
|
pc &= (0x1fffff << 1) | 1;
|
|
v += pc;
|
|
}
|
|
else
|
|
{
|
|
// pc |= 0xc00000 << 1; // we need to set at least 2 bits
|
|
pc |= ~(UInt32)((0x1fffff << 1) | 1);
|
|
v -= pc;
|
|
}
|
|
}
|
|
v &= ~(UInt32)(0x600000 << 1);
|
|
#endif
|
|
v += (0x700000 << 1);
|
|
v &= (0x8fffff << 1) | 1;
|
|
z |= v;
|
|
z <<= m;
|
|
SetUi32(p + 1 - 5, z)
|
|
}
|
|
m++;
|
|
}
|
|
while (m &= 3); // while (m < 4);
|
|
}
|
|
}
|
|
}
|
|
Z7_BRANCH_FUNCS_IMP(IA64)
|