/* Copyright (c) 2020 Themaister
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef TEXTURE_H_
#define TEXTURE_H_

#include "data_structures.h"

// Texture format identifiers. sample_texture() compares tile.fmt against these values
// (they are used directly as switch case labels, so they must remain compile-time constants).
const int TEXTURE_FORMAT_RGBA = 0;
const int TEXTURE_FORMAT_YUV = 1;
const int TEXTURE_FORMAT_CI = 2;
const int TEXTURE_FORMAT_IA = 3;
const int TEXTURE_FORMAT_I = 4;

// Applies the tile's S-axis mask (wrap) and optional mirroring to an integer texel coordinate.
// Returns s unchanged when tile.mask_s is zero.
int texel_mask_s(TileInfo tile, int s)
{
	// A zero mask disables both wrapping and mirroring.
	if (tile.mask_s == 0)
		return s;

	int wrap_bit = 1 << tile.mask_s;
	bool mirror = (tile.flags & TILE_INFO_MIRROR_S_BIT) != 0;

	if (mirror)
	{
		// When the wrap bit is set this XORs with (wrap_bit - 1), inverting the low bits.
		// When it is clear the operand is -1, which max() turns into 0 (no change).
		s ^= max((s & wrap_bit) - 1, 0);
	}

	// Keep only the bits below the wrap bit.
	return s & (wrap_bit - 1);
}

// Two-wide variant of texel_mask_s(): masks / mirrors the coordinate pair (s, s + 1) in one go.
ivec2 texel_mask_s_copy(TileInfo tile, int s)
{
	ivec2 pair = ivec2(s, s + 1);

	// A zero mask disables both wrapping and mirroring.
	if (tile.mask_s == 0)
		return pair;

	int wrap_bit = 1 << tile.mask_s;

	// Mirroring inverts the low bits of every component whose wrap bit is set.
	if ((tile.flags & TILE_INFO_MIRROR_S_BIT) != 0)
		pair ^= max((pair & wrap_bit) - 1, 0);

	pair &= wrap_bit - 1;
	return pair;
}

// Applies the tile's T-axis mask (wrap) and optional mirroring to an integer texel coordinate.
// Returns t unchanged when tile.mask_t is zero.
int texel_mask_t(TileInfo tile, int t)
{
	if (tile.mask_t == 0)
		return t;

	int wrap_bit = 1 << tile.mask_t;
	int low_mask = wrap_bit - 1;

	if ((tile.flags & TILE_INFO_MIRROR_T_BIT) != 0)
	{
		// XOR with low_mask when the wrap bit is set, with 0 otherwise (the -1 is clamped by max()).
		int flip = max((t & wrap_bit) - 1, 0);
		t ^= flip;
	}

	return t & low_mask;
}

// Expands a 16-bit 5/5/5/1 word into four 8-bit-range channels.
// R, G and B sit at bits 11, 6 and 1 respectively; bit 0 is alpha.
i16x4 convert_rgba16(uint word)
{
	uvec3 rgb5 = uvec3(word >> 11u, word >> 6u, word >> 1u) & 31u;

	// 5 -> 8 bit expansion by replicating the top three bits into the low bits.
	uvec3 rgb8 = (rgb5 << 3u) | (rgb5 >> 2u);

	// The single alpha bit becomes either 0 or 0xff.
	uint a8 = (word & 1u) != 0u ? 0xffu : 0u;

	return i16x4(rgb8, a8);
}

// Splits a 16-bit word into intensity (everything above bit 7) and alpha (low byte).
// Intensity is broadcast to the first three channels.
i16x4 convert_ia16(uint word)
{
	uint i8 = word >> 8;
	uint a8 = word & 0xff;
	return i16x4(uvec3(i8), a8);
}

// Fetches a 4-bit texel from TMEM and broadcasts it, expanded to 8 bits, to all four channels.
i16x4 sample_texel_rgba4(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// Two 4-bit texels share one byte. The byte address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + (st.x >> 1)) & 0xfff;

	// Odd rows toggle bit 2 of the address; the ^ 3 reverses byte order within each group of four.
	uint tmem_index = (addr ^ ((st.y & 1) << 2)) ^ 3;
	uint packed = uint(tmem8.instances[tmem_instance].elems[tmem_index]);

	// Even s picks the high nibble, odd s the low nibble.
	uint nibble_shift = (~st.x & 1) * 4;
	uint nibble = (packed >> nibble_shift) & 0xf;

	// Replicate the nibble into both halves of the byte.
	return i16x4(nibble | (nibble << 4));
}

// Fetches a 4-bit texel from TMEM and decodes it as 3 bits of intensity plus 1 bit of alpha.
i16x4 sample_texel_ia4(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// Two 4-bit texels share one byte. The byte address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + (st.x >> 1)) & 0xfff;

	// Odd rows toggle bit 2 of the address; the ^ 3 reverses byte order within each group of four.
	uint tmem_index = (addr ^ ((st.y & 1) << 2)) ^ 3;
	uint packed = uint(tmem8.instances[tmem_instance].elems[tmem_index]);

	// Even s picks the high nibble, odd s the low nibble.
	uint nibble_shift = (~st.x & 1) * 4;
	uint nibble = (packed >> nibble_shift) & 0xf;

	// Top three bits of the nibble are intensity, repeated to fill 8 bits.
	uint i3 = nibble & 0xe;
	uint i8 = (i3 << 4) | (i3 << 1) | (i3 >> 2);

	// Bottom bit is alpha, expanded to 0 or 0xff.
	uint a8 = (nibble & 1) * 0xff;

	return i16x4(i8, i8, i8, a8);
}

// Fetches a 4-bit texel from TMEM without a TLUT lookup.
// The result is (pal << 4) | nibble, broadcast to all four channels.
i16x4 sample_texel_ci4(TileInfo tile, uint tmem_instance, uvec2 st, uint pal)
{
	// Two 4-bit texels share one byte. The byte address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + (st.x >> 1)) & 0xfff;

	// Odd rows toggle bit 2 of the address; the ^ 3 reverses byte order within each group of four.
	uint tmem_index = (addr ^ ((st.y & 1) << 2)) ^ 3;
	uint packed = uint(tmem8.instances[tmem_instance].elems[tmem_index]);

	// Even s picks the high nibble, odd s the low nibble.
	uint nibble_shift = (~st.x & 1) * 4;
	uint nibble = (packed >> nibble_shift) & 0xf;

	// The palette number supplies the bits above the nibble.
	return i16x4(nibble | (pal << 4));
}

// Fetches a 4-bit index from the lower part of TMEM, combines it with the palette number and
// resolves it through the TLUT stored at 16-bit word index 0x400 and above.
// tlut_type selects IA16 (true) or RGBA16 (false) decoding of the looked-up word.
i16x4 sample_texel_ci4_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint pal, uint lut_offset, uint addr_xor, bool tlut_type)
{
	// Texel byte address is wrapped to 11 bits here (not 12 as in the non-TLUT samplers).
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + (st.x >> 1)) & 0x7ff;

	// Odd rows toggle bit 2 of the address; the ^ 3 reverses byte order within each group of four.
	uint tmem_index = (addr ^ ((st.y & 1) << 2)) ^ 3;
	uint packed = uint(tmem8.instances[tmem_instance].elems[tmem_index]);

	// Even s picks the high nibble, odd s the low nibble.
	uint nibble_shift = (~st.x & 1) * 4;
	uint color_index = ((packed >> nibble_shift) & 0xf) | (pal << 4);

	// Index is scaled by four; lut_offset and addr_xor then adjust which 16-bit slot is read.
	uint lut_index = ((color_index << 2) + lut_offset) ^ addr_xor;
	uint color = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_index]);

	if (tlut_type)
		return convert_ia16(color);
	return convert_rgba16(color);
}

// Fetches an 8-bit index from the lower part of TMEM and resolves it through the TLUT stored
// at 16-bit word index 0x400 and above.
// tlut_type selects IA16 (true) or RGBA16 (false) decoding of the looked-up word.
i16x4 sample_texel_ci8_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint lut_offset, uint addr_xor, bool tlut_type)
{
	// One byte per texel; address is wrapped to 11 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x) & 0x7ff;

	// Odd rows toggle bit 2 of the address; the ^ 3 reverses byte order within each group of four.
	uint tmem_index = (addr ^ ((st.y & 1) << 2)) ^ 3;
	uint color_index = uint(tmem8.instances[tmem_instance].elems[tmem_index]);

	// Index is scaled by four; lut_offset and addr_xor then adjust which 16-bit slot is read.
	uint lut_index = ((color_index << 2) + lut_offset) ^ addr_xor;
	uint color = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_index]);

	if (tlut_type)
		return convert_ia16(color);
	return convert_rgba16(color);
}

// Fetches one 16-bit word from TMEM and returns (high byte, low byte, high byte, low byte).
i16x4 sample_texel_ci32(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// Two bytes per texel; byte address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x * 2) & 0xfff;

	// Convert to a 16-bit word index. Odd rows toggle bit 1; the ^ 1 swaps adjacent words.
	uint tmem_index = ((addr >> 1) ^ ((st.y & 1) << 1)) ^ 1;
	uint word = uint(tmem16.instances[tmem_instance].elems[tmem_index]);

	i16x2 hi_lo = i16x2(word >> 8, word & 0xff);
	return i16x4(hi_lo, hi_lo);
}

// Fetches one 16-bit word from the lower part of TMEM, uses its upper bits as a TLUT index and
// resolves it through the TLUT stored at 16-bit word index 0x400 and above.
// tlut_type selects IA16 (true) or RGBA16 (false) decoding of the looked-up word.
i16x4 sample_texel_ci32_tlut(TileInfo tile, uint tmem_instance, uvec2 st, uint lut_offset, uint addr_xor, bool tlut_type)
{
	// Two bytes per texel; byte address is wrapped to 11 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x * 2) & 0x7ff;

	// Convert to a 16-bit word index. Odd rows toggle bit 1; the ^ 1 swaps adjacent words.
	uint tmem_index = ((addr >> 1) ^ ((st.y & 1) << 1)) ^ 1;
	uint texel_word = uint(tmem16.instances[tmem_instance].elems[tmem_index]);

	// (word >> 6) & ~3 is the word's high byte already scaled by four.
	uint lut_index = (((texel_word >> 6) & ~3) + lut_offset) ^ addr_xor;
	uint color = uint(tmem16.instances[tmem_instance].elems[0x400 | lut_index]);

	if (tlut_type)
		return convert_ia16(color);
	return convert_rgba16(color);
}

// Fetches one byte from TMEM and broadcasts it to all four channels.
i16x4 sample_texel_rgba8(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// One byte per texel; address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x) & 0xfff;

	// Odd rows toggle bit 2 of the address; the ^ 3 reverses byte order within each group of four.
	uint tmem_index = (addr ^ ((st.y & 1) << 2)) ^ 3;

	uint value = uint(tmem8.instances[tmem_instance].elems[tmem_index]);
	return i16x4(value);
}

// Fetches one byte from TMEM and decodes it as 4 bits of intensity (high) plus 4 bits of alpha (low).
// Both nibbles are expanded by replicating them into the upper half of the byte.
i16x4 sample_texel_ia8(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// One byte per texel; address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x) & 0xfff;

	// Odd rows toggle bit 2 of the address; the ^ 3 reverses byte order within each group of four.
	uint tmem_index = (addr ^ ((st.y & 1) << 2)) ^ 3;
	uint packed = uint(tmem8.instances[tmem_instance].elems[tmem_index]);

	uint i4 = packed >> 4;
	uint a4 = packed & 0xf;

	uint i8 = i4 | (i4 << 4);
	uint a8 = a4 | (a4 << 4);

	return i16x4(i8, i8, i8, a8);
}

// Fetches a YUV texel. Luma is read as a byte from the upper part of TMEM (byte index | 0x800),
// chroma as a 16-bit word from the lower part, addressed by chroma_x rather than st.x.
// Returns (u - 0x80, v - 0x80, luma, luma).
i16x4 sample_texel_yuv16(TileInfo tile, uint tmem_instance, uvec2 st, uint chroma_x)
{
	uint row_base = tile.offset + tile.stride * st.y;
	uint row_odd = st.y & 1;

	// Luma: one byte per texel, wrapped to 11 bits. Odd rows toggle bit 2; ^ 3 reverses byte order in groups of four.
	uint luma_index = (((row_base + st.x) & 0x7ff) ^ (row_odd << 2)) ^ 3;

	// Chroma: two bytes per entry, wrapped to 11 bits, then turned into a 16-bit word index.
	// Odd rows toggle bit 1; ^ 1 swaps adjacent words.
	uint chroma_index = ((((row_base + chroma_x * 2) & 0x7ff) >> 1) ^ (row_odd << 1)) ^ 1;

	u8 luma = u8(tmem8.instances[tmem_instance].elems[luma_index | 0x800]);
	u16 uv = u16(tmem16.instances[tmem_instance].elems[chroma_index]);

	// High byte is U, low byte is V.
	u8 u = u8((uv >> U16_C(8)) & U16_C(0xff));
	u8 v = u8(uv & U16_C(0xff));

	// Chroma is re-centred around zero.
	return i16x4(i16(u) - I16_C(0x80), i16(v) - I16_C(0x80), luma, luma);
}

// Fetches one 16-bit word from TMEM and decodes it with convert_rgba16().
i16x4 sample_texel_rgba16(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// Two bytes per texel; byte address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x * 2) & 0xfff;

	// Convert to a 16-bit word index. Odd rows toggle bit 1; the ^ 1 swaps adjacent words.
	uint tmem_index = ((addr >> 1) ^ ((st.y & 1) << 1)) ^ 1;

	uint texel_word = uint(tmem16.instances[tmem_instance].elems[tmem_index]);
	return convert_rgba16(texel_word);
}

// Fetches one 16-bit word from TMEM and decodes it with convert_ia16().
i16x4 sample_texel_ia16(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// Two bytes per texel; byte address is wrapped to 12 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x * 2) & 0xfff;

	// Convert to a 16-bit word index. Odd rows toggle bit 1; the ^ 1 swaps adjacent words.
	uint tmem_index = ((addr >> 1) ^ ((st.y & 1) << 1)) ^ 1;

	uint texel_word = uint(tmem16.instances[tmem_instance].elems[tmem_index]);
	return convert_ia16(texel_word);
}

// Fetches a 32-bit texel that is split across TMEM: the first two channels come from the word
// at the computed index, the last two from the word at the same index with bit 0x400 set.
i16x4 sample_texel_rgba32(TileInfo tile, uint tmem_instance, uvec2 st)
{
	// Two bytes per texel per half; byte address is wrapped to 11 bits.
	uint addr = tile.offset + tile.stride * st.y;
	addr = (addr + st.x * 2) & 0x7ff;

	// Convert to a 16-bit word index. Odd rows toggle bit 1; the ^ 1 swaps adjacent words.
	uint tmem_index = ((addr >> 1) ^ ((st.y & 1) << 1)) ^ 1;

	uint rg = uint(tmem16.instances[tmem_instance].elems[tmem_index]);
	uint ba = uint(tmem16.instances[tmem_instance].elems[tmem_index | 0x400]);

	return i16x4(rg >> 8, rg & 0xff, ba >> 8, ba & 0xff);
}

// Saturates a texture coordinate to 16 bits, applies the tile shift, then makes it relative to lo.
// With clamp_bit set the result is additionally clamped to the [lo, hi] tile extent.
// lo and hi are shifted left by 3 before being compared with / subtracted from coord.
int clamp_and_shift_coord(bool clamp_bit, int coord, int lo, int hi, int shift)
{
	// Clamp 17-bit coordinate to 16-bit coordinate here.
	int shifted = clamp(coord, -0x8000, 0x7fff);

	if (shift >= 11)
	{
		// Shift values of 11 and up act as a left shift by (16 - shift), sign-wrapped to 16 bits.
		shifted <<= (32 - shift);
		shifted >>= 16;
	}
	else
		shifted >>= shift;

	// Without clamping the coordinate is simply rebased on lo.
	if (!clamp_bit)
		return shifted - (lo << 3);

	// At or beyond the high edge: pin to the tile extent, derived from hi and lo with their low 2 bits dropped.
	if ((shifted >> 3) >= hi)
		return (((hi >> 2) - (lo >> 2)) & 0x3ff) << 5;

	// Otherwise rebase on lo and never go below zero.
	return max(shifted - (lo << 3), 0);
}

// Saturates a texture coordinate to 16 bits, applies the tile shift and rebases it on lo.
// Same as clamp_and_shift_coord() with clamping disabled.
int shift_coord(int coord, int lo, int shift)
{
	// Clamp 17-bit coordinate to 16-bit coordinate here.
	int shifted = clamp(coord, -0x8000, 0x7fff);

	if (shift >= 11)
	{
		// Shift values of 11 and up act as a left shift by (16 - shift), sign-wrapped to 16 bits.
		shifted <<= (32 - shift);
		shifted >>= 16;
	}
	else
		shifted >>= shift;

	return shifted - (lo << 3);
}

// The copy pipe reads 4x16 words.
// Fetches one 16-bit result word for the copy pipe.
//  tile          - tile descriptor (offset, stride, size, palette, mask/mirror state).
//  tmem_instance - which TMEM instance to read from.
//  st            - integer texel coordinate (already shifted by the caller).
//  s_offset      - which word of the copy span is requested; added to st.x (doubled on the
//                  8-bpp replication path, see below).
//  tlut          - when true, the fetched texel is used as an index into the TLUT at word 0x400+.
//  tlut_type     - not referenced by this function.
// Returns the sampled 16-bit value as an int.
int sample_texture_copy_word(TileInfo tile, uint tmem_instance, ivec2 st, int s_offset, bool tlut, bool tlut_type)
{
	// For non-16bpp TMEM, the lower 32-bits are sampled based on direct 16-bit fetches. There are no shifts applied.
	bool high_word = s_offset < 2;
	// The replication path is taken for the first two words of any non-16bpp, non-TLUT tile.
	bool replicate_8bpp = high_word && tile.size != 2 && !tlut;
	int samp;

	// Converts an S coordinate into nibble units: << 0 for size 0, << 1 for size 1, << 2 for sizes 2 and 3.
	int s_shamt = min(int(tile.size), 2);
	bool large_texel = int(tile.size) == 3;
	// Size 3 and TLUT fetches are restricted to the lower 0x400 16-bit words; everything else may use all 0x800.
	int idx_mask = (large_texel || tlut) ? 0x3ff : 0x7ff;

	if (replicate_8bpp)
	{
		// The high word of 8-bpp replication is special in the sense that we sample 8-bpp correctly.
		// Sample the two possible words.
		st.x += 2 * s_offset;
		// s holds the masked coordinates for st.x and st.x + 1.
		ivec2 s = texel_mask_s_copy(tile, st.x);
		int t = texel_mask_t(tile, st.y);

		// Row base in bytes, doubled to nibbles, plus the S offset in nibbles; wrapped to 13 bits.
		uint tbase = tile.offset + tile.stride * t;
		uvec2 nibble_offset = (tbase * 2 + (s << s_shamt)) & 0x1fffu;
		// Odd rows toggle bit 3 of the nibble offset.
		nibble_offset ^= (t & 1u) * 8u;
		// Four nibbles per 16-bit word.
		uvec2 index = nibble_offset >> 2u;

		index &= idx_mask;
		int samp0 = int(tmem16.instances[tmem_instance].elems[index.x ^ 1]);
		int samp1 = int(tmem16.instances[tmem_instance].elems[index.y ^ 1]);

		if (tile.size == 1)
		{
			// Select the byte inside the 16-bit word: shift by 8 or 0 depending on bit 1 of the nibble offset.
			samp0 >>= 8 - 4 * int(nibble_offset.x & 2);
			samp1 >>= 8 - 4 * int(nibble_offset.y & 2);
			samp0 &= 0xff;
			samp1 &= 0xff;
		}
		else if (tile.size == 0)
		{
			// Select the nibble inside the 16-bit word, then replicate it to 8 bits (x * 0x11).
			samp0 >>= 12 - 4 * int(nibble_offset.x & 3u);
			samp1 >>= 12 - 4 * int(nibble_offset.y & 3u);
			samp0 = (samp0 & 0xf) * 0x11;
			samp1 = (samp1 & 0xf) * 0x11;
		}
		else
		{
			// Remaining case (size 2 is excluded by replicate_8bpp): keep the upper byte of each word.
			samp0 >>= 8;
			samp1 >>= 8;
		}

		// Pack the two 8-bit samples into one 16-bit result, first sample in the high byte.
		samp = (samp0 << 8) | samp1;
	}
	else
	{
		st.x += s_offset;
		int s = texel_mask_s(tile, st.x);
		int t = texel_mask_t(tile, st.y);

		// Same addressing as above, for a single coordinate.
		uint tbase = tile.offset + tile.stride * t;
		uint nibble_offset = (tbase * 2 + (s << s_shamt)) & 0x1fffu;
		nibble_offset ^= (t & 1u) * 8u;

		uint index = nibble_offset >> 2u;
		index &= idx_mask;
		samp = int(tmem16.instances[tmem_instance].elems[index ^ 1]);

		if (tlut)
		{
			if (tile.size == 0)
			{
				// Extract the 4-bit index, prepend the palette number, scale by four and add s_offset.
				samp >>= 12 - 4 * (nibble_offset & 3);
				samp &= 0xf;
				samp |= tile.palette << 4;
				samp <<= 2;
				samp += s_offset;
			}
			else
			{
				// Extract the 8-bit index, scale by four and add s_offset.
				samp >>= 8 - 4 * (nibble_offset & 2);
				samp &= 0xff;
				samp <<= 2;
				samp += s_offset;
			}
			// Resolve through the TLUT, which starts at 16-bit word index 0x400.
			samp = int(tmem16.instances[tmem_instance].elems[(samp | 0x400) ^ 1]);
		}
	}

	return samp;
}

// Copy-pipe texture fetch. Converts st to an integer texel coordinate and returns the value for
// span position s_offset, sized according to global_constants.fb_info.fb_size:
//  0 -> always 0, 1 -> one byte taken from a 16-bit word, anything else -> a full 16-bit word.
int sample_texture_copy(TileInfo tile, uint tmem_instance, ivec2 st, int s_offset, bool tlut, bool tlut_type)
{
	// Shift and rebase each axis, then drop the 5 fractional bits.
	ivec2 texel = ivec2(shift_coord(st.x, int(tile.slo), int(tile.shift_s)),
	                    shift_coord(st.y, int(tile.tlo), int(tile.shift_t))) >> 5;

	if (global_constants.fb_info.fb_size == 0)
		return 0;

	if (global_constants.fb_info.fb_size == 1)
	{
		// Two bytes per fetched word: s_offset >> 1 picks the word, bit 0 picks the byte (even = high byte).
		int word = sample_texture_copy_word(tile, tmem_instance, texel, s_offset >> 1, tlut, tlut_type);
		int byte_shift = 8 - 8 * (s_offset & 1);
		return (word >> byte_shift) & 0xff;
	}

	return sample_texture_copy_word(tile, tmem_instance, texel, s_offset, tlut, tlut_type);
}

// Three-tap "bilinear" filter over two channels. frac holds 5-bit fractions (0..31 per axis).
// When frac.x + frac.y >= 32 the interpolation is anchored at t11 with the fractions swapped
// and inverted; otherwise it is anchored at t00.
i16x2 bilinear_3tap(i16x2 t00, i16x2 t10, i16x2 t01, i16x2 t11, ivec2 frac)
{
	bool upper_half = (frac.x + frac.y) >= 32;

	i16x2 origin;
	i16x2 weights;
	if (upper_half)
	{
		origin = t11;
		weights = i16x2(32 - frac.yx);
	}
	else
	{
		origin = t00;
		weights = i16x2(frac);
	}

	// Weighted deltas towards the two neighbours, plus 0x10 so the >> 5 rounds to nearest.
	i16x2 blended = (t10 - origin) * weights.x + (t01 - origin) * weights.y + I16_C(0x10);
	return (blended >> I16_C(5)) + origin;
}

// Texture conversion step. Each input channel is first sign-extended from its low 9 bits.
// The third channel is passed through as the base of every output channel (and as output alpha);
// the first three outputs add a rounded, >> 8 scaled combination of the first two channels:
//   out.r = b + (factors.x * g) / 256
//   out.g = b + (factors.y * r + factors.z * g) / 256
//   out.b = b + (factors.w * r) / 256
i16x4 texture_convert_factors(i16x4 texel_in, i16x4 factors)
{
	ivec4 t = bitfieldExtract(ivec4(texel_in), 0, 9);
	ivec4 k = ivec4(factors);

	ivec3 scaled = ivec3(k.x * t.g,
	                     k.y * t.r + k.z * t.g,
	                     k.w * t.r);

	// + 0x80 rounds the >> 8 to nearest.
	ivec3 rgb = t.b + ((scaled + 0x80) >> 8);
	return i16x4(rgb, t.b);
}

// Main texture fetch + filter + conversion routine.
//  tile               - tile descriptor (format, size, offset, stride, palette, clamp/mask/mirror state).
//  tmem_instance      - which TMEM instance to read from.
//  st                 - texture coordinate; after clamp/shift the low 5 bits are the fraction and
//                       (st >> 5) is the integer texel.
//  tlut               - resolve texels through the TLUT (sample_texel_*_tlut helpers).
//  tlut_type          - forwarded to the TLUT helpers; selects IA16 (true) or RGBA16 (false) decoding.
//  sample_quad        - fetch the neighbouring texels (t10, t01, and t11 where needed) as well as the base.
//  mid_texel_state,
//  bilerp             - when both are set and both fractions equal 0x10, a 4-texel average is used.
//                       bilerp also enables the 3-tap filter; when bilerp and convert_one are both
//                       clear the result is passed through texture_convert_factors().
//  convert_one        - derive the result from prev_cycle and texel differences instead of filtering.
//  conversion_factors - factors for texture_convert_factors().
//  prev_cycle         - result of the previous texel cycle; only read when convert_one is set.
// Returns the filtered / converted texel.
// NOTE(review): t_base, t10, t01 and t11 are only written on some flag/format combinations
// (e.g. nothing is fetched for tlut with a YUV tile, or for size 3 in the RGBA/IA switches when no
// case matches). The filter branches further down read them - confirm callers never request a
// combination that reads an unwritten texel.
i16x4 sample_texture(TileInfo tile, uint tmem_instance, ivec2 st, bool tlut, bool tlut_type,
                     bool sample_quad, bool mid_texel_state, bool convert_one, bool bilerp,
                     i16x4 conversion_factors, i16x4 prev_cycle)
{
	st.x = clamp_and_shift_coord((tile.flags & TILE_INFO_CLAMP_S_BIT) != 0, st.x, int(tile.slo), int(tile.shi), int(tile.shift_s));
	st.y = clamp_and_shift_coord((tile.flags & TILE_INFO_CLAMP_T_BIT) != 0, st.y, int(tile.tlo), int(tile.thi), int(tile.shift_t));

	// The 5-bit fraction is only kept when a footprint is sampled (quad or TLUT); otherwise it is zeroed.
	ivec2 frac;
	if (sample_quad || tlut)
		frac = st & 31;
	else
		frac = ivec2(0);

	int sum_frac = frac.x + frac.y;
	st >>= 5;

	// Masked / mirrored integer coordinates of the 2x2 footprint.
	int s0 = texel_mask_s(tile, st.x);
	int t0 = texel_mask_t(tile, st.y);
	int s1 = texel_mask_s(tile, st.x + 1);
	int t1 = texel_mask_t(tile, st.y + 1);

	// Very specific weird logic going on with t0 and t1.
	// t0 is reduced to 8 bits and t1 is re-derived from it using the row delta (limited to >= -255).
	int tdiff = max(t1 - t0, -255);
	t1 = (t0 & 0xff) + tdiff;
	t0 &= 0xff;

	i16x4 t_base, t10, t01, t11;
	// True only when mid_texel_state and bilerp are set and both fractions are exactly 0x10.
	bool mid_texel = all(bvec4(mid_texel_state, bilerp, equal(frac, ivec2(0x10))));

	// Captured before sum_frac is possibly reset below; used for the TLUT address XOR.
	bool upper_lut = sum_frac >= 0x20;
	if (mid_texel)
	{
		// Ensure we sample all 4 texels.
		sum_frac = 0;
	}

	bool yuv = tile.fmt == TEXTURE_FORMAT_YUV;
	// Base texel of the 3-tap footprint: (s1, t1) in the upper half, (s0, t0) otherwise.
	ivec2 base_st = sum_frac >= 0x20 ? ivec2(s1, t1) : ivec2(s0, t0);
	// Chroma fraction: parity of s0 becomes the top bit, followed by the S fraction halved.
	int chroma_frac = ((s0 & 1) << 4) | (frac.x >> 1);

	if (tlut)
	{
		if (!sample_quad)
		{
			// Weird mode where we sample a bilinear footprint with the 4 banks of TLUT instead.
			// Force the footprint to be sampled, but adjust the input coordinates instead.
			base_st = ivec2(s0, t0);
			s1 = s0;
			t1 = t0;
		}

		switch (int(tile.fmt))
		{
		case TEXTURE_FORMAT_RGBA:
		case TEXTURE_FORMAT_CI:
		case TEXTURE_FORMAT_IA:
		case TEXTURE_FORMAT_I:
		{
			// For TLUT, entries in the LUT are duplicated and we must make sure that we sample 3 different banks
			// when we look up the TLUT entry. In normal situations, this is irrelevant, but we're trying to be accurate here.
			bool upper = sum_frac >= 0x20;
			uint addr_xor = upper_lut ? 2 : 1;

			switch (int(tile.size))
			{
			case 0:
				t_base = sample_texel_ci4_tlut(tile, tmem_instance, base_st, tile.palette, upper ? 3 : 0, addr_xor, tlut_type);
				if (bilerp)
				{
					t10 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s1, t0), tile.palette, 1, addr_xor,
					                            tlut_type);
					t01 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s0, t1), tile.palette, 2, addr_xor,
					                            tlut_type);
				}
				if (mid_texel)
				{
					t11 = sample_texel_ci4_tlut(tile, tmem_instance, ivec2(s1, t1), tile.palette, 3, addr_xor,
					                            tlut_type);
				}
				break;

			case 1:
				t_base = sample_texel_ci8_tlut(tile, tmem_instance, base_st, upper ? 3 : 0, addr_xor, tlut_type);
				if (bilerp)
				{
					t10 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s1, t0), 1, addr_xor, tlut_type);
					t01 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s0, t1), 2, addr_xor, tlut_type);
				}
				if (mid_texel)
					t11 = sample_texel_ci8_tlut(tile, tmem_instance, ivec2(s1, t1), 3, addr_xor, tlut_type);
				break;

			default:
				t_base = sample_texel_ci32_tlut(tile, tmem_instance, base_st, upper ? 3 : 0, addr_xor, tlut_type);
				if (bilerp)
				{
					t10 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s1, t0), 1, addr_xor, tlut_type);
					t01 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s0, t1), 2, addr_xor, tlut_type);
				}
				if (mid_texel)
					t11 = sample_texel_ci32_tlut(tile, tmem_instance, ivec2(s1, t1), 3, addr_xor, tlut_type);
				break;
			}
			break;
		}
		}
	}
	else
	{
		// Direct (non-TLUT) fetch. Each format/size pair maps to one sample_texel_* helper; several
		// pairs share a helper (e.g. I4 uses the rgba4 fetch, CI8 and I8 use the rgba8 fetch).
		switch (int(tile.fmt))
		{
		case TEXTURE_FORMAT_RGBA:
			switch (int(tile.size))
			{
			case 0:
				t_base = sample_texel_rgba4(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_rgba4(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t1));
				break;

			case 1:
				t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
				break;

			case 2:
				t_base = sample_texel_rgba16(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_rgba16(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_rgba16(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_rgba16(tile, tmem_instance, ivec2(s1, t1));
				break;

			case 3:
				t_base = sample_texel_rgba32(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_rgba32(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_rgba32(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_rgba32(tile, tmem_instance, ivec2(s1, t1));
				break;
			}
			break;

		case TEXTURE_FORMAT_YUV:
		{
			// Chroma is addressed at half the S resolution; chroma_x1 steps by twice the s0 -> s1 delta.
			uint chroma_x0 = s0 >> 1;
			uint chroma_x1 = (s1 + (s1 - s0)) >> 1;

			// Only implement 16bpp for now. It's the only one that gives meaningful results.
			// Unlike the other formats, YUV always fetches from (s0, t0) and fetches all four texels
			// when sample_quad is set; the t00 / t11 choice is made later, per plane.
			t_base = sample_texel_yuv16(tile, tmem_instance, ivec2(s0, t0), chroma_x0);
			if (sample_quad)
			{
				t10 = sample_texel_yuv16(tile, tmem_instance, ivec2(s1, t0), chroma_x1);
				t01 = sample_texel_yuv16(tile, tmem_instance, ivec2(s0, t1), chroma_x0);
				t11 = sample_texel_yuv16(tile, tmem_instance, ivec2(s1, t1), chroma_x1);
			}
			break;
		}

		case TEXTURE_FORMAT_CI:
			switch (int(tile.size))
			{
			case 0:
				t_base = sample_texel_ci4(tile, tmem_instance, base_st, tile.palette);
				if (sample_quad)
				{
					t10 = sample_texel_ci4(tile, tmem_instance, ivec2(s1, t0), tile.palette);
					t01 = sample_texel_ci4(tile, tmem_instance, ivec2(s0, t1), tile.palette);
				}
				if (mid_texel)
					t11 = sample_texel_ci4(tile, tmem_instance, ivec2(s1, t1), tile.palette);
				break;

			case 1:
				t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
				break;

			default:
				t_base = sample_texel_ci32(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
				break;
			}
			break;

		case TEXTURE_FORMAT_IA:
			switch (int(tile.size))
			{
			case 0:
				t_base = sample_texel_ia4(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_ia4(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_ia4(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_ia4(tile, tmem_instance, ivec2(s1, t1));
				break;

			case 1:
				t_base = sample_texel_ia8(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_ia8(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_ia8(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_ia8(tile, tmem_instance, ivec2(s1, t1));
				break;

			case 2:
				t_base = sample_texel_ia16(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_ia16(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_ia16(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_ia16(tile, tmem_instance, ivec2(s1, t1));
				break;

			case 3:
				t_base = sample_texel_ci32(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
				break;
			}
			break;

		case TEXTURE_FORMAT_I:
			switch (int(tile.size))
			{
			case 0:
				t_base = sample_texel_rgba4(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_rgba4(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_rgba4(tile, tmem_instance, ivec2(s1, t1));
				break;

			case 1:
				t_base = sample_texel_rgba8(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_rgba8(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_rgba8(tile, tmem_instance, ivec2(s1, t1));
				break;

			default:
				t_base = sample_texel_ci32(tile, tmem_instance, base_st);
				if (sample_quad)
				{
					t10 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t0));
					t01 = sample_texel_ci32(tile, tmem_instance, ivec2(s0, t1));
				}
				if (mid_texel)
					t11 = sample_texel_ci32(tile, tmem_instance, ivec2(s1, t1));
				break;
			}
			break;
		}
	}

	i16x4 accum;

	// This is esoteric gibberish for the most part ...

	// Basic ideas seem to be:
	// - If mid_texel is enabled and we end up sampling center pixel, replace any 3-tap bilinear with 4-tap average.
	// - If YUV is used, filtering is separate for RG (chroma) and BA (luma) channels. Upper / Mid signals are separate.
	// - For YUV, sampling without bilerp with sample_quad means picking either t00 or t11, the base texel of any 3-tap bilerp plane.
	//   Chroma and Luma planes are selected separately.
	//   Then, the texel is converted.
	// - If convert_one + sample_quad + bilerp is used, a whack mode is entered where the conversion factors are dynamic.
	//   This also needs to handle variants of MID / YUV.

	if (convert_one)
	{
		// bilerp + convert_one path. !bilerp + convert_one path is trivial and does not require sampling at all.
		// It is handled outside.

		// Sign-extend the low 9 bits of each channel of the previous cycle's result.
		ivec4 prev_sext = bitfieldExtract(ivec4(prev_cycle), 0, 9);
		if (sample_quad)
		{
			// For YUV the RG (chroma) pair uses chroma_frac instead of frac.x for its mid / upper decisions.
			bool mid_rg = yuv ? all(bvec3(mid_texel_state, equal(ivec2(chroma_frac, frac.y), ivec2(0x10)))) : mid_texel;
			bool mid_ba = mid_texel;

			bool upper_ba = sum_frac >= 32;
			bool upper_rg = yuv ? ((chroma_frac + frac.y) >= 32 && !mid_rg) : upper_ba;

			// The previous cycle's R and G act as the two blend factors; they swap in the upper half.
			ivec2 factors_rg = upper_rg ? prev_sext.gr : prev_sext.rg;
			ivec2 factors_ba = upper_ba ? prev_sext.gr : prev_sext.rg;

			// t11 vs t00 selection is already done for non-YUV. YUV needs to defer here.

			ivec2 converted_rg, converted_ba;
			if (mid_rg)
			{
				converted_rg = factors_rg.r * (t01.rg - t11.rg) +
				               factors_rg.g * (t10.rg - t11.rg) +
				               ((t_base.rg - t11.rg) << 6) + 0x80;
			}
			else
			{
				ivec2 base_rg = upper_rg && yuv ? t11.xy : t_base.xy;
				converted_rg = factors_rg.r * (t10.xy - base_rg) + factors_rg.g * (t01.xy - base_rg) + 0x80;
			}

			if (mid_ba)
			{
				converted_ba = factors_ba.r * (t01.ba - t11.ba) +
				               factors_ba.g * (t10.ba - t11.ba) +
				               ((t_base.ba - t11.ba) << 6) + 0x80;
			}
			else
			{
				ivec2 base_ba = upper_ba && yuv ? t11.zw : t_base.zw;
				converted_ba = factors_ba.r * (t10.zw - base_ba) + factors_ba.g * (t01.zw - base_ba) + 0x80;
			}

			// Scale down (the + 0x80 above rounds the >> 8) and offset by the previous cycle's B channel.
			ivec4 converted = ivec4(converted_rg, converted_ba);
			converted >>= 8;
			converted += prev_sext.b;
			accum = i16x4(converted);
		}
		else
			accum = i16x4(prev_sext.bbbb);
	}
	else if (yuv)
	{
		if (sample_quad)
		{
			i16x2 accum_chroma;
			i16x2 accum_luma;

			if (bilerp)
			{
				// Chroma (xy) and luma (zw) are filtered independently, each with its own mid-texel test.
				bool mid_chroma = all(bvec3(mid_texel_state, equal(ivec2(chroma_frac, frac.y), ivec2(0x10))));
				if (mid_chroma)
					accum_chroma = (t_base.xy + t10.xy + t11.xy + t01.xy + I16_C(2)) >> I16_C(2);
				else
					accum_chroma = bilinear_3tap(t_base.xy, t10.xy, t01.xy, t11.xy, ivec2(chroma_frac, frac.y));

				if (mid_texel)
					accum_luma = (t_base.zw + t10.zw + t11.zw + t01.zw + I16_C(2)) >> I16_C(2);
				else
					accum_luma = bilinear_3tap(t_base.zw, t10.zw, t01.zw, t11.zw, frac);
			}
			else
			{
				// Weird path. Seems to pick either t00 or t11 for purposes of nearest.
				// Bilinear footprint path, except it's not doing bilinear path.
				accum_luma = frac.x + frac.y >= 32 ? t11.zw : t_base.zw;
				accum_chroma = chroma_frac + frac.y >= 32 ? t11.xy : t_base.xy;
			}

			accum = i16x4(accum_chroma, accum_luma);
		}
		else
			accum = t_base;
	}
	else if (mid_texel)
	{
		// Rounded average of all four texels.
		accum = (t_base + t01 + t10 + t11 + I16_C(2)) >> I16_C(2);
	}
	else if (bilerp && (sample_quad || tlut))
	{
		// Same 3-tap filter as bilinear_3tap(), applied to all four channels. t_base already holds
		// the t00 / t11 choice, so only the fractions need flipping in the upper half.
		i16x2 flip_frac = i16x2(sum_frac >= 32 ? (32 - frac.yx) : frac);
		accum = (t10 - t_base) * flip_frac.x;
		accum += (t01 - t_base) * flip_frac.y;
		accum += I16_C(0x10);
		accum >>= I16_C(5);
		accum += t_base;
	}
	else
		accum = t_base;

	// If we don't spend math on bilerp for this cycle, we get conversion instead.
	// This happens regardless of convert_one. Convert_one in cycle 1 only means we take the
	// previous texel cycle and perform some math on it.

	if (!bilerp && !convert_one)
		accum = texture_convert_factors(accum, conversion_factors);

	return accum;
}

// Computes the LOD fraction and, when tex_lod_en is set, the pair of tile indices to sample.
//  tile0, tile1         - in/out tile indices; only rewritten when tex_lod_en is set, always wrapped to 0..7.
//  lod_frac             - out; written on every path.
//  max_level            - highest mip level; used as the tile offset whenever the sample is "distant".
//  min_lod              - lower bound for the delta used on the magnified sharpen / detail path.
//  st, st_dx, st_dy     - texture coordinate at this pixel and at its X / Y neighbours.
//  perspective_overflow - forces the distant case with lod_frac = 0xff.
//  tex_lod_en, sharpen_tex_en, detail_tex_en - mode flags selecting how tiles and lod_frac are derived.
void compute_lod_2cycle(inout uint tile0, inout uint tile1, out i16 lod_frac, uint max_level, int min_lod,
                        ivec2 st, ivec2 st_dx, ivec2 st_dy,
                        bool perspective_overflow, bool tex_lod_en, bool sharpen_tex_en, bool detail_tex_en)
{
	bool is_magnified = false;
	bool is_distant = false;
	uint mip_offset = 0;

	// Neither sharpen nor detail: plain mip-mapping behaviour for lod_frac.
	bool plain_lod = !sharpen_tex_en && !detail_tex_en;

	if (perspective_overflow)
	{
		is_distant = true;
		lod_frac = i16(0xff);
	}
	else
	{
		// Kinda abs, except it's 1 less than expected if negative (x ^ (x >> 31) is ~x for negative x).
		ivec2 delta_x = st_dx - st;
		delta_x ^= delta_x >> 31;
		ivec2 delta_y = st_dy - st;
		delta_y ^= delta_y >> 31;

		// Largest delta over both axes and both neighbours.
		ivec2 delta_max = max(delta_x, delta_y);
		int max_delta = max(delta_max.x, delta_max.y);

		if (max_delta >= 0x4000)
		{
			is_distant = true;
			lod_frac = i16(0xff);
			mip_offset = max_level;
		}
		else if (max_delta >= 32)
		{
			// LOD >= 0: the mip level is the position of the top set bit above the 5 fractional bits.
			int mip_base = max(findMSB(max_delta >> 5), 0);
			is_distant = mip_base >= max_level;

			if (is_distant && plain_lod)
			{
				lod_frac = i16(0xff);
			}
			else
			{
				lod_frac = i16(((max_delta << 3) >> mip_base) & 0xff);
				mip_offset = mip_base;
			}
		}
		else
		{
			// LOD < 0
			is_distant = max_level == 0u;
			is_magnified = true;

			if (plain_lod)
				lod_frac = i16(is_distant ? 0xff : 0);
			else
				lod_frac = i16((max(min_lod, max_delta) << 3) + (sharpen_tex_en ? -0x100 : 0));
		}
	}

	// Tile indices are left untouched unless LOD selection is enabled.
	if (!tex_lod_en)
		return;

	if (is_distant)
		mip_offset = max_level;

	if (detail_tex_en)
	{
		// tile1 must be derived first: it depends on the incoming tile0.
		tile1 = (tile0 + mip_offset + ((is_distant || is_magnified) ? 1 : 2)) & 7u;
		tile0 = (tile0 + mip_offset + (is_magnified ? 0 : 1)) & 7u;
	}
	else
	{
		tile0 = (tile0 + mip_offset) & 7u;
		if (is_distant || (!sharpen_tex_en && is_magnified))
			tile1 = tile0;
		else
			tile1 = (tile0 + 1) & 7;
	}
}

#endif