#version 450
/* Copyright (c) 2020 Themaister
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
#include "debug.h"
#include "small_types.h"
// The workgroup size along X is supplied through specialization constant 0
// rather than being hard-coded here. main() only uses gl_GlobalInvocationID.x.
layout(local_size_x_id = 0) in;

// Read-only view of the source memory as an array of mem_u8 elements.
// All three VRAM blocks below use set 0 / binding 0, so they are different
// element-size views of the same bound buffer.
// mem_u8 / mem_u16 are not defined in this file (presumably small_types.h).
layout(set = 0, binding = 0, std430) readonly buffer VRAM8Buffer
{
    mem_u8 data[];
} vram8;

// Same binding as vram8, viewed as mem_u16 elements. The update functions use
// this view for reads at even byte addresses (index = (addr >> 1) ^ 1).
layout(set = 0, binding = 0, std430) readonly buffer VRAM16Buffer
{
    mem_u16 data[];
} vram16;

// Same binding as vram8, viewed as 32-bit words. update_tmem_32() uses this
// view for reads at 4-byte aligned addresses (index = addr >> 2).
layout(set = 0, binding = 0, std430) readonly buffer VRAM32Buffer
{
    uint data[];
} vram32;

// Current TMEM contents as 2048 mem_u16 entries. main() reads one entry per
// invocation at start-up and writes it back at the end only if an upload
// changed it (tmem_dirty).
layout(set = 0, binding = 1, std430) buffer TMEM16Buffer
{
    mem_u16 data[2048];
} tmem16;

// One full snapshot of TMEM (same 2048-entry shape as tmem16.data).
struct TileInstance
{
    mem_u16 data[2048];
};

// Write-only array of TMEM snapshots. main() stores the initial TMEM state in
// instances[0] and the state after upload i in instances[i + 1], so the buffer
// is indexed up to instances[num_uploads].
layout(set = 0, binding = 2, std430) writeonly buffer TMEMInstances
{
    TileInstance instances[];
} tile_instances;

// Push constants for this dispatch.
layout(push_constant, std430) uniform Registers
{
    // Number of leading entries of upload_info[] that main() replays, in order.
    int num_uploads;
} registers;

// Values compared against UploadInfo::tmem_fmt. Within the code visible here
// only TEXTURE_FMT_YUV is actually tested (it selects the 32bpp upload path
// with Y/UV splitting); the others are listed for completeness.
const int TEXTURE_FMT_RGBA = 0;
const int TEXTURE_FMT_YUV = 1;
const int TEXTURE_FMT_CI = 2;
const int TEXTURE_FMT_IA = 3;
const int TEXTURE_FMT_I = 4;

// Values compared against UploadInfo::mode. TLUT selects update_tmem_lut();
// BLOCK selects the DxT-driven path inside update_tmem_16/32(). Any other
// value (including TILE) takes the plain 2D path in update_tmem_16/32().
const int UPLOAD_MODE_TILE = 0;
const int UPLOAD_MODE_TLUT = 1;
const int UPLOAD_MODE_BLOCK = 2;

// Description of a single upload to replay. Sixteen 4-byte members; the member
// order and types must match whatever the host writes into the UploadInfos
// uniform block. The per-field notes describe how this shader uses each value.
struct UploadInfo
{
    // Upload extent. upload_x is rejected when >= width; upload_y is clamped to height - 1.
    int width, height;
    // Scale factors passed to compute_upload_t() to bracket the candidate T range
    // in block mode with a non-zero TMEM stride.
    float min_t_mod, max_t_mod;

    // Base byte address of the source image in VRAM.
    int vram_addr;
    // Source row pitch in texels; scaled to bytes with << (vram_size - 1).
    int vram_width;
    // Source texel size code; used as a shift amount (vram_size - 1) and compared
    // against 1, 2 and 3 to pick the addressing pattern.
    int vram_size;
    // Upper bound used for range checks in update_tmem_lut() and for the
    // mirrored-write selection in update_tmem_16().
    int vram_effective_width;

    // Destination byte offset in TMEM; masked and shifted right by one to get a 16-bit index.
    int tmem_offset;
    // Destination row stride, applied directly to indices of 16-bit TMEM entries
    // (and to 64-bit word indices in block mode). Zero is handled as a special case.
    int tmem_stride_words;
    // Tile texel size code; compared with vram_size, and 3 selects the 32bpp path.
    int tmem_size;
    // Tile format; only compared against TEXTURE_FMT_YUV here.
    int tmem_fmt;

    // One of the UPLOAD_MODE_* values.
    int mode;
    // Factor passed to compute_upload_t() to turn an entry offset into a row index;
    // presumably 1 / tmem_stride_words computed by the host - TODO confirm.
    float inv_tmem_stride_words;
    // Per-word T increment for block mode; T is derived as (word * dxt) >> 16.
    int dxt;
    // Not read by this shader.
    int padding;
};

// Uniform block holding the uploads to replay. main() reads upload_info[i]
// for i in [0, registers.num_uploads) without clamping the index.
// NOTE(review): this relies on the host never passing num_uploads > 256 - confirm.
layout(set = 1, binding = 0, std140) uniform UploadInfos
{
    UploadInfo upload_info[256];
};

// Per-invocation state shared between main() and the update_tmem_*() helpers.
// tmem_dirty is cleared once in main() and set by any helper that produces a
// new value; it is never cleared again between uploads.
bool tmem_dirty;
// Value of this invocation's TMEM entry after the uploads processed so far.
uint current_tmem_value;

// Converts an entry offset into a row index by multiplying with a reciprocal
// stride instead of dividing. The offset is biased by half a unit before the
// scale, and the result is truncated by the int() conversion.
// According to the original author this is exact for all relevant inputs and
// much faster than an integer divide.
int compute_upload_t(int offset, float inv_stride)
{
    float biased_offset = float(offset) + 0.5;
    float scaled = biased_offset * inv_stride;
    return int(scaled);
}

// Gather-style replay of one upload for a single 16-bit TMEM entry, used when
// the tile is 32bpp or YUV (see the dispatch in main()).
// In 32bpp upload mode we read 64 bits and split the result over the lower and upper TMEM.
//
// Working backwards from the TMEM index, the function derives which source
// texel (upload_x, upload_y) would have been the last to write this entry,
// fetches one 32-bit word from VRAM and keeps 16 bits of it. On success it
// stores the result in current_tmem_value and sets tmem_dirty; if it decides
// that no write lands on this entry it returns without touching either global.
//
//   info         - the upload being replayed.
//   tmem16_index - index of the 16-bit TMEM entry; main() passes it masked to 0x3ff.
//   upper_tmem   - when true the low 16 bits of the fetched word are kept (or the
//                  two Y bytes for YUV); when false the high 16 bits (or U and V).
//   yuv          - selects the YUV byte split and forces the 32bpp VRAM stepping.
void update_tmem_32(UploadInfo info, int tmem16_index, bool upper_tmem, bool yuv)
{
    int tmem16_offset = (info.tmem_offset & 0x7ff) >> 1;
    int tmem16_stride = info.tmem_stride_words;

    // Entry position relative to the start of the upload, wrapped to 1024 entries.
    int pixel_offset = (tmem16_index - tmem16_offset) & 0x3ff;
    int upload_x, upload_y;
    int upload_x_xor = 0;

    if (info.mode == UPLOAD_MODE_BLOCK)
    {
        // Index of the pair of entries this entry belongs to.
        int word_offset = pixel_offset >> 1;

        if (info.tmem_stride_words == 0)
        {
            // Trivial case, we can just compute T factor directly and set upload_x_xor.
            // Other than that, it works like a simple 1D upload.

            // However, if DxT is weird, we might end up in a situation where this word is written multiple times,
            // or zero times.

            int iteration_candidate_first = word_offset & ~1;
            int iteration_candidate_second = iteration_candidate_first + 1;
            int first_t = (iteration_candidate_first * info.dxt) >> 16;
            int second_t = (iteration_candidate_second * info.dxt) >> 16;
            if (first_t != second_t)
            {
                // The two iterations of this pair have different T, so their swizzled
                // write positions can collide or leave a gap. The later iteration is
                // checked first, so it wins when both land on this word.
                int iteration_candidate_first_write_index = iteration_candidate_first ^ (first_t & 1);
                int iteration_candidate_second_write_index = iteration_candidate_second ^ (second_t & 1);
                if (iteration_candidate_second_write_index == word_offset)
                    upload_x_xor = (second_t & 1) << 1;
                else if (iteration_candidate_first_write_index == word_offset)
                    upload_x_xor = (first_t & 1) << 1;
                else
                    return;
            }
            else
                upload_x_xor ^= (first_t & 1) << 1;
        }
        else
        {
            // Welp ... This is pure insanity, but if we want to be completely correct ...
            int min_t = compute_upload_t(word_offset & ~1, info.min_t_mod);
            int max_t = compute_upload_t(word_offset | 1, info.max_t_mod);

            // If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
            // Y - t_max * stride <= X <= Y - t_min * stride.
            int max_word_candidate = (word_offset | 1) - tmem16_stride * min_t;
            int min_word_candidate = (word_offset & ~1) - tmem16_stride * max_t;

            // If we have constraints for X, we constrain T further.
            min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
            max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);

            // Search from the highest T downwards and stop at the first match.
            // NOTE(review): unlike update_tmem_16(), this path does not update
            // upload_x_xor when a candidate is found - confirm this is intended.
            bool found_candidate = false;
            for (int t = max_t; t >= min_t; t--)
            {
                // Check to see if t is a solution to the equation.
                // Potentially two targets could write here.
                int candidate_solution_first = (word_offset & ~1) - tmem16_stride * t;
                int candidate_solution_second = (word_offset | 1) - tmem16_stride * t;

                int candidate_t_first = (candidate_solution_first * info.dxt) >> 16;
                int candidate_t_second = (candidate_solution_second * info.dxt) >> 16;

                if (((candidate_solution_second + candidate_t_second * tmem16_stride) ^ (candidate_t_second & 1)) == word_offset)
                {
                    found_candidate = true;
                    pixel_offset = (candidate_solution_second << 1) + (pixel_offset & 1);
                    break;
                }
                else if (((candidate_solution_first + candidate_t_first * tmem16_stride) ^ (candidate_t_first & 1)) == word_offset)
                {
                    found_candidate = true;
                    pixel_offset = (candidate_solution_first << 1) + (pixel_offset & 1);
                    break;
                }
            }

            // We strided over this 64bpp word.
            if (!found_candidate)
                return;
        }

        // Block uploads are treated as a single line.
        upload_x = pixel_offset;
        upload_y = 0;
    }
    else if (tmem16_stride == 0)
    {
        // For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
        // is what happened in Y == height - 1.
        upload_x = pixel_offset;
        upload_y = info.height - 1;
    }
    else
    {
        upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
        upload_x = pixel_offset - upload_y * tmem16_stride;

        // If stride is smaller than width, we'll need to unroll the last line.
        if (upload_y >= info.height)
        {
            upload_x += tmem16_stride * (upload_y - info.height + 1);
            upload_y = info.height - 1;
        }
    }

    // Odd lines are written with bit 1 of X flipped; check the flipped position against the width.
    int last_line_upload_x = upload_x ^ ((upload_y & 1) << 1);
    if (last_line_upload_x >= info.width && upload_y > 0)
    {
        // If the last line won't trigger a write, the previous line probably did.
        upload_y--;
        upload_x += tmem16_stride;
    }

    // NOTE(review): iteration_offset is only assigned below when yuv is set or
    // vram_size is 1, 2 or 3. For any other vram_size it is read unassigned when
    // rdram_addr is formed - presumably the host never submits that; confirm.
    int iteration_offset;

    upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;

    if (info.vram_size == 3 || yuv)
    {
        iteration_offset = 4 * (upload_x & ~1);
    }
    else if (info.vram_size == 2)
    {
        // In 16bpp VRAM mode, we are supposed to step 4 pixels at a time (8 bytes), which will form 2 complete pixels.
        // However, in 32bpp tile mode we're not shifting the X value appropriately.
        // So, we're writing texels [0, 1, ..., 4, 5, ...], etc.
        if ((upload_x & 2) != 0)
        {
            // We're not writing in this line, but the previous line might have!
            // Interleaving patterns will form ...
            if (upload_y > 0)
            {
                upload_y--;
                upload_x += tmem16_stride;
                upload_x ^= 2;
            }
            else
            {
                // These 2 words will never be written to.
                return;
            }
        }
        iteration_offset = 2 * (upload_x & ~1);
    }
    else if (info.vram_size == 1)
    {
        // 4 potential mirrors.
        // Walk back up to four lines looking for one whose X lands in a written slot.
        for (int i = 0; i < 4 && upload_y > 0 && (upload_x & 6) != 0; i++)
        {
            upload_y--;
            upload_x += tmem16_stride;
            upload_x ^= 2;
        }

        if ((upload_x & 6) != 0)
        {
            // These 6 words will never be written to.
            return;
        }

        iteration_offset = upload_x & ~1;
    }

    if (upload_x >= info.width)
        return;

    // Row base address in bytes.
    // NOTE(review): the shift amount is vram_size - 1, so vram_size is assumed to be >= 1 here - confirm.
    int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));

    // The loading pipeline reads 64 bits per iteration.
    int rdram_addr = line_rdram_addr + iteration_offset + 4 * (upload_x & 1);

    // Fetch 32 bits. Aligned addresses use the 32-bit view directly; otherwise the
    // word is assembled most-significant byte first from four byte reads, each
    // with its index XORed by 3.
    uint word;
    if ((rdram_addr & 3) == 0)
    {
        word = uint(vram32.data[rdram_addr >> 2]);
    }
    else
    {
        word = (uint(vram8.data[rdram_addr ^ 3]) << 24) |
               (uint(vram8.data[(rdram_addr + 1) ^ 3]) << 16) |
               (uint(vram8.data[(rdram_addr + 2) ^ 3]) << 8) |
                uint(vram8.data[(rdram_addr + 3) ^ 3]);
    }

    if (yuv)
    {
        // Lower TMEM receives interleaved UV samples, while upper receives Y.
        if (upper_tmem)
        {
            // Y samples are taken from bits 23:16 and 7:0.
            uint y0 = (word >> 16u) & 0xffu;
            uint y1 = (word >> 0u) & 0xffu;
            word = (y0 << 8u) | y1;
        }
        else
        {
            // U is taken from bits 31:24 and V from bits 15:8.
            uint u = (word >> 24u) & 0xffu;
            uint v = (word >> 8u) & 0xffu;
            word = (u << 8u) | v;
        }
    }
    else
    {
        // Shift by 16 for lower TMEM (keeps the high half), by 0 for upper TMEM (keeps the low half).
        word >>= 16u - 16u * uint(upper_tmem);
        word &= 0xffffu;
    }
    current_tmem_value = word;
    tmem_dirty = true;
}

// Gather-style replay of one upload for a single 16-bit TMEM entry, used for
// every non-TLUT upload that does not take the 32bpp / YUV path (see main()).
//
// Working backwards from the TMEM index, the function derives which source
// texel (upload_x, upload_y) would have been the last to write this entry and
// fetches 16 bits from VRAM. On success it stores the result in
// current_tmem_value and sets tmem_dirty; if it decides that no write lands on
// this entry it returns without touching either global.
//
//   info         - the upload being replayed.
//   tmem16_index - index of the 16-bit TMEM entry (0..2047 as passed by main()).
void update_tmem_16(UploadInfo info, int tmem16_index)
{
    int tmem16_offset = (info.tmem_offset & 0xfff) >> 1;
    int tmem16_stride = info.tmem_stride_words;

    // Entry position relative to the start of the upload, wrapped to 2048 entries.
    int pixel_offset = (tmem16_index - tmem16_offset) & 0x7ff;
    int upload_x, upload_y;
    int upload_x_xor = 0;

    if (info.mode == UPLOAD_MODE_BLOCK)
    {
        // Index of the 64-bit word (four 16-bit entries) this entry belongs to.
        int word_offset = pixel_offset >> 2;

        if (info.tmem_stride_words == 0)
        {
            // Trivial case, we can just compute T factor directly and set upload_x_xor.
            // Other than that, it works like a simple 1D upload.
            upload_x_xor = (((word_offset * info.dxt) >> 16) & 1) << 1;
        }
        else
        {
            // Welp ... This is pure insanity, but if we want to be completely correct ...
            int min_t = compute_upload_t(word_offset, info.min_t_mod);
            int max_t = compute_upload_t(word_offset, info.max_t_mod);

            // If t has a range, then the solution to Y = (t = floor(X * dt / 2048)) * stride + X has a range space of:
            // Y - t_max * stride <= X <= Y - t_min * stride.
            int max_word_candidate = word_offset - tmem16_stride * min_t;
            int min_word_candidate = word_offset - tmem16_stride * max_t;

            // If we have constraints for X, we constrain T further.
            min_t = max(min_t, (min_word_candidate * info.dxt) >> 16);
            max_t = min(max_t, (max_word_candidate * info.dxt) >> 16);

            // The loop runs from the highest T down and does not stop at the first
            // match, so when several T values satisfy the equation the lowest one
            // is the one that remains in upload_x_xor / pixel_offset.
            bool found_candidate = false;
            for (int t = max_t; t >= min_t; t--)
            {
                // Check to see if t is a solution to the equation.
                int candidate_solution = word_offset - tmem16_stride * t;
                int computed_t = (candidate_solution * info.dxt) >> 16;
                if (candidate_solution + computed_t * tmem16_stride == word_offset)
                {
                    found_candidate = true;
                    upload_x_xor = (computed_t & 1) << 1;
                    pixel_offset = (candidate_solution << 2) + (pixel_offset & 3);
                }
            }

            // We strided over this 64bpp word.
            if (!found_candidate)
                return;
        }

        // Block uploads are treated as a single line.
        upload_x = pixel_offset;
        upload_y = 0;
    }
    else if (tmem16_stride == 0)
    {
        // For TMEM stride of 0 we're essentially replaying the same line over and over and the final visible result
        // is what happened in Y == height - 1.
        upload_x = pixel_offset;
        upload_y = info.height - 1;
    }
    else
    {
        upload_y = compute_upload_t(pixel_offset, info.inv_tmem_stride_words);
        upload_x = pixel_offset - upload_y * tmem16_stride;

        // If stride is smaller than width, we'll need to unroll the last line.
        if (upload_y >= info.height)
        {
            upload_x += tmem16_stride * (upload_y - info.height + 1);
            upload_y = info.height - 1;
        }
    }

    // The following handles an edge case which arises when
    // tile pixel size does not match texture image size.
    // Should not happen in normal applications.
    // This is basically doing scatter-as-gather, so we need to figure out
    // if there is no write to our texel after all (striding), or if there are multiple writes
    // to our texel, in which case we need to figure out the last writer.
    // The rules here were derived empirically through extensive testing and trial and error.
    //
    // NOTE(review): iteration_offset is not assigned when tmem_size != vram_size
    // and neither sub-case below matches (e.g. vram_size - tmem_size == 2); it is
    // then read unassigned when rdram_addr is formed - presumably the host never
    // submits such a combination on this path; confirm.
    int iteration_offset;
    if (info.tmem_size != info.vram_size)
    {
        if (info.vram_size - info.tmem_size == 1)
        {
            // If TMEM is N bpp but VRAM is 2N bpp, we will get mirrored writes here.
            // Select which half of the 2N bpp load we observe in TMEM.
            iteration_offset = (upload_x & ~3) * 4;
            if ((upload_x & ~3) + 2 < (info.vram_effective_width >> (3 - info.vram_size)))
                iteration_offset += 8;
        }
        else if (info.tmem_size == 2 && info.vram_size == 1)
        {
            // In 8bpp VRAM mode, we are supposed to step 8 pixels at a time (8 bytes), which will form 4 complete pixels.
            // However, in 16bpp tile mode we're not shifting the X value appropriately.
            // So, we're writing texels [0, 1, 2, 3, ..., 8, 9, 10, 11], etc.
            if ((upload_x & 4) != 0)
            {
                // We're not writing in this line, but the previous line might have!
                // Interleaving patterns will form ...
                if ((tmem16_stride & 4) != 0 && upload_y > 0)
                {
                    upload_y--;
                    upload_x += tmem16_stride;
                }
                else
                {
                    // These 4 words will never be written to.
                    return;
                }
            }
            iteration_offset = upload_x & ~3;
        }
    }
    else
    {
        // Normal case TMEM size aligns with VRAM size.
        iteration_offset = (upload_x & ~3) * 2;
    }

    // The width check uses upload_x before the odd-line / DxT swizzle is applied below.
    if (upload_x >= info.width)
        return;

    // Row base address in bytes.
    // NOTE(review): the shift amount is vram_size - 1, so vram_size is assumed to be >= 1 here - confirm.
    int line_rdram_addr = info.vram_addr + ((upload_y * info.vram_width) << (info.vram_size - 1));
    upload_x ^= ((upload_y & 1) << 1) | upload_x_xor;

    // The loading pipeline reads 64 bits per iteration.
    int rdram_addr = line_rdram_addr + iteration_offset + 2 * (upload_x & 3);

    // Fetch 16 bits. Even addresses use the 16-bit view (index XORed by 1);
    // odd addresses assemble the value from two byte reads, high byte first,
    // each with its index XORed by 3.
    uint word;
    if ((rdram_addr & 1) == 0)
        word = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
    else
        word = (uint(vram8.data[rdram_addr ^ 3]) << 8) | uint(vram8.data[(rdram_addr + 1) ^ 3]);

    current_tmem_value = word;
    tmem_dirty = true;
}

// Gather-style replay of a TLUT upload for a single 16-bit TMEM entry.
//
// The entry's position relative to the upload start is mapped to a source
// texel index; the mapping depends on the difference between the VRAM and
// tile size codes. If the entry is never written by this upload, or the
// source texel lies at or beyond info.vram_effective_width, the function
// returns without changing anything. Otherwise it fetches 16 bits from VRAM
// into current_tmem_value and sets tmem_dirty.
//
//   info         - the upload being replayed.
//   tmem16_index - index of the 16-bit TMEM entry (0..2047 as passed by main()).
void update_tmem_lut(UploadInfo info, int tmem16_index)
{
    int lut_base = (info.tmem_offset & 0xfff) >> 1;
    int entry = (tmem16_index - lut_base) & 0x7ff;
    int size_delta = info.vram_size - info.tmem_size;
    int src_texel;

    switch (size_delta)
    {
    case 2:
    {
        src_texel = (entry >> 2) << (info.vram_size - 2);
        break;
    }

    case 1:
    {
        // Only entries with bit 2 clear are written.
        if ((entry & 4) != 0)
            return;

        int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
        src_texel = (entry & ~7) >> shamt;
        break;
    }

    case 0:
    {
        // Only entries with bits 2 and 3 clear are written.
        if ((entry & 0xc) != 0)
            return;

        int shamt = info.tmem_size + (info.vram_size == 2 ? 2 : 0);
        src_texel = (entry & ~3) >> shamt;
        break;
    }

    case -1:
    {
        // Only entries with bits 2 through 4 clear are written.
        if ((entry & 0x1c) != 0)
            return;

        src_texel = (entry >> info.tmem_size) & ~7;
        break;
    }

    default:
    {
        // 4bpp tile, 32bpp VRAM. Mirrored writes.
        // (Every size difference not handled above also ends up here.)
        int span_pixel = (entry >> 2) * 4;
        if (span_pixel + 2 < info.vram_effective_width)
            span_pixel += 2;
        src_texel = span_pixel;
        break;
    }
    }

    // Range check shared by every case above.
    if (src_texel >= info.vram_effective_width)
        return;

    int rdram_addr = info.vram_addr + (src_texel << (info.vram_size - 1));

    // Odd behavior when we have unaligned TLUT uploads.
    rdram_addr += 2 * (rdram_addr & 1) * (entry & 3);

    // Even addresses use the 16-bit view; odd addresses are assembled from two
    // byte reads, high byte first.
    if ((rdram_addr & 1) == 0)
    {
        current_tmem_value = uint(vram16.data[(rdram_addr >> 1) ^ 1]);
    }
    else
    {
        uint msb = uint(vram8.data[rdram_addr ^ 3]);
        uint lsb = uint(vram8.data[(rdram_addr + 1) ^ 3]);
        current_tmem_value = (msb << 8) | lsb;
    }

    tmem_dirty = true;
}

// Entry point: each invocation owns the TMEM entry at gl_GlobalInvocationID.x.
// It loads the current value, replays every upload in order against that
// entry, records a snapshot after each step, and writes the final value back
// to TMEM only if some upload actually changed it.
void main()
{
    uint slot = gl_GlobalInvocationID.x;

    tmem_dirty = false;
    current_tmem_value = uint(tmem16.data[slot]);

    // The update helpers work on the index with its lowest bit flipped.
    int tmem16_index = int(slot) ^ 1;
    bool upper_tmem = tmem16_index >= 0x400;

    // Snapshot 0 is the state before any upload.
    tile_instances.instances[0].data[slot] = mem_u16(current_tmem_value);

    int upload_count = registers.num_uploads;
    int upload = 0;
    while (upload < upload_count)
    {
        UploadInfo info = upload_info[upload];
        bool yuv = info.tmem_fmt == TEXTURE_FMT_YUV;

        if (info.mode == UPLOAD_MODE_TLUT)
            update_tmem_lut(info, tmem16_index);
        else if (info.tmem_size == 3 || yuv)
            update_tmem_32(info, tmem16_index & 0x3ff, upper_tmem, yuv);
        else
            update_tmem_16(info, tmem16_index);

        // Snapshot N + 1 is the state after upload N.
        upload++;
        tile_instances.instances[upload].data[slot] = mem_u16(current_tmem_value);
    }

    if (tmem_dirty)
        tmem16.data[slot] = mem_u16(current_tmem_value);
}