/* Immediate-mode renderer.

Geometry is given by client code and buffered in an upload-heap buffer stored
in host memory.
When the buffer fills up or the client is done, a draw call is issued. The draw
call reads directly from the buffer in host memory; there is no intermediate
buffer copy.
The renderer double-buffers two host-side buffers so that the client can
continue specifying more data into a second buffer while the contents of the
first buffer are rendered.
If the first buffer is still being rendered while the client loops around, then
the client must wait before issuing further geometry.
Once the render of the first buffer completes, the process starts again,
ping-ponging between the two buffers.*/
#include <dxg/imm.h>
#include <dxg/dxcommon.h>

#include <imm_vs.h> // generated
#include <imm_ps.h> // generated

#define WIN32_LEAN_AND_MEAN
#include <Windows.h> // OutputDebugStringA

#include <stdint.h>
#include <stdlib.h>

// Create a committed buffer resource of `size` bytes on an upload heap.
//
// The HRESULT of CreateCommittedResource is passed to TrapIfFailed. Returns
// the created resource (the caller owns the reference), or NULL if creation
// failed and TrapIfFailed returned.
static ID3D12Resource* create_buffer(ID3D12Device* pDevice, size_t size) {
    assert(pDevice);
    const D3D12_HEAP_PROPERTIES props = {
        .Type                 = D3D12_HEAP_TYPE_UPLOAD,
        .CPUPageProperty      = D3D12_CPU_PAGE_PROPERTY_UNKNOWN,
        .MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN,
        .CreationNodeMask     = 0,
        .VisibleNodeMask      = 0
    };
    const D3D12_RESOURCE_DESC desc = {
        .Dimension        = D3D12_RESOURCE_DIMENSION_BUFFER,
        .Alignment        = D3D12_DEFAULT_RESOURCE_PLACEMENT_ALIGNMENT,
        .Width            = size,
        .Height           = 1,
        .DepthOrArraySize = 1,
        .MipLevels        = 1,
        .Format           = DXGI_FORMAT_UNKNOWN,
        .SampleDesc       = {.Count = 1, .Quality = 0},
        .Layout           = D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
        .Flags            = D3D12_RESOURCE_FLAG_NONE
    };
    ID3D12Resource* pBuffer = NULL;
    // Resources on an UPLOAD heap must be created in (and stay in) the
    // GENERIC_READ state. GENERIC_READ includes the vertex-buffer read state,
    // so the buffer can still be bound as a vertex buffer.
    TrapIfFailed(pDevice->lpVtbl->CreateCommittedResource(
        pDevice,
        &props,
        D3D12_HEAP_FLAG_NONE,
        &desc,
        D3D12_RESOURCE_STATE_GENERIC_READ,
        NULL,
        &IID_ID3D12Resource,
        (void**)&pBuffer));
    return pBuffer;
}

// Render state applied to every draw issued by the renderer.
// Filled in by dxg_imm_set_graphics_state() and read by dxg_imm_draw().
typedef struct GraphicsState {
    D3D12_VIEWPORT              viewport;          // Viewport; its Width/Height also size the scissor rect.
    D3D12_CPU_DESCRIPTOR_HANDLE hBackBufferView;   // Render target bound for drawing.
    D3D12_CPU_DESCRIPTOR_HANDLE hDepthStencilView; // Depth-stencil view bound for drawing.
} GraphicsState;

// Set of per-draw resources. The renderer cycles between sets per draw: while
// one set is being filled with new geometry, the other may still be in use by
// a previously submitted draw.
typedef struct ResourceSet {
    ID3D12Resource* pVertexBuffer; // Upload-heap vertex buffer that client geometry is written into.
    CommandRecorder cmdRec;        // Provides the command list (cmdRec.pCmdList) used to draw this buffer.
} ResourceSet;

// Immediate-mode renderer state.
// Allocated by dxg_imm_init() and freed by dxg_imm_destroy().
typedef struct DxgImm {
    ID3D12Device*        pDevice;        // Device given to dxg_imm_init(); not released by dxg_imm_destroy().
    ID3D12CommandQueue*  pCmdQueue;      // Queue draws are submitted to; not released by dxg_imm_destroy().
    ID3D12PipelineState* pPipelineState; // Graphics pipeline created in dxg_imm_init().
    ID3D12RootSignature* pRootSignature; // Root signature created in dxg_imm_init().
    GraphicsState        graphicsState;  // State applied to each draw; see dxg_imm_set_graphics_state().
    ResourceSet          resources[2];   // The two resource sets the renderer alternates between.
    int                  cur; // Index to current resource set. New geometry written here.
    float*               pCurrentBufferData; // Mapped region of current buffer.
    size_t               bufferSizeVerts; // Num verts per buffer.
    ID3D12Fence*         pFence;          // Fence signaled on the queue after each submitted draw.
    HANDLE               fenceEvent;      // Event handed to dxg_wait() together with pFence.
    uint64_t             fenceValue;      // Value passed to the most recent Signal; 0 before the first flush.
    size_t               vertsWritten; // Verts written to current buffer.
    bool                 wait; // Whether the next draw should wait.
} DxgImm;

// Size in bytes of a single vertex: a position made of three floats.
static inline size_t vertex_size_bytes(void) {
    return 3 * sizeof(float);
}

// Number of bytes occupied by `numVerts` tightly packed vertices.
static inline size_t verts_byte_count(size_t numVerts) {
    const size_t bytesPerVertex = vertex_size_bytes();
    return bytesPerVertex * numVerts;
}

// Remaining capacity of the current buffer, in vertices.
static inline size_t dxg_imm_verts_left(const DxgImm* imm) {
    assert(imm);
    const size_t capacity = imm->bufferSizeVerts;
    const size_t used     = imm->vertsWritten;
    // More vertices written than the buffer holds would mean an overrun.
    assert(capacity >= used);
    return capacity - used;
}

// Append `count` vertices from `pVerts` to the current (mapped) buffer.
//
// `pVerts` must point to at least `count` vertices of 3 floats each, and
// `count` must not exceed the space left in the current buffer.
static void dxg_imm_copy_verts(DxgImm* imm, const float* pVerts, size_t count) {
    assert(imm);
    assert(pVerts);
    assert(imm->pCurrentBufferData);
    assert(count <= dxg_imm_verts_left(imm));
    // pCurrentBufferData is a float pointer while vertsWritten counts
    // vertices, so the write offset must be scaled by the floats per vertex.
    // Indexing by the vertex count alone would overwrite earlier vertices.
    const size_t floatsPerVertex = vertex_size_bytes() / sizeof(float);
    float* const pDst = imm->pCurrentBufferData + imm->vertsWritten * floatsPerVertex;
    memcpy(pDst, pVerts, verts_byte_count(count));
    imm->vertsWritten += count;
}

// Set up the current resource set for drawing: map its vertex buffer for
// writing and reset its command recorder.
static void dxg_imm_set_up_resource_set(DxgImm* imm) {
    assert(imm);
    ResourceSet* const pResources = &imm->resources[imm->cur];
    // The CPU only writes to this buffer. An empty read range (End <= Begin)
    // tells the driver that nothing will be read back; a NULL range would
    // mean the whole subresource might be read.
    const D3D12_RANGE noRead = {.Begin = 0, .End = 0};
    // Map() takes a void**; go through a void* instead of passing a float**.
    void* pMapped = NULL;
    TrapIfFailed(pResources->pVertexBuffer->lpVtbl->Map(
        pResources->pVertexBuffer, 0, &noRead, &pMapped));
    imm->pCurrentBufferData = (float*)pMapped;
    dxg_cmdrec_reset(&pResources->cmdRec);
}

// Move on to the next resource set.
//
// Unmaps the current vertex buffer, switches to the other resource set, resets
// the write position and sets the new set up for drawing.
// Returns the vertex buffer of the resource set that is now current.
static ID3D12Resource* dxg_imm_next_resource_set(DxgImm* imm) {
    assert(imm);
    ID3D12Resource* const pFinished = imm->resources[imm->cur].pVertexBuffer;
    // Unmap the current buffer.
    // TODO: Do we actually need to do this or can we leave it mapped? If the
    // latter, then we could just map both buffers and let them be.
    pFinished->lpVtbl->Unmap(pFinished, 0, NULL);
    // Move on to the next resource set (toggles between 0 and 1).
    imm->cur = (imm->cur + 1) & 1;
    imm->vertsWritten = 0;
    // Set up the new resource set.
    dxg_imm_set_up_resource_set(imm);
    // The function is declared to return a resource; falling off the end
    // without a return value would be undefined behaviour if a caller used it.
    return imm->resources[imm->cur].pVertexBuffer;
}

// Wait for the current buffer to be available for writing.
//
// Must only be called while imm->wait is set, i.e. after a flush has switched
// to the current resource set. Clears imm->wait.
static void dxg_imm_wait(DxgImm* imm) {
    assert(imm);
    assert(imm->wait);
    // fenceValue is the value passed to the most recent Signal, which was
    // issued for the *other* buffer just before switching to this one. The
    // current buffer was submitted by the flush before that, i.e. it was
    // Signaled with fenceValue - 1.
    // The first Signal uses fence value 1; with fewer than two Signals the
    // current buffer has never been submitted and there is nothing to wait on.
    if (imm->fenceValue >= 2) {
        dxg_wait(imm->pFence, imm->fenceEvent, imm->fenceValue - 1);
    }
    imm->wait = false;
}

// Draw the current buffer.
//
// Records the viewport, scissor, render targets, pipeline and vertex buffer
// into the current resource set's command list, issues one non-indexed draw of
// all vertices written so far as a triangle list, then closes the list and
// submits it to the command queue.
static void dxg_imm_draw(DxgImm* imm) {
    assert(imm);
    ResourceSet* const pResourceSet = &imm->resources[imm->cur];
    ID3D12Resource* const pCurrentBuffer = pResourceSet->pVertexBuffer;
    ID3D12GraphicsCommandList* const pCmdList = pResourceSet->cmdRec.pCmdList;
    const D3D12_VIEWPORT* const pViewport = &imm->graphicsState.viewport;
    // The scissor rect spans the viewport's extent, anchored at the origin.
    // Viewport dimensions are floats; the rect uses integer coordinates.
    const D3D12_RECT scissor = {
        .bottom = (LONG)pViewport->Height,
        .left   = 0,
        .right  = (LONG)pViewport->Width,
        .top    = 0,
    };
    // The view fields and DrawInstanced take 32-bit counts.
    const D3D12_VERTEX_BUFFER_VIEW vertexBufferView = {
        .BufferLocation = pCurrentBuffer->lpVtbl->GetGPUVirtualAddress(pCurrentBuffer),
        .SizeInBytes    = (UINT)verts_byte_count(imm->vertsWritten),
        .StrideInBytes  = (UINT)vertex_size_bytes(),
    };
    pCmdList->lpVtbl->RSSetViewports(pCmdList, 1, pViewport);
    pCmdList->lpVtbl->RSSetScissorRects(pCmdList, 1, &scissor);
    pCmdList->lpVtbl->OMSetRenderTargets(
        pCmdList, 1, &imm->graphicsState.hBackBufferView, FALSE, &imm->graphicsState.hDepthStencilView);
    pCmdList->lpVtbl->SetPipelineState(pCmdList, imm->pPipelineState);
    pCmdList->lpVtbl->SetGraphicsRootSignature(pCmdList, imm->pRootSignature);
    pCmdList->lpVtbl->IASetPrimitiveTopology(pCmdList, D3D_PRIMITIVE_TOPOLOGY_TRIANGLELIST);
    pCmdList->lpVtbl->IASetVertexBuffers(pCmdList, 0, 1, &vertexBufferView);
    pCmdList->lpVtbl->DrawInstanced(pCmdList, (UINT)imm->vertsWritten, 1, 0, 0);
    // Close() reports errors hit while recording; a list that failed to close
    // must not be executed, so check the result instead of discarding it.
    TrapIfFailed(pCmdList->lpVtbl->Close(pCmdList));
    ID3D12CommandList* const cmdLists[] = {(ID3D12CommandList*)pCmdList};
    ID3D12CommandQueue* const pCmdQueue = imm->pCmdQueue;
    pCmdQueue->lpVtbl->ExecuteCommandLists(pCmdQueue, 1, cmdLists);
}

// Create an immediate-mode renderer.
//
// pDevice / pCmdQueue:  device and queue used for all resources and draws.
//                       They are stored but no reference is added here.
// swapChainRtvFormat:   format of the render target that will be drawn to.
// swapChainSampleDesc:  sample description of that render target.
// bufferSizeVerts:      capacity, in vertices, of each of the two vertex
//                       buffers.
//
// Returns the new renderer, or NULL on failure. Failed HRESULTs are also
// passed to TrapIfFailed. Destroy the renderer with dxg_imm_destroy().
DxgImm* dxg_imm_init(ID3D12Device* pDevice, ID3D12CommandQueue* pCmdQueue, DXGI_FORMAT swapChainRtvFormat, DXGI_SAMPLE_DESC swapChainSampleDesc, size_t bufferSizeVerts) {
    assert(pDevice);
    assert(pCmdQueue);

    DxgImm* imm = calloc(1, sizeof(DxgImm));
    if (!imm) {
        return NULL;
    }

    imm->pDevice         = pDevice;
    imm->pCmdQueue       = pCmdQueue;
    imm->bufferSizeVerts = bufferSizeVerts;
    imm->fenceValue      = 0;

    // TODO: Move this to the application side.
    const D3D_SHADER_MODEL model = D3D_SHADER_MODEL_6_5;
    D3D12_FEATURE_DATA_SHADER_MODEL shaderModel = { model };
    HRESULT result = pDevice->lpVtbl->CheckFeatureSupport(
        pDevice, D3D12_FEATURE_SHADER_MODEL, &shaderModel, sizeof(shaderModel));
    if (FAILED(result) || (shaderModel.HighestShaderModel < model)) {
        DEBUG_PRINT("ERROR: Shader Model 6.5 is not supported!\n");
        TrapIfFailed(result);
        // The query can succeed and still report a lower shader model, in
        // which case TrapIfFailed does nothing; fail explicitly rather than
        // carrying on. Only the renderer struct has been allocated so far.
        free(imm);
        return NULL;
    }

    const D3D12_SHADER_BYTECODE vs_bytecode = {
        .pShaderBytecode = imm_vs,
        .BytecodeLength  = sizeof(imm_vs)
    };

    const D3D12_SHADER_BYTECODE ps_bytecode = {
        .pShaderBytecode = imm_ps,
        .BytecodeLength  = sizeof(imm_ps)
    };

    // TODO: Find out how many root parameters to use.
    // Let's do bindless rendering to keep things flexible.
    const D3D12_ROOT_SIGNATURE_DESC rootsig_desc = {
        .NumParameters     = 0,
        .pParameters       = NULL,
        .NumStaticSamplers = 0,
        .pStaticSamplers   = NULL,
        .Flags             = D3D12_ROOT_SIGNATURE_FLAG_ALLOW_INPUT_ASSEMBLER_INPUT_LAYOUT
    };

    ID3DBlob* pRootSignature = NULL;
    ID3DBlob* pErrors = NULL;
    result = D3D12SerializeRootSignature(
        &rootsig_desc,
        D3D_ROOT_SIGNATURE_VERSION_1,
        &pRootSignature,
        &pErrors);
    if (FAILED(result)) {
        if (pErrors) {
            DEBUG_PRINT(pErrors->lpVtbl->GetBufferPointer(pErrors));
            pErrors->lpVtbl->Release(pErrors);
        }
        if (pRootSignature) {
            pRootSignature->lpVtbl->Release(pRootSignature);
        }
        TrapIfFailed(result);
        // Without a serialized blob there is nothing to create the root
        // signature from; do not dereference pRootSignature below.
        free(imm);
        return NULL;
    }

    TrapIfFailed(imm->pDevice->lpVtbl->CreateRootSignature(
        imm->pDevice,
        0,
        pRootSignature->lpVtbl->GetBufferPointer(pRootSignature),
        pRootSignature->lpVtbl->GetBufferSize(pRootSignature),
        &IID_ID3D12RootSignature,
        (void**)&imm->pRootSignature));

    // The blobs are only needed to create the root signature object; release
    // them so they are not leaked.
    pRootSignature->lpVtbl->Release(pRootSignature);
    pRootSignature = NULL;
    if (pErrors) {
        pErrors->lpVtbl->Release(pErrors);
        pErrors = NULL;
    }

    // A vertex is a single 3-float position (see vertex_size_bytes()).
    const D3D12_INPUT_ELEMENT_DESC input_layout[] = {
        { "POSITION", 0, DXGI_FORMAT_R32G32B32_FLOAT, 0, 0, D3D12_INPUT_CLASSIFICATION_PER_VERTEX_DATA, 0 }
    };
    const D3D12_INPUT_LAYOUT_DESC input_layout_desc = {
        .pInputElementDescs = input_layout,
        .NumElements        = COUNTOF(input_layout)
    };

    const D3D12_GRAPHICS_PIPELINE_STATE_DESC gpso = {
        .pRootSignature        = imm->pRootSignature,
        .VS                    = vs_bytecode,
        .PS                    = ps_bytecode,
        .BlendState            = CD3DX12_BLEND_DESC_DEFAULT(),
        .SampleMask            = PointSampling,
        .RasterizerState       = CD3DX12_RASTERIZER_DESC_DEFAULT(),
        .InputLayout           = input_layout_desc,
        .PrimitiveTopologyType = D3D12_PRIMITIVE_TOPOLOGY_TYPE_TRIANGLE,
        .NumRenderTargets      = 1,
        .RTVFormats            = {swapChainRtvFormat},
        .SampleDesc            = swapChainSampleDesc
    };
    TrapIfFailed(imm->pDevice->lpVtbl->CreateGraphicsPipelineState(
        imm->pDevice, &gpso, &IID_ID3D12PipelineState, (void**)&imm->pPipelineState));

    const size_t bufferSize = verts_byte_count(bufferSizeVerts);
    for (int i = 0; i < 2; ++i) {
        imm->resources[i].pVertexBuffer = create_buffer(pDevice, bufferSize);
        if (!imm->resources[i].pVertexBuffer) {
            // dxg_imm_destroy() frees imm and sets it to NULL, so bail out
            // here instead of dereferencing it on the next line.
            dxg_imm_destroy(&imm);
            return NULL;
        }
        TrapIfFailed(dxg_cmdrec_init(&imm->resources[i].cmdRec, pDevice));
    }
    imm->cur = 0;
    dxg_imm_set_up_resource_set(imm);

    TrapIfFailed(pDevice->lpVtbl->CreateFence(
        pDevice, imm->fenceValue, D3D12_FENCE_FLAG_NONE, &IID_ID3D12Fence, (void**)&imm->pFence));

    if ((imm->fenceEvent = CreateEvent(NULL, FALSE, FALSE, NULL)) == NULL) {
        TrapIfFailed(HRESULT_FROM_WIN32(GetLastError()));
        // A renderer without a fence event cannot wait on the GPU.
        dxg_imm_destroy(&imm);
        return NULL;
    }

    return imm;
}

// Destroy a renderer created by dxg_imm_init() and set *ppImm to NULL.
// Does nothing if *ppImm is already NULL. The device and command queue are
// not released here.
void dxg_imm_destroy(DxgImm** ppImm) {
    assert(ppImm);
    DxgImm* const pImm = *ppImm;
    if (pImm == NULL) {
        return;
    }
    // Per-set resources first: vertex buffer, then its command recorder.
    for (int set = 0; set < 2; ++set) {
        ResourceSet* const pSet = &pImm->resources[set];
        SafeRelease(pSet->pVertexBuffer);
        dxg_cmdrec_destroy(&pSet->cmdRec);
    }
    SafeRelease(pImm->pRootSignature);
    SafeRelease(pImm->pPipelineState);
    SafeRelease(pImm->pFence);
    if (pImm->fenceEvent != NULL) {
        CloseHandle(pImm->fenceEvent);
    }
    free(pImm);
    *ppImm = NULL;
}

// Set the viewport and the render-target / depth-stencil views that
// subsequent draws will use. The viewport is copied; all arguments must be
// non-null (descriptor handles must have a non-zero ptr).
void dxg_imm_set_graphics_state(
        DxgImm* imm,
        const D3D12_VIEWPORT* pViewport,
        D3D12_CPU_DESCRIPTOR_HANDLE hBackBufferView,
        D3D12_CPU_DESCRIPTOR_HANDLE hDepthStencilView) {
    assert(imm);
    assert(pViewport);
    assert(hBackBufferView.ptr);
    assert(hDepthStencilView.ptr);
    // Overwrite every field of the stored state.
    GraphicsState* const pState = &imm->graphicsState;
    pState->viewport          = *pViewport;
    pState->hBackBufferView   = hBackBufferView;
    pState->hDepthStencilView = hDepthStencilView;
}

// Submit the geometry buffered so far, if any, and switch to the other
// resource set. Does nothing when no vertices have been written.
void dxg_imm_flush(DxgImm* imm) {
    assert(imm);
    if (imm->vertsWritten == 0) {
        return;
    }
    dxg_imm_draw(imm);
    // Signal the fence so that the current buffer can be reused once the
    // draw has finished.
    ID3D12CommandQueue* const pCmdQueue = imm->pCmdQueue;
    imm->fenceValue++;
    // If the Signal fails, a later wait on this fence value would never
    // complete, so do not ignore the result.
    TrapIfFailed(pCmdQueue->lpVtbl->Signal(pCmdQueue, imm->pFence, imm->fenceValue));
    // Next draw should Wait for the next buffer. Wait lazily on the next
    // draw to avoid a stall here.
    imm->wait = true;
    // NOTE(review): switching sets resets the next set's command recorder
    // right away, before the lazy wait. If dxg_cmdrec_reset() resets a command
    // allocator that the GPU may still be executing from, the wait needs to
    // happen before the reset -- confirm against dxg_cmdrec_reset().
    (void)dxg_imm_next_resource_set(imm);
}

// Buffer `numTris` triangles for drawing.
//
// `pVerts` must point to numTris * 3 vertices of 3 floats each. The triangles
// are copied into the current buffer. Whenever the buffer cannot hold another
// triangle it is flushed (drawn) and the renderer moves on to the other
// buffer, waiting for it to become available if necessary, so any number of
// triangles can be submitted regardless of the buffer capacity.
void dxg_imm_draw_triangles(DxgImm* imm, const float* pVerts, size_t numTris) {
    assert(imm);
    assert(pVerts);
    const size_t floatsPerVertex = vertex_size_bytes() / sizeof(float);
    while (numTris > 0) {
        if (dxg_imm_verts_left(imm) / 3 == 0) {
            dxg_imm_flush(imm);
        }
        // If the previous buffer was just flushed, then we have to wait on the
        // next one. The wait is done here, and not inside the branch above,
        // because the client code can also flush the buffer.
        if (imm->wait) {
            dxg_imm_wait(imm);
        }
        // Re-evaluate capacity. It must be >0 now.
        const size_t triCapacity = dxg_imm_verts_left(imm) / 3;
        assert(triCapacity > 0);
        if (triCapacity == 0) {
            // The buffers cannot hold even one triangle. Bail out instead of
            // spinning forever when asserts are compiled out.
            return;
        }
        const size_t batchTris  = MIN(triCapacity, numTris);
        const size_t batchVerts = batchTris * 3;
        dxg_imm_copy_verts(imm, pVerts, batchVerts);
        // Advance past the vertices just consumed; pVerts is a float pointer.
        pVerts  += batchVerts * floatsPerVertex;
        numTris -= batchTris;
    }
}