blit-compositor 0.33.0

#version 450

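// Convert BGRA → NV12 into a single linear buffer.
// Y plane at offset 0, UV plane at uv_offset.
// Source pixels are edge-extended into the encoder padding area.
// The output buffer must be zero-filled before each dispatch: bytes are
// merged into it with atomicOr, which can only set bits, never clear them.
// Dispatch with (ceil(enc_width/16), ceil(enc_height/16), 1).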

// 16x16 invocations per workgroup; main() handles one output pixel per
// invocation, indexed by gl_GlobalInvocationID.xy.
layout(local_size_x = 16, local_size_y = 16) in;

// Source image, read-only. main() interprets the loaded components as
// x = blue, y = green, z = red.
layout(set = 0, binding = 0, rgba8) readonly uniform image2D bgra_in;
// Destination buffer addressed as 32-bit words. main() computes byte offsets
// and converts them to a word index (offset >> 2) plus a byte lane
// (offset & 3), then merges each byte in with atomicOr.
layout(set = 0, binding = 1, std430) buffer NV12Out {
    uint data[];
} nv12;

// Push constants describing the source image and the NV12 output layout.
// NOTE(review): main() evaluates src_width - 1u and src_height - 1u, so both
// source dimensions are presumably required to be non-zero — confirm on the
// host side.
// NOTE(review): main() packs each U/V pair into a single 32-bit word, which
// only works when uv_offset and y_stride are even — confirm on the host side.
layout(push_constant) uniform Params {
    uint src_width;   // BGRA source dimensions
    uint src_height;
    uint y_stride;    // bytes per row (Y and UV use the same stride)
    uint uv_offset;   // byte offset to UV plane
    uint enc_width;   // NV12 output dimensions (>= src, encoder-aligned)
    uint enc_height;
};

// Entry point: one invocation per NV12 output pixel.
//
// Every invocation inside the enc_width x enc_height rectangle writes one
// luma byte. Invocations whose x and y are both even additionally average the
// 2x2 source block anchored at their position and write one interleaved U,V
// byte pair into the UV plane.
//
// Preconditions visible from the code below:
//   * nv12.data must be zero-filled before the dispatch: bytes are merged in
//     with atomicOr, which can only set bits, never clear stale ones.
//   * uv_offset and y_stride must be even so that a U,V pair never straddles
//     a 32-bit word (the pair is packed and written with a single atomicOr).
//   * src_width and src_height must be non-zero (src_* - 1u is computed).
void main() {
    uvec2 pos = gl_GlobalInvocationID.xy;
    if (pos.x >= enc_width || pos.y >= enc_height) return;

    // Clamp to source dimensions for edge-extension into padding.
    ivec2 src = ivec2(min(pos.x, src_width - 1u), min(pos.y, src_height - 1u));
    vec4 bgra = imageLoad(bgra_in, src);
    float r = bgra.z;  // BGRA: z=R
    float g = bgra.y;  // BGRA: y=G
    float b = bgra.x;  // BGRA: x=B

    // BT.601 full-range RGB→Y
    float y_val = 0.299 * r + 0.587 * g + 0.114 * b;
    // Round to nearest (+0.5 before the truncating uint conversion). Plain
    // truncation biases luma downward and can drop a whole code value when
    // floating-point error leaves the product just below an integer.
    uint y_byte = uint(clamp(y_val * 255.0 + 0.5, 0.0, 255.0));

    // Write Y byte via packed uint: the byte offset is split into a word
    // index and a byte lane, and the byte is ORed into that lane. atomicOr is
    // used because four neighbouring invocations share each word.
    uint y_idx = pos.y * y_stride + pos.x;
    uint word_idx = y_idx >> 2u;
    uint byte_pos = y_idx & 3u;
    atomicOr(nv12.data[word_idx], y_byte << (byte_pos * 8u));

    // Write UV at half resolution (every 2x2 block).
    if ((pos.x & 1u) == 0u && (pos.y & 1u) == 0u) {
        // Neighbour coordinates are clamped like `src`, so blocks on or past
        // the source edge reuse the last valid row/column.
        vec4 s00 = bgra;
        ivec2 s10_pos = ivec2(min(pos.x + 1u, src_width - 1u), src.y);
        ivec2 s01_pos = ivec2(src.x, min(pos.y + 1u, src_height - 1u));
        ivec2 s11_pos = ivec2(s10_pos.x, s01_pos.y);
        vec4 s10 = imageLoad(bgra_in, s10_pos);
        vec4 s01 = imageLoad(bgra_in, s01_pos);
        vec4 s11 = imageLoad(bgra_in, s11_pos);

        // Box-filter the four samples per channel.
        float ar = (s00.z + s10.z + s01.z + s11.z) * 0.25;
        float ag = (s00.y + s10.y + s01.y + s11.y) * 0.25;
        float ab = (s00.x + s10.x + s01.x + s11.x) * 0.25;

        // Signed chroma in [-0.5, 0.5]; the offset is applied in byte units
        // below so that neutral grey maps to 128.
        float u_val = -0.169 * ar - 0.331 * ag + 0.500 * ab;
        float v_val =  0.500 * ar - 0.419 * ag - 0.081 * ab;
        // 128.5 = chroma zero point (128) + 0.5 for round-to-nearest. The
        // clamp keeps the maximum (0.5 * 255 + 128.5 = 256) within a byte.
        uint u_byte = uint(clamp(u_val * 255.0 + 128.5, 0.0, 255.0));
        uint v_byte = uint(clamp(v_val * 255.0 + 128.5, 0.0, 255.0));

        // UV interleaved: U then V, same stride as Y.
        uint cx = pos.x >> 1u;
        uint cy = pos.y >> 1u;
        uint uv_idx = uv_offset + cy * y_stride + cx * 2u;
        uint uv_word = uv_idx >> 2u;
        uint uv_byte_pos = uv_idx & 3u;
        // U goes in the lower byte and V in the next one; with an even uv_idx
        // the pair sits in lanes 0-1 or 2-3 of the same word.
        uint uv_packed = (u_byte | (v_byte << 8u)) << (uv_byte_pos * 8u);
        atomicOr(nv12.data[uv_word], uv_packed);
    }
}