blit-compositor 0.33.0

blit headless Wayland compositor
Documentation
#version 450

// Convert BGRA → YUV 4:4:4 (I444) into a single linear buffer.
// Three full-resolution planes: Y at offset 0, U at u_offset, V at v_offset.
// Source pixels are edge-extended into the encoder padding area.
// Dispatch with (ceil(enc_width/16), ceil(enc_height/16), 1).

// 16x16 workgroup; main() maps each invocation to one output pixel via
// gl_GlobalInvocationID.xy.
layout(local_size_x = 16, local_size_y = 16) in;

// Source image, read-only. main() reads it with imageLoad and treats the
// loaded components as x=B, y=G, z=R.
layout(set = 0, binding = 0, rgba8) readonly uniform image2D bgra_in;

// Output buffer holding all three planes, addressed as 32-bit words.
// main() places byte index i in word (i >> 2) at bit position (i & 3) * 8.
layout(set = 0, binding = 1, std430) buffer YUV444Out {
    uint data[];
} yuv;

// Per-dispatch parameters. Offsets and the stride are in bytes and are added
// directly to the per-pixel byte index computed in main().
layout(push_constant) uniform Params {
    uint src_width;   // BGRA source dimensions
    uint src_height;
    uint y_stride;    // bytes per row (all planes use the same stride)
    uint u_offset;    // byte offset to U plane
    uint v_offset;    // byte offset to V plane
    uint enc_width;   // YUV444 output dimensions (>= src, encoder-aligned)
    uint enc_height;
};

// Quantise a value expressed in the 0..255 domain to an 8-bit code using
// round-to-nearest. Adding 0.5 before the truncating uint() conversion rounds
// instead of biasing every sample downwards; the clamp keeps the result in
// [0, 255] (255.5 is clamped to 255.0 before truncation).
uint quantize8(float v) {
    return uint(clamp(v + 0.5, 0.0, 255.0));
}

// Merge one byte into the packed output buffer at the given byte index.
// The byte lands in word (byte_index >> 2) at bit offset (byte_index & 3) * 8.
// atomicOr is used because four neighbouring invocations share one word.
// atomicOr can only set bits, so the stored value is correct only if the
// target byte is zero beforehand.
// NOTE(review): confirm the host zero-fills the buffer before every dispatch.
void store_byte(uint byte_index, uint value) {
    atomicOr(yuv.data[byte_index >> 2u], value << ((byte_index & 3u) * 8u));
}

// Converts one output pixel from BGRA to full-range BT.601 Y/U/V and writes
// one byte into each of the three planes. Invocations outside the
// enc_width x enc_height area do nothing; invocations inside the padding area
// (beyond the source size) replicate the nearest source edge pixel.
void main() {
    uvec2 pos = gl_GlobalInvocationID.xy;
    if (pos.x >= enc_width || pos.y >= enc_height) return;

    // Clamp to source dimensions for edge-extension into padding.
    ivec2 src = ivec2(min(pos.x, src_width - 1u), min(pos.y, src_height - 1u));
    vec4 bgra = imageLoad(bgra_in, src);
    float r = bgra.z;  // BGRA: z=R
    float g = bgra.y;  // BGRA: y=G
    float b = bgra.x;  // BGRA: x=B

    // BT.601 full-range RGB→YUV, evaluated in the 0..255 code domain.
    // Chroma is centred on 128 (not 127.5) so neutral grey yields U = V = 128,
    // the mid-point full-range decoders subtract.
    float y_val = 255.0 * ( 0.299 * r + 0.587 * g + 0.114 * b);
    float u_val = 255.0 * (-0.169 * r - 0.331 * g + 0.500 * b) + 128.0;
    float v_val = 255.0 * ( 0.500 * r - 0.419 * g - 0.081 * b) + 128.0;

    uint y_byte = quantize8(y_val);
    uint u_byte = quantize8(u_val);
    uint v_byte = quantize8(v_val);

    // Byte index of this pixel within a plane; all planes share y_stride.
    uint idx = pos.y * y_stride + pos.x;

    store_byte(idx, y_byte);             // Y plane starts at byte offset 0
    store_byte(u_offset + idx, u_byte);  // U plane
    store_byte(v_offset + idx, v_byte);  // V plane
}