#version 450
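// Converts a BGRA image to NV12, written into a single linear buffer.
// The Y plane starts at byte offset 0; the interleaved UV plane starts at uv_offset.
// Source pixels are edge-extended into the encoder padding area.
// The output buffer must be zero-filled before dispatch: bytes are written with
// atomicOr, so any bits left over from earlier contents would persist.
// Dispatch with (ceil(enc_width/16), ceil(enc_height/16), 1).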
// 16x16 workgroup; main() handles one output pixel per invocation.
layout(local_size_x = 16, local_size_y = 16) in;
// Source image, read-only. main() reads .x as B, .y as G and .z as R.
layout(set = 0, binding = 0, rgba8) readonly uniform image2D bgra_in;
// Output buffer viewed as 32-bit words. main() computes byte offsets and
// ORs individual bytes into the containing word with atomicOr.
layout(set = 0, binding = 1, std430) buffer NV12Out {
uint data[];
} nv12;
// Per-dispatch parameters supplied by the host as push constants.
layout(push_constant) uniform Params {
uint src_width; // BGRA source width in pixels; reads are clamped to src_width - 1
uint src_height; // BGRA source height in pixels; reads are clamped to src_height - 1
uint y_stride; // bytes per row (Y and UV use the same stride)
uint uv_offset; // byte offset from the start of the buffer to the UV plane
uint enc_width; // NV12 output width (>= src, encoder-aligned); invocations beyond it exit
uint enc_height; // NV12 output height (>= src, encoder-aligned); invocations beyond it exit
};
// Rounds a value already scaled to the 0..255 range to the nearest integer and
// clamps it to one byte. floor(x + 0.5) is used rather than round() because
// GLSL leaves the tie direction of round() up to the implementation.
uint quantize_u8(float scaled) {
    return uint(clamp(floor(scaled + 0.5), 0.0, 255.0));
}

// ORs one byte into nv12.data at an arbitrary byte offset. Byte k of a word
// occupies bits [8k, 8k+7]. The target byte must be zero beforehand, since
// atomicOr can only set bits.
void or_byte(uint byte_idx, uint value) {
    atomicOr(nv12.data[byte_idx >> 2u], (value & 0xFFu) << ((byte_idx & 3u) * 8u));
}

// Entry point: one invocation per NV12 output pixel.
// Every invocation inside enc_width x enc_height writes one Y byte; the
// invocation at the even/even corner of each 2x2 block additionally writes the
// block's interleaved U,V pair. Coordinates outside the source image are
// clamped to its last row/column (edge extension into the padding area).
void main() {
    uvec2 pos = gl_GlobalInvocationID.xy;
    if (pos.x >= enc_width || pos.y >= enc_height) return;

    // Last valid source coordinate. max(..., 1u) keeps the subtraction from
    // wrapping to 0xFFFFFFFF (which would disable the clamp) if a source
    // dimension is passed as 0.
    uvec2 src_max = uvec2(max(src_width, 1u), max(src_height, 1u)) - uvec2(1u);

    // Clamp to source dimensions for edge-extension into padding.
    ivec2 src = ivec2(min(pos, src_max));
    vec4 bgra = imageLoad(bgra_in, src);
    float r = bgra.z; // BGRA: z=R
    float g = bgra.y; // BGRA: y=G
    float b = bgra.x; // BGRA: x=B

    // BT.601 full-range RGB→Y, rounded to nearest so that e.g. white maps to
    // 255 even when the float sum lands a hair below 1.0.
    float y_val = 0.299 * r + 0.587 * g + 0.114 * b;
    or_byte(pos.y * y_stride + pos.x, quantize_u8(y_val * 255.0));

    // Chroma is written at half resolution: only the even/even invocation of
    // each 2x2 block continues.
    if ((pos.x & 1u) != 0u || (pos.y & 1u) != 0u) return;

    // Average the 2x2 block, clamping the right/bottom neighbours to the source.
    ivec2 far = ivec2(min(pos + uvec2(1u), src_max));
    vec4 s10 = imageLoad(bgra_in, ivec2(far.x, src.y));
    vec4 s01 = imageLoad(bgra_in, ivec2(src.x, far.y));
    vec4 s11 = imageLoad(bgra_in, far);
    vec3 avg = (bgra.xyz + s10.xyz + s01.xyz + s11.xyz) * 0.25; // x=B, y=G, z=R

    // BT.601 full-range chroma, centered on 128 so that neutral grey yields
    // U = V = 128 (a 127.5 center with truncation produced 127).
    float cb = -0.169 * avg.z - 0.331 * avg.y + 0.500 * avg.x;
    float cr =  0.500 * avg.z - 0.419 * avg.y - 0.081 * avg.x;
    uint u_byte = quantize_u8(128.0 + 255.0 * cb);
    uint v_byte = quantize_u8(128.0 + 255.0 * cr);

    // UV interleaved: U then V, same stride as Y.
    uint uv_idx = uv_offset + (pos.y >> 1u) * y_stride + (pos.x >> 1u) * 2u;
    uint lane = uv_idx & 3u;
    if (lane != 3u) {
        // Both bytes fit in one word: a single atomic covers the pair.
        atomicOr(nv12.data[uv_idx >> 2u], (u_byte | (v_byte << 8u)) << (lane * 8u));
    } else {
        // U is the last byte of its word, so V belongs to the next word.
        // Packing them together would shift V out of the 32-bit value.
        or_byte(uv_idx, u_byte);
        or_byte(uv_idx + 1u, v_byte);
    }
}