use anyhow::{Context, Result, bail};
use bytes::Bytes;
use std::ffi::c_void;
use std::os::raw::{c_int, c_uint};
use std::ptr;
use super::tuning::{self, NvencRateControl};
use super::{AUTO_FROM_TARGET, EncodedPacket, Encoder, EncoderConfig, QualityTarget};
#[cfg(test)]
use crate::frame::ColorMetadata;
use crate::frame::{PixelFormat, TransferFn, VideoFrame};
// Status codes returned (as `c_uint`) by the NVENC entry points. Only `NV_ENC_SUCCESS`
// is compared against in the part of the file visible here.
const NV_ENC_SUCCESS: c_uint = 0;
const NV_ENC_ERR_INVALID_PTR: c_uint = 6;
const NV_ENC_ERR_INVALID_PARAM: c_uint = 8;
const NV_ENC_ERR_ENCODER_NOT_INITIALIZED: c_uint = 11;
const NV_ENC_ERR_LOCK_BUSY: c_uint = 13;
const NV_ENC_ERR_NEED_MORE_INPUT: c_uint = 17;
const NV_ENC_ERR_ENCODER_BUSY: c_uint = 18;
// Written to `NvEncOpenEncodeSessionExParams::device_type` when the session is opened
// on a CUDA context.
const NV_ENC_DEVICE_TYPE_CUDA: c_uint = 1;
// Input buffer formats returned by `nvenc_buffer_format_for` (8-bit and 10-bit 4:2:0).
const NV_ENC_BUFFER_FORMAT_IYUV: c_uint = 0x00000100;
const NV_ENC_BUFFER_FORMAT_YUV420_10BIT: c_uint = 0x00010000;
// Per-picture flag bits and picture-type codes.
// NOTE(review): presumably used with `NvEncPicParams::encode_pic_flags` and
// `NvEncLockBitstream::picture_type`; their use sites are outside this view.
const NV_ENC_PIC_FLAG_FORCEIDR: c_uint = 0x02;
const NV_ENC_PIC_FLAG_EOS: c_uint = 0x08;
const NV_ENC_PIC_TYPE_P: c_uint = 0;
const NV_ENC_PIC_TYPE_I: c_uint = 2;
const NV_ENC_PIC_TYPE_IDR: c_uint = 3;
// Unused here: `NvencEncoder::new` takes the tuning-info value from
// `tuning::nvenc_av1_params` instead.
#[allow(dead_code)]
const NV_ENC_TUNING_INFO_HIGH_QUALITY: c_uint = 1;
// Rate-control modes stored in `NvEncRcParams::rate_control_mode`.
const NV_ENC_PARAMS_RC_CONSTQP: u32 = 0x0;
const NV_ENC_PARAMS_RC_VBR: u32 = 0x1;
#[allow(dead_code)]
const NV_ENC_PARAMS_RC_VBR_HQ: u32 = 0x20;
// Number of input/bitstream buffer pairs allocated per encode session.
const RING_SIZE: usize = 4;
/// Major version of the NVENC API these bindings are written against.
const NVENCAPI_MAJOR: u32 = 13;
/// Minor version of the NVENC API these bindings are written against.
const NVENCAPI_MINOR: u32 = 0;
/// Packed API version word: major in the low bits, minor shifted up to bit 24.
const NVENCAPI_VERSION: u32 = (NVENCAPI_MINOR << 24) | NVENCAPI_MAJOR;

/// Builds the `version` word placed at the start of an NVENC parameter struct.
///
/// `ver` is the per-struct revision; it is shifted into bits 16 and up and
/// combined with [`NVENCAPI_VERSION`] and a fixed `0x7` marker in the top nibble.
const fn struct_version(ver: u32) -> u32 {
    // Constant tag occupying bits 28..=30 of every struct version word.
    const MARKER: u32 = 0x7 << 28;
    let revision = ver << 16;
    NVENCAPI_VERSION | revision | MARKER
}
// Version words written into the `version` field of each NVENC parameter struct.
// Several of them additionally set bit 31 on top of `struct_version(..)`.

/// Version word for `NvEncOpenEncodeSessionExParams`.
const NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER: u32 = struct_version(1);
/// Version word for `NvEncInitializeParams`.
const NV_ENC_INITIALIZE_PARAMS_VER: u32 = struct_version(7) | (1u32 << 31);
/// Version word for `NvEncCreateInputBuffer`.
const NV_ENC_CREATE_INPUT_BUFFER_VER: u32 = struct_version(2);
/// Version word for `NvEncCreateBitstreamBuffer`.
const NV_ENC_CREATE_BITSTREAM_BUFFER_VER: u32 = struct_version(1);
/// Version word intended for `NvEncLockInputBuffer`.
const NV_ENC_LOCK_INPUT_BUFFER_VER: u32 = struct_version(1);
/// Version word intended for `NvEncLockBitstream`.
const NV_ENC_LOCK_BITSTREAM_VER: u32 = struct_version(2) | (1u32 << 31);
/// Version word intended for `NvEncPicParams`.
const NV_ENC_PIC_PARAMS_VER: u32 = struct_version(7) | (1u32 << 31);
/// Version word for `NvEncConfig`.
const NV_ENC_CONFIG_VER: u32 = struct_version(9) | (1u32 << 31);
/// Version word for `NvEncPresetConfig`.
const NV_ENC_PRESET_CONFIG_VER: u32 = struct_version(5) | (1u32 << 31);
/// 128-bit GUID in four-part form, laid out `#[repr(C)]` so it can be passed by value
/// to NVENC entry points (see `FnNvEncGetEncodeCaps`) and compared with `==`.
#[repr(C)]
#[derive(Clone, Copy, PartialEq, Eq)]
struct Guid {
data1: u32,
data2: u16,
data3: u16,
data4: [u8; 8],
}
/// Codec GUID identifying AV1. `NvencEncoder::new` searches the driver's advertised
/// codec GUIDs for this value and passes it to the caps, preset and initialize calls.
const NV_ENC_CODEC_AV1_GUID: Guid = Guid {
data1: 0x0a352289,
data2: 0x0aa7,
data3: 0x4759,
data4: [0x86, 0x2d, 0x5d, 0x15, 0xcd, 0x16, 0xd2, 0x54],
};
/// Capability query descriptor handed to `FnNvEncGetEncodeCaps`.
///
/// `NvencEncoder::new` zero-initialises it, then sets `version` and `caps_to_query`.
#[repr(C)]
struct NvEncCapsParam {
version: u32,
// One of the `NV_ENC_CAPS_*` selectors.
caps_to_query: c_uint,
reserved: [u32; 62],
}
// Capability selectors queried during init: maximum encode width and height, and
// whether 10-bit encode is supported (a result of 0 is treated as "not supported").
const NV_ENC_CAPS_WIDTH_MAX: c_uint = 16;
const NV_ENC_CAPS_HEIGHT_MAX: c_uint = 17;
const NV_ENC_CAPS_SUPPORT_10BIT_ENCODE: c_uint = 39;
// Preset GUIDs kept for reference only (hence `dead_code`): the preset actually used
// comes from `tuning::nvenc_av1_params` as raw bytes, converted by `guid_from_bytes`.
#[allow(dead_code)]
const NV_ENC_PRESET_P5_GUID: Guid = Guid {
data1: 0x21c6e6b4,
data2: 0x297a,
data3: 0x4cba,
data4: [0x99, 0x8f, 0xb6, 0xcb, 0xde, 0x72, 0xad, 0xe3],
};
#[allow(dead_code)]
const NV_ENC_PRESET_P6_GUID: Guid = Guid {
data1: 0x8e75c279,
data2: 0x6299,
data3: 0x4ab6,
data4: [0x83, 0x02, 0x0b, 0x21, 0x5a, 0x33, 0x5c, 0xf5],
};
#[allow(dead_code)]
const NV_ENC_PRESET_P7_GUID: Guid = Guid {
data1: 0x84848c12,
data2: 0x6f71,
data3: 0x4c13,
data4: [0x93, 0x1b, 0x53, 0xe2, 0x83, 0xf5, 0x79, 0x74],
};
/// Reassembles a [`Guid`] from its 16-byte serialized form.
///
/// The first three fields (4 + 2 + 2 bytes) are decoded as little-endian integers;
/// the trailing eight bytes are copied verbatim into `data4`.
fn guid_from_bytes(bytes: [u8; 16]) -> Guid {
    let [d1a, d1b, d1c, d1d, d2a, d2b, d3a, d3b, tail @ ..] = bytes;
    Guid {
        data1: u32::from_le_bytes([d1a, d1b, d1c, d1d]),
        data2: u16::from_le_bytes([d2a, d2b]),
        data3: u16::from_le_bytes([d3a, d3b]),
        data4: tail,
    }
}
// Minimal CUDA driver API surface, resolved at runtime from the CUDA driver library.
// A return value of 0 is treated as success throughout this file.
type CUresult = c_int;
type CUdevice = c_int;
type CUcontext = *mut c_void;
// Loaded from symbol `cuInit`.
type FnCuInit = unsafe extern "C" fn(c_uint) -> CUresult;
// Loaded from symbol `cuDeviceGet`.
type FnCuDeviceGet = unsafe extern "C" fn(*mut CUdevice, c_int) -> CUresult;
// Loaded from symbol `cuCtxCreate_v2`.
type FnCuCtxCreate = unsafe extern "C" fn(*mut CUcontext, c_uint, CUdevice) -> CUresult;
// Loaded from symbol `cuCtxDestroy_v2`.
type FnCuCtxDestroy = unsafe extern "C" fn(CUcontext) -> CUresult;
// Loaded from symbol `cuCtxPushCurrent_v2`.
type FnCuCtxPushCurrent = unsafe extern "C" fn(CUcontext) -> CUresult;
// Loaded from symbol `cuCtxPopCurrent_v2`.
type FnCuCtxPopCurrent = unsafe extern "C" fn(*mut CUcontext) -> CUresult;
/// Parameters for `FnNvEncOpenEncodeSessionEx`.
///
/// `NvencEncoder::new` zeroes the struct and fills `version`, `device_type`
/// (`NV_ENC_DEVICE_TYPE_CUDA`), `device` (the CUDA context) and `api_version`.
// NOTE(review): field order and reserved sizes presumably mirror the vendor header;
// verify against it before editing.
#[repr(C)]
struct NvEncOpenEncodeSessionExParams {
version: u32,
device_type: u32,
device: *mut c_void,
reserved: *mut c_void,
api_version: u32,
reserved1: [u32; 253],
reserved2: [*mut c_void; 64],
}
/// Parameters for `FnNvEncInitializeEncoder`.
///
/// `NvencEncoder::new` starts from an all-zero value and sets the codec/preset GUIDs,
/// encode and display-aspect dimensions, frame rate, `enable_ptd`, maximum dimensions,
/// `tuning_info`, `buffer_format` and the `encode_config` pointer.
// NOTE(review): layout presumably mirrors the vendor header; do not reorder fields.
#[repr(C)]
struct NvEncInitializeParams {
version: u32,
encode_guid: Guid,
preset_guid: Guid,
encode_width: u32,
encode_height: u32,
dar_width: u32,
dar_height: u32,
frame_rate_num: u32,
frame_rate_den: u32,
// Set to 0 by `new`.
enable_encode_async: u32,
// Set to 1 by `new`.
enable_ptd: u32,
flags: u32,
priv_data_size: u32,
reserved: u32,
priv_data: *mut c_void,
// Points at an `NvEncConfig` owned by the caller for the duration of the call.
encode_config: *mut c_void,
max_encode_width: u32,
max_encode_height: u32,
max_me_hint_counts_per_block: [u32; 8],
tuning_info: u32,
buffer_format: u32,
num_state_buffers: u32,
output_stats_level: u32,
reserved1: [u32; 284],
reserved2: [*mut c_void; 64],
}
// Single-bit masks, all currently unused (hence `dead_code`).
// NOTE(review): presumably these index into `NvEncInitializeParams::flags`; confirm
// against the vendor header. Bits 5 through 8 have no named constant here.
#[allow(dead_code)]
const INIT_BIT_REPORT_SLICE_OFFSETS: u32 = 1 << 0;
#[allow(dead_code)]
const INIT_BIT_ENABLE_SUB_FRAME_WRITE: u32 = 1 << 1;
#[allow(dead_code)]
const INIT_BIT_ENABLE_EXTERNAL_ME_HINTS: u32 = 1 << 2;
#[allow(dead_code)]
const INIT_BIT_ENABLE_ME_ONLY_MODE: u32 = 1 << 3;
#[allow(dead_code)]
const INIT_BIT_ENABLE_WEIGHTED_PREDICTION: u32 = 1 << 4;
#[allow(dead_code)]
const INIT_BIT_ENABLE_OUTPUT_IN_VIDMEM: u32 = 1 << 9;
#[allow(dead_code)]
const INIT_BIT_ENABLE_RECON_FRAME_OUTPUT: u32 = 1 << 10;
#[allow(dead_code)]
const INIT_BIT_ENABLE_OUTPUT_STATS: u32 = 1 << 11;
#[allow(dead_code)]
const INIT_BIT_ENABLE_UNI_DIRECTIONAL_B: u32 = 1 << 12;
/// Rate-control parameters embedded in `NvEncConfig::rc_params`.
///
/// `NvencEncoder::new` writes `version`, `rate_control_mode`, the three `const_qp_*`
/// values, `target_quality` and `target_quality_lsb`; every other field keeps the
/// value returned in the preset configuration.
// NOTE(review): layout presumably mirrors the vendor header, with some fields
// grouped into same-sized placeholders; verify sizes and offsets before editing.
#[repr(C)]
struct NvEncRcParams {
version: u32,
// One of the `NV_ENC_PARAMS_RC_*` constants.
rate_control_mode: u32,
const_qp_inter_p: u32,
const_qp_inter_b: u32,
const_qp_intra: u32,
average_bitrate: u32,
max_bitrate: u32,
vbv_buffer_size: u32,
vbv_initial_delay: u32,
flags: u32,
min_qp_inter_p: u32,
min_qp_inter_b: u32,
min_qp_intra: u32,
max_qp_inter_p: u32,
max_qp_inter_b: u32,
max_qp_intra: u32,
initial_rc_qp_inter_p: u32,
initial_rc_qp_inter_b: u32,
initial_rc_qp_intra: u32,
temporally_layer_bitrate_ratio: [u32; 3],
target_quality: u8,
target_quality_lsb: u8,
lookahead_depth: u16,
low_delay_key_frame_scale: u32,
qp_map_mode: u32,
multi_pass: u32,
alpha_layer_bitrate_ratio: u32,
cbqpi_ofs: i8,
cbqpp_ofs: i8,
crqpi_ofs: i8,
crqpp_ofs: i8,
reserved: [u32; 4],
}
/// AV1-specific codec configuration embedded in `NvEncConfig::codec_config_av1`.
///
/// `NvencEncoder::new` overwrites `flags`, `idr_period`, `max_num_ref_frames_in_dpb`,
/// the tile column/row counts, both bit-depth fields and the four colour-description
/// fields; the remaining fields keep the preset's values.
// NOTE(review): layout presumably mirrors the vendor header; do not reorder fields.
#[repr(C)]
struct NvEncConfigAv1 {
level: u32,
tier: u32,
min_part_size: u32,
max_part_size: u32,
// Packed bit field; see the `AV1_BIT_*` / `AV1_CHROMA_FORMAT_IDC_420` masks.
flags: u32,
idr_period: u32,
intra_refresh_period: u32,
intra_refresh_cnt: u32,
max_num_ref_frames_in_dpb: u32,
num_tile_columns: u32,
num_tile_rows: u32,
reserved2: u32,
tile_widths: *mut u32,
tile_heights: *mut u32,
max_temporal_layers_minus1: u32,
color_primaries: u32,
// Filled from `transfer_to_h273`.
transfer_characteristics: u32,
matrix_coefficients: u32,
color_range: u32,
chroma_sample_position: u32,
use_b_frames_as_ref: u32,
film_grain_params: *mut c_void,
num_fwd_refs: u32,
num_bwd_refs: u32,
// `new` stores 0 for 8-bit input and 1 for 10-bit input in both depth fields.
output_bit_depth: u32,
input_bit_depth: u32,
ltr_num_frames: u32,
num_temporal_layers: u32,
tf_level: u32,
reserved1: [u32; 230],
reserved3: [*mut c_void; 62],
}
// Masks for `NvEncConfigAv1::flags`. `NvencEncoder::new` sets exactly
// `AV1_BIT_REPEAT_SEQ_HDR | AV1_CHROMA_FORMAT_IDC_420`; the other two are not used
// in the part of the file visible here.
const AV1_BIT_OUTPUT_ANNEXB_FORMAT: u32 = 1 << 0;
#[allow(dead_code)]
const AV1_BIT_ENABLE_TIMING_INFO: u32 = 1 << 1;
const AV1_BIT_REPEAT_SEQ_HDR: u32 = 1 << 5;
// NOTE(review): presumably the value 1 placed in a multi-bit chroma-format field
// starting at bit 7 — confirm against the vendor header.
const AV1_CHROMA_FORMAT_IDC_420: u32 = 1 << 7;
/// Encoder configuration: generic GOP settings, rate control and the AV1 codec block.
///
/// `NvencEncoder::new` copies it out of the preset returned by the driver, patches
/// it, and passes its address through `NvEncInitializeParams::encode_config`.
// NOTE(review): `_codec_config_pad` presumably pads the AV1 block up to the size of
// the vendor header's codec-config union; verify before changing either size.
#[repr(C)]
struct NvEncConfig {
version: u32,
profile_guid: Guid,
gop_length: u32,
frame_interval_p: u32,
mono_chrome_encoding: u32,
frame_field_mode: u32,
mv_precision: u32,
rc_params: NvEncRcParams,
codec_config_av1: NvEncConfigAv1,
_codec_config_pad: [u32; 60],
reserved: [u32; 278],
reserved2: [*mut c_void; 64],
}
/// Output struct of `FnNvEncGetEncodePresetConfigEx`.
///
/// The caller sets `version` and `preset_cfg.version` before the call.
/// `NvencEncoder::new` wraps this in a larger struct with 16 KiB of trailing padding
/// before handing it to the driver.
#[repr(C)]
struct NvEncPresetConfig {
version: u32,
reserved: u32,
preset_cfg: NvEncConfig,
reserved1: [u32; 256],
reserved2: [*mut c_void; 64],
}
/// Parameters for `FnNvEncCreateInputBuffer`.
///
/// `NvencEncoder::new` sets `version`, `width`, `height` and `buffer_fmt`, and reads
/// the created handle back from `input_buffer` after a successful call.
#[repr(C)]
struct NvEncCreateInputBuffer {
version: u32,
width: u32,
height: u32,
memory_heap: u32,
buffer_fmt: u32,
reserved: u32,
// Out: handle of the created input buffer.
input_buffer: *mut c_void,
sys_mem_buffer: *mut c_void,
reserved1: [u32; 57],
reserved2: [*mut c_void; 63],
}
/// Parameters for `FnNvEncCreateBitstreamBuffer`.
///
/// `NvencEncoder::new` sets `version` and `size` (16 MiB) and reads the created
/// handle back from `bitstream_buffer` after a successful call.
#[repr(C)]
struct NvEncCreateBitstreamBuffer {
version: u32,
size: u32,
memory_heap: u32,
reserved: u32,
// Out: handle of the created bitstream buffer.
bitstream_buffer: *mut c_void,
bitstream_buffer_ptr: *mut c_void,
reserved1: [u32; 58],
reserved2: [*mut c_void; 64],
}
/// Parameters for `FnNvEncLockInputBuffer`.
// NOTE(review): presumably `input_buffer` is the handle to lock and
// `buffer_data_ptr` / `pitch` are filled by the driver; the call site is outside
// this view — confirm there.
#[repr(C)]
struct NvEncLockInputBuffer {
version: u32,
reserved1: u32,
input_buffer: *mut c_void,
buffer_data_ptr: *mut c_void,
pitch: u32,
reserved2: [u32; 251],
reserved3: [*mut c_void; 64],
}
/// Parameters for `FnNvEncLockBitstream`.
///
/// Field order and sizes define the C ABI layout and must not be changed.
// NOTE(review): which fields are inputs and which are filled by the driver is
// decided at the call site, which is outside this view.
#[repr(C)]
struct NvEncLockBitstream {
    version: u32,
    bitfields: u32,
    output_bitstream: *mut c_void,
    slice_offsets: *mut u32,
    frame_idx: u32,
    hw_encode_status: u32,
    num_slices: u32,
    bitstream_size_in_bytes: u32,
    output_time_stamp: u64,
    output_duration: u64,
    bitstream_buffer_ptr: *mut c_void,
    picture_type: u32,
    picture_struct: u32,
    frame_avg_qp: u32,
    frame_satd: u32,
    ltr_frame_idx: u32,
    ltr_frame_bitmap: u32,
    temporal_id: u32,
    intra_mb_count: u32,
    inter_mb_count: u32,
    average_mvx: i32,
    average_mvy: i32,
    alpha_layer_size_in_bytes: u32,
    output_stats_ptr_size: u32,
    reserved: u32,
    output_stats_ptr: *mut c_void,
    frame_idx_display: u32,
    reserved1: [u32; 219],
    reserved2: [*mut c_void; 63],
    reserved_internal: [u32; 8],
}
/// Per-picture parameters for `FnNvEncEncodePicture`.
// NOTE(review): layout presumably mirrors the vendor header, with `codec_pic_params`
// standing in as an opaque blob for the per-codec parameter union; the code that
// fills this struct is outside this view. Do not reorder fields.
#[repr(C)]
struct NvEncPicParams {
version: u32,
input_width: u32,
input_height: u32,
input_pitch: u32,
encode_pic_flags: u32,
frame_idx: u32,
input_timestamp: u64,
input_duration: u64,
input_buffer: *mut c_void,
output_bitstream: *mut c_void,
completion_event: *mut c_void,
buffer_fmt: u32,
picture_struct: u32,
picture_type: u32,
codec_pic_params: [u64; 193],
me_hint_counts_per_block: [u32; 8],
me_external_hints: *mut c_void,
reserved2: [u32; 7],
reserved5: [*mut c_void; 2],
qp_delta_map: *mut i8,
qp_delta_map_size: u32,
reserved_bitfields: u32,
me_hint_ref_pic_dist: [u16; 2],
reserved4: u32,
alpha_buffer: *mut c_void,
me_external_sb_hints: *mut c_void,
me_sb_hints_count: u32,
state_buffer_idx: u32,
output_recon_buffer: *mut c_void,
reserved3: [u32; 284],
reserved6: [*mut c_void; 57],
}
/// Table of NVENC entry points filled in by `FnNvEncodeAPICreateInstance`.
///
/// The caller sets `version` before the call. Every entry is kept as an untyped
/// pointer; `NvencEncoder::new` null-checks the ones it needs and transmutes them to
/// the matching `FnNvEnc*` alias.
// NOTE(review): each slot's position presumably matches the vendor header — keep
// unused entries in place so the later offsets stay correct.
#[repr(C)]
struct NvEncFunctionList {
version: u32,
reserved: u32,
nv_enc_open_encode_session: *mut c_void,
nv_enc_get_encode_guid_count: *mut c_void,
nv_enc_get_encode_profile_guid_count: *mut c_void,
nv_enc_get_encode_profile_guids: *mut c_void,
nv_enc_get_encode_guids: *mut c_void,
nv_enc_get_input_format_count: *mut c_void,
nv_enc_get_input_formats: *mut c_void,
nv_enc_get_encode_caps: *mut c_void,
nv_enc_get_encode_preset_count: *mut c_void,
nv_enc_get_encode_preset_guids: *mut c_void,
nv_enc_get_encode_preset_config: *mut c_void,
nv_enc_initialize_encoder: *mut c_void,
nv_enc_create_input_buffer: *mut c_void,
nv_enc_destroy_input_buffer: *mut c_void,
nv_enc_create_bitstream_buffer: *mut c_void,
nv_enc_destroy_bitstream_buffer: *mut c_void,
nv_enc_encode_picture: *mut c_void,
nv_enc_lock_bitstream: *mut c_void,
nv_enc_unlock_bitstream: *mut c_void,
nv_enc_lock_input_buffer: *mut c_void,
nv_enc_unlock_input_buffer: *mut c_void,
nv_enc_get_encode_stats: *mut c_void,
nv_enc_get_sequence_params: *mut c_void,
nv_enc_register_async_event: *mut c_void,
nv_enc_unregister_async_event: *mut c_void,
nv_enc_map_input_resource: *mut c_void,
nv_enc_unmap_input_resource: *mut c_void,
nv_enc_destroy_encoder: *mut c_void,
nv_enc_invalidate_ref_frames: *mut c_void,
nv_enc_open_encode_session_ex: *mut c_void,
nv_enc_register_resource: *mut c_void,
nv_enc_unregister_resource: *mut c_void,
nv_enc_reconfigure_encoder: *mut c_void,
reserved1: *mut c_void,
nv_enc_create_mv_buffer: *mut c_void,
nv_enc_destroy_mv_buffer: *mut c_void,
nv_enc_run_motion_estimation_only: *mut c_void,
nv_enc_get_last_error_string: *mut c_void,
nv_enc_set_io_cuda_streams: *mut c_void,
nv_enc_get_encode_preset_config_ex: *mut c_void,
nv_enc_get_sequence_param_ex: *mut c_void,
nv_enc_restore_encoder_state: *mut c_void,
nv_enc_lookahead_picture: *mut c_void,
reserved2: [*mut c_void; 275],
}
/// Version word written to `NvEncFunctionList::version` before requesting the table.
const NV_ENCODE_API_FUNCTION_LIST_VER: u32 = struct_version(2);
// The two entry points resolved by symbol name from the encode library.
type FnNvEncodeAPIGetMaxSupportedVersion = unsafe extern "C" fn(*mut u32) -> c_uint;
type FnNvEncodeAPICreateInstance = unsafe extern "C" fn(*mut NvEncFunctionList) -> c_uint;
// Typed signatures for the `NvEncFunctionList` slots this file uses. Where present,
// the leading `*mut c_void` is the encoder session handle. All return an NVENC
// status code (`NV_ENC_SUCCESS` on success).
type FnNvEncGetEncodeGUIDCount = unsafe extern "C" fn(*mut c_void, *mut u32) -> c_uint;
type FnNvEncGetEncodeGUIDs =
unsafe extern "C" fn(*mut c_void, *mut Guid, u32, *mut u32) -> c_uint;
type FnNvEncGetEncodeCaps =
unsafe extern "C" fn(*mut c_void, Guid, *mut NvEncCapsParam, *mut c_int) -> c_uint;
// Takes no session handle: it produces one through the second argument.
type FnNvEncOpenEncodeSessionEx =
unsafe extern "C" fn(*mut NvEncOpenEncodeSessionExParams, *mut *mut c_void) -> c_uint;
type FnNvEncInitializeEncoder =
unsafe extern "C" fn(*mut c_void, *mut NvEncInitializeParams) -> c_uint;
type FnNvEncCreateInputBuffer =
unsafe extern "C" fn(*mut c_void, *mut NvEncCreateInputBuffer) -> c_uint;
type FnNvEncDestroyInputBuffer = unsafe extern "C" fn(*mut c_void, *mut c_void) -> c_uint;
type FnNvEncCreateBitstreamBuffer =
unsafe extern "C" fn(*mut c_void, *mut NvEncCreateBitstreamBuffer) -> c_uint;
type FnNvEncDestroyBitstreamBuffer = unsafe extern "C" fn(*mut c_void, *mut c_void) -> c_uint;
type FnNvEncLockInputBuffer =
unsafe extern "C" fn(*mut c_void, *mut NvEncLockInputBuffer) -> c_uint;
type FnNvEncUnlockInputBuffer = unsafe extern "C" fn(*mut c_void, *mut c_void) -> c_uint;
type FnNvEncEncodePicture = unsafe extern "C" fn(*mut c_void, *mut NvEncPicParams) -> c_uint;
type FnNvEncLockBitstream = unsafe extern "C" fn(*mut c_void, *mut NvEncLockBitstream) -> c_uint;
type FnNvEncUnlockBitstream = unsafe extern "C" fn(*mut c_void, *mut c_void) -> c_uint;
type FnNvEncDestroyEncoder = unsafe extern "C" fn(*mut c_void) -> c_uint;
// Arguments: session, codec GUID, preset GUID, tuning info, output preset config.
type FnNvEncGetEncodePresetConfigEx =
unsafe extern "C" fn(*mut c_void, Guid, Guid, u32, *mut NvEncPresetConfig) -> c_uint;
/// Owns one NVENC session: the encoder handle, its ring of input/bitstream buffers,
/// the CUDA context it was opened on, and the function pointers needed to drive and
/// tear it down. Its `Drop` impl releases all of them.
struct EncodeSession {
// Session handle produced by `FnNvEncOpenEncodeSessionEx`.
encoder: *mut c_void,
// `RING_SIZE` buffer handles each; null entries are skipped on drop.
input_buffers: [*mut c_void; RING_SIZE],
bitstream_buffers: [*mut c_void; RING_SIZE],
// CUDA context created for this session; destroyed last on drop.
cuda_ctx: CUcontext,
// Copied from the encoder configuration at construction time.
width: u32,
height: u32,
buffer_format: c_uint,
fn_destroy_input_buffer: FnNvEncDestroyInputBuffer,
fn_destroy_bitstream_buffer: FnNvEncDestroyBitstreamBuffer,
fn_lock_input_buffer: FnNvEncLockInputBuffer,
fn_unlock_input_buffer: FnNvEncUnlockInputBuffer,
fn_encode_picture: FnNvEncEncodePicture,
fn_lock_bitstream: FnNvEncLockBitstream,
fn_unlock_bitstream: FnNvEncUnlockBitstream,
fn_destroy_encoder: FnNvEncDestroyEncoder,
fn_cu_ctx_destroy: FnCuCtxDestroy,
fn_cu_ctx_push: FnCuCtxPushCurrent,
fn_cu_ctx_pop: FnCuCtxPopCurrent,
}
// The raw pointers make the struct `!Send` by default; this opts back in.
// NOTE(review): soundness presumably relies on the handles being usable from any
// thread once the CUDA context is pushed via `ctx_scope` — confirm against the
// NVENC/CUDA threading rules.
unsafe impl Send for EncodeSession {}
impl EncodeSession {
    /// Pushes this session's CUDA context and returns a guard that pops it when
    /// dropped.
    ///
    /// # Safety
    /// `cuda_ctx` and the stored push/pop function pointers must still be valid.
    ///
    /// # Errors
    /// Propagates the error from [`CtxScope::push`] when the push call fails.
    unsafe fn ctx_scope(&self) -> Result<CtxScope> {
        let Self {
            cuda_ctx,
            fn_cu_ctx_push,
            fn_cu_ctx_pop,
            ..
        } = self;
        unsafe { CtxScope::push(*cuda_ctx, *fn_cu_ctx_push, *fn_cu_ctx_pop) }
    }
}
impl Drop for EncodeSession {
    /// Releases the session's resources in order: buffer pairs from the last ring
    /// slot to the first, then the encoder, then the CUDA context.
    ///
    /// Teardown is best-effort: a failed context push is ignored and the status
    /// codes of the destroy calls are discarded.
    fn drop(&mut self) {
        unsafe {
            // Keep the context current while the NVENC objects are destroyed. If
            // the push fails the guard is `None` and teardown proceeds anyway.
            let ctx_guard =
                CtxScope::push(self.cuda_ctx, self.fn_cu_ctx_push, self.fn_cu_ctx_pop).ok();

            let slots = self
                .input_buffers
                .iter()
                .zip(self.bitstream_buffers.iter())
                .rev();
            for (&input, &output) in slots {
                if !input.is_null() {
                    (self.fn_destroy_input_buffer)(self.encoder, input);
                }
                if !output.is_null() {
                    (self.fn_destroy_bitstream_buffer)(self.encoder, output);
                }
            }

            if !self.encoder.is_null() {
                (self.fn_destroy_encoder)(self.encoder);
            }

            // Pop the context before destroying it.
            drop(ctx_guard);

            if !self.cuda_ctx.is_null() {
                (self.fn_cu_ctx_destroy)(self.cuda_ctx);
            }
        }
    }
}
/// Guard for a pushed CUDA context: created by `CtxScope::push`, and its `Drop`
/// impl calls the stored pop function.
struct CtxScope {
// Pop function invoked when the guard is dropped.
pop: FnCuCtxPopCurrent,
}
impl CtxScope {
    /// Calls `push(ctx)` and, on success, returns a guard that will call `pop` when
    /// dropped.
    ///
    /// # Safety
    /// `ctx` must be a context that `push` accepts, and both function pointers must
    /// be valid for the lifetime of the returned guard.
    ///
    /// # Errors
    /// Fails with "cuCtxPushCurrent failed" when `push` returns a non-zero status.
    unsafe fn push(
        ctx: CUcontext,
        push: FnCuCtxPushCurrent,
        pop: FnCuCtxPopCurrent,
    ) -> Result<Self> {
        let status = unsafe { push(ctx) };
        if status == 0 {
            Ok(Self { pop })
        } else {
            bail!("cuCtxPushCurrent failed")
        }
    }
}
impl Drop for CtxScope {
    /// Calls the stored pop function; the popped context handle and the status
    /// code are both discarded.
    fn drop(&mut self) {
        let mut discarded: CUcontext = ptr::null_mut();
        // SAFETY: `pop` is the function pointer stored by `CtxScope::push`, and
        // `discarded` is a valid out-location for the popped handle.
        let _status = unsafe { (self.pop)(&mut discarded) };
    }
}
/// Maps a frame pixel format to the NVENC input buffer format constant.
///
/// # Errors
/// Returns an error for anything other than `Yuv420p` or `Yuv420p10le`.
fn nvenc_buffer_format_for(fmt: PixelFormat) -> Result<c_uint> {
    let nv_format = match fmt {
        PixelFormat::Yuv420p => NV_ENC_BUFFER_FORMAT_IYUV,
        PixelFormat::Yuv420p10le => NV_ENC_BUFFER_FORMAT_YUV420_10BIT,
        other => bail!(
            "NVENC AV1 expects Yuv420p or Yuv420p10le, got {other:?} \
             (4:2:2 / 4:4:4 / RGB / alpha not supported on this backend)"
        ),
    };
    Ok(nv_format)
}
/// Returns the sample bit depth minus 8: 2 for `Yuv420p10le`, 0 for every other
/// format.
const fn pixel_bit_depth_minus8_for(fmt: PixelFormat) -> u32 {
    if matches!(fmt, PixelFormat::Yuv420p10le) {
        2
    } else {
        0
    }
}
/// Converts a [`TransferFn`] into the numeric transfer-characteristics code written
/// to `NvEncConfigAv1::transfer_characteristics`.
///
/// `Unspecified` is emitted with the same code as `Bt709`.
fn transfer_to_h273(tf: TransferFn) -> u32 {
    match tf {
        TransferFn::Bt709 | TransferFn::Unspecified => 1,
        // NOTE(review): H.273 lists code 4 as gamma 2.2 (BT.470 System M) and code 5
        // as gamma 2.8 (BT.470 System B/G) — confirm that 4 is intended here.
        TransferFn::Bt470Bg => 4,
        TransferFn::Linear => 8,
        TransferFn::St2084 => 16,
        TransferFn::AribStdB67 => 18,
    }
}
/// Converts a floating-point frame rate into a `(numerator, denominator)` pair.
///
/// Resolution order:
/// 1. Non-finite or non-positive input yields the fallback `(1, 1000)`.
/// 2. Well-known rates (within `1e-3`) map to their exact rational form, e.g.
///    `29.97 -> (30000, 1001)`.
/// 3. Whole-number rates of at least 1 map to `(n, 1)`.
/// 4. Rates close to `k / 1001` map to `(k, 1001)`. The tolerance scales with the
///    rate so that two-decimal NTSC multiples such as `119.88` or `239.76` are
///    recognised; it never drops below the original absolute `1e-4`.
/// 5. Anything else is expressed in thousandths, with a numerator of at least 1.
///
/// The returned numerator is never zero.
fn fps_to_rational(fps: f64) -> (u32, u32) {
    const EXACT: &[(f64, u32, u32)] = &[
        (23.976, 24_000, 1001),
        (24.0, 24, 1),
        (25.0, 25, 1),
        (29.97, 30_000, 1001),
        (30.0, 30, 1),
        (48.0, 48, 1),
        (50.0, 50, 1),
        (59.94, 60_000, 1001),
        (60.0, 60, 1),
    ];
    const FALLBACK: (u32, u32) = (1, 1000);

    // NaN, infinities, zero and negative rates cannot describe a frame rate; return
    // the same minimal value that zero, negative and NaN input already produced.
    if !fps.is_finite() || fps <= 0.0 {
        return FALLBACK;
    }

    if let Some(&(_, n, d)) = EXACT.iter().find(|&&(f, _, _)| (fps - f).abs() < 1e-3) {
        return (n, d);
    }

    // Require the rounded value to be at least 1 so that a tiny positive rate
    // cannot yield a zero numerator.
    let whole = fps.round();
    if whole >= 1.0 && (fps - whole).abs() < 1e-6 {
        return (whole as u32, 1);
    }

    // A rate written with two or three decimals differs from the true k/1001 value
    // by roughly 1e-6 relative, so a fixed absolute tolerance rejects high rates.
    let k = (fps * 1001.0).round();
    let tolerance = (fps * 2e-6).max(1e-4);
    if k > 0.0 && (k / 1001.0 - fps).abs() < tolerance {
        return (k as u32, 1001);
    }

    let num = (fps * 1000.0).round().max(1.0) as u32;
    (num, 1000)
}
/// AV1 encoder backed by NVENC, constructed with `NvencEncoder::new`.
pub struct NvencEncoder {
// Configuration this encoder was created with.
config: EncoderConfig,
// Live NVENC session. NOTE(review): presumably `None` once the session has been
// torn down — the code managing this is outside this view.
session: Option<EncodeSession>,
pending_frames: Vec<VideoFrame>,
encoded_packets: Vec<EncodedPacket>,
flushed: bool,
packet_cursor: usize,
frame_counter: u32,
// NOTE(review): presumably the next slot in the `RING_SIZE` buffer ring — confirm
// in the encode path.
ring_idx: usize,
last_drained_frame_idx: [i64; RING_SIZE],
// Handles of the dynamically loaded encode and CUDA libraries. They are declared
// after `session`, so the session (and its function pointers into these
// libraries) is dropped before the libraries are.
_encode_lib: libloading::Library,
_cuda_lib: libloading::Library,
}
impl NvencEncoder {
pub fn new(config: EncoderConfig, gpu_index: u32) -> Result<Self> {
let _init_guard = crate::cuda_lock::lock_for_cuda_init();
tracing::info!(
event = "nvenc.init.start",
gpu_index,
width = config.width,
height = config.height,
?config.target,
?config.tier,
?config.pixel_format,
"NVENC init starting"
);
let encode_lib = unsafe { libloading::Library::new("libnvidia-encode.so") }
.or_else(|_| unsafe { libloading::Library::new("libnvidia-encode.so.1") })
.or_else(|_| unsafe { libloading::Library::new("nvEncodeAPI64.dll") })
.context("loading NVIDIA encode library")?;
let cuda_lib = unsafe { libloading::Library::new("libcuda.so") }
.or_else(|_| unsafe { libloading::Library::new("libcuda.so.1") })
.or_else(|_| unsafe { libloading::Library::new("nvcuda.dll") })
.context("loading CUDA driver for NVENC")?;
unsafe {
let get_version: libloading::Symbol<FnNvEncodeAPIGetMaxSupportedVersion> = encode_lib
.get(b"NvEncodeAPIGetMaxSupportedVersion")
.context("missing NvEncodeAPIGetMaxSupportedVersion")?;
let mut version: u32 = 0;
if get_version(&mut version) != NV_ENC_SUCCESS {
bail!("NvEncodeAPIGetMaxSupportedVersion failed");
}
let driver_major = version >> 4;
let driver_minor = version & 0xF;
tracing::info!(
major = driver_major,
minor = driver_minor,
"NVENC driver API version"
);
if driver_major < 12 {
bail!(
"NVENC driver API < 12 does not support AV1 (got {driver_major}.{driver_minor})"
);
}
let create_instance: libloading::Symbol<FnNvEncodeAPICreateInstance> = encode_lib
.get(b"NvEncodeAPICreateInstance")
.context("missing NvEncodeAPICreateInstance")?;
let mut fn_list: NvEncFunctionList = std::mem::zeroed();
fn_list.version = NV_ENCODE_API_FUNCTION_LIST_VER;
if create_instance(&mut fn_list) != NV_ENC_SUCCESS {
bail!("NvEncodeAPICreateInstance failed");
}
tracing::info!(event = "nvenc.cuda.cuInit", gpu_index, "cuInit");
let cu_init: libloading::Symbol<FnCuInit> = cuda_lib.get(b"cuInit")?;
if cu_init(0) != 0 {
tracing::error!(
event = "nvenc.cuda.error",
fn_name = "cuInit",
gpu_index,
"cuInit failed"
);
bail!("cuInit failed");
}
tracing::info!(event = "nvenc.cuda.cuDeviceGet", gpu_index, "cuDeviceGet");
let cu_device_get: libloading::Symbol<FnCuDeviceGet> = cuda_lib.get(b"cuDeviceGet")?;
let mut device: CUdevice = 0;
if cu_device_get(&mut device, gpu_index as c_int) != 0 {
tracing::error!(
event = "nvenc.cuda.error",
fn_name = "cuDeviceGet",
gpu_index,
"cuDeviceGet failed"
);
bail!("cuDeviceGet failed for GPU {gpu_index}");
}
tracing::info!(
event = "nvenc.cuda.cuCtxCreate",
gpu_index,
width = config.width,
height = config.height,
"cuCtxCreate (5-way contention candidate)"
);
let cu_ctx_create: libloading::Symbol<FnCuCtxCreate> =
cuda_lib.get(b"cuCtxCreate_v2")?;
let mut cuda_ctx: CUcontext = ptr::null_mut();
if cu_ctx_create(&mut cuda_ctx, 0, device) != 0 {
tracing::error!(
event = "nvenc.cuda.error",
fn_name = "cuCtxCreate",
gpu_index,
"cuCtxCreate failed"
);
bail!("cuCtxCreate failed");
}
tracing::info!(event = "nvenc.cuda.ok", gpu_index, "CUDA context created");
let fn_cu_ctx_destroy: libloading::Symbol<FnCuCtxDestroy> =
cuda_lib.get(b"cuCtxDestroy_v2")?;
let fn_cu_ctx_push: libloading::Symbol<FnCuCtxPushCurrent> =
cuda_lib.get(b"cuCtxPushCurrent_v2")?;
let fn_cu_ctx_pop: libloading::Symbol<FnCuCtxPopCurrent> =
cuda_lib.get(b"cuCtxPopCurrent_v2")?;
macro_rules! cast_fn {
($field:expr, $ty:ty, $name:literal) => {{
if $field.is_null() {
bail!(concat!("NVENC fn-list missing ", $name));
}
std::mem::transmute::<*mut c_void, $ty>($field)
}};
}
let fn_open_session: FnNvEncOpenEncodeSessionEx = cast_fn!(
fn_list.nv_enc_open_encode_session_ex,
FnNvEncOpenEncodeSessionEx,
"OpenEncodeSessionEx"
);
let fn_initialize_encoder: FnNvEncInitializeEncoder = cast_fn!(
fn_list.nv_enc_initialize_encoder,
FnNvEncInitializeEncoder,
"InitializeEncoder"
);
let fn_create_input_buffer: FnNvEncCreateInputBuffer = cast_fn!(
fn_list.nv_enc_create_input_buffer,
FnNvEncCreateInputBuffer,
"CreateInputBuffer"
);
let fn_destroy_input_buffer: FnNvEncDestroyInputBuffer = cast_fn!(
fn_list.nv_enc_destroy_input_buffer,
FnNvEncDestroyInputBuffer,
"DestroyInputBuffer"
);
let fn_create_bitstream_buffer: FnNvEncCreateBitstreamBuffer = cast_fn!(
fn_list.nv_enc_create_bitstream_buffer,
FnNvEncCreateBitstreamBuffer,
"CreateBitstreamBuffer"
);
let fn_destroy_bitstream_buffer: FnNvEncDestroyBitstreamBuffer = cast_fn!(
fn_list.nv_enc_destroy_bitstream_buffer,
FnNvEncDestroyBitstreamBuffer,
"DestroyBitstreamBuffer"
);
let fn_lock_input_buffer: FnNvEncLockInputBuffer = cast_fn!(
fn_list.nv_enc_lock_input_buffer,
FnNvEncLockInputBuffer,
"LockInputBuffer"
);
let fn_unlock_input_buffer: FnNvEncUnlockInputBuffer = cast_fn!(
fn_list.nv_enc_unlock_input_buffer,
FnNvEncUnlockInputBuffer,
"UnlockInputBuffer"
);
let fn_encode_picture: FnNvEncEncodePicture = cast_fn!(
fn_list.nv_enc_encode_picture,
FnNvEncEncodePicture,
"EncodePicture"
);
let fn_lock_bitstream: FnNvEncLockBitstream = cast_fn!(
fn_list.nv_enc_lock_bitstream,
FnNvEncLockBitstream,
"LockBitstream"
);
let fn_unlock_bitstream: FnNvEncUnlockBitstream = cast_fn!(
fn_list.nv_enc_unlock_bitstream,
FnNvEncUnlockBitstream,
"UnlockBitstream"
);
let fn_destroy_encoder: FnNvEncDestroyEncoder = cast_fn!(
fn_list.nv_enc_destroy_encoder,
FnNvEncDestroyEncoder,
"DestroyEncoder"
);
let fn_get_preset_config_ex: FnNvEncGetEncodePresetConfigEx = cast_fn!(
fn_list.nv_enc_get_encode_preset_config_ex,
FnNvEncGetEncodePresetConfigEx,
"GetEncodePresetConfigEx"
);
let fn_get_guid_count: FnNvEncGetEncodeGUIDCount = cast_fn!(
fn_list.nv_enc_get_encode_guid_count,
FnNvEncGetEncodeGUIDCount,
"GetEncodeGUIDCount"
);
let fn_get_guids: FnNvEncGetEncodeGUIDs = cast_fn!(
fn_list.nv_enc_get_encode_guids,
FnNvEncGetEncodeGUIDs,
"GetEncodeGUIDs"
);
let fn_get_encode_caps: FnNvEncGetEncodeCaps = cast_fn!(
fn_list.nv_enc_get_encode_caps,
FnNvEncGetEncodeCaps,
"GetEncodeCaps"
);
let mut open_params: NvEncOpenEncodeSessionExParams = std::mem::zeroed();
open_params.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
open_params.device_type = NV_ENC_DEVICE_TYPE_CUDA;
open_params.device = cuda_ctx;
open_params.api_version = NVENCAPI_VERSION;
let mut encoder: *mut c_void = ptr::null_mut();
tracing::info!(
event = "nvenc.ffi.call",
fn_name = "NvEncOpenEncodeSessionEx",
gpu_index,
width = config.width,
height = config.height,
"calling NvEncOpenEncodeSessionEx (parallel-init candidate)"
);
let rc = fn_open_session(&mut open_params, &mut encoder);
if rc != NV_ENC_SUCCESS {
tracing::error!(
event = "nvenc.ffi.error",
fn_name = "NvEncOpenEncodeSessionEx",
rc,
gpu_index,
width = config.width,
height = config.height,
"NVENC FFI failed"
);
(*fn_cu_ctx_destroy)(cuda_ctx);
bail!("NvEncOpenEncodeSessionEx failed: {rc}");
}
tracing::info!(
event = "nvenc.ffi.ok",
fn_name = "NvEncOpenEncodeSessionEx",
gpu_index,
width = config.width,
height = config.height,
"NvEncOpenEncodeSessionEx OK — session handle acquired"
);
let cap_err: Option<String> = {
let mut guid_count: u32 = 0;
if fn_get_guid_count(encoder, &mut guid_count) != NV_ENC_SUCCESS {
Some(format!("NvEncGetEncodeGUIDCount failed on GPU {gpu_index}"))
} else {
let mut guids = vec![
Guid { data1: 0, data2: 0, data3: 0, data4: [0u8; 8] };
guid_count.max(1) as usize
];
let mut returned: u32 = 0;
if fn_get_guids(encoder, guids.as_mut_ptr(), guid_count, &mut returned)
!= NV_ENC_SUCCESS
{
Some(format!("NvEncGetEncodeGUIDs failed on GPU {gpu_index}"))
} else if !guids[..returned as usize]
.iter()
.any(|g| *g == NV_ENC_CODEC_AV1_GUID)
{
Some(format!(
"NVENC on GPU {gpu_index} does not support AV1 encode \
({returned} codec(s) advertised, none AV1) — needs NVIDIA \
Ada+ or an Ampere datacenter SKU"
))
} else {
let query = |cap: c_uint| -> i32 {
let mut p: NvEncCapsParam = std::mem::zeroed();
p.version = struct_version(1);
p.caps_to_query = cap;
let mut val: c_int = 0;
let rc =
fn_get_encode_caps(encoder, NV_ENC_CODEC_AV1_GUID, &mut p, &mut val);
if rc != NV_ENC_SUCCESS { -1 } else { val }
};
let w_max = query(NV_ENC_CAPS_WIDTH_MAX);
let h_max = query(NV_ENC_CAPS_HEIGHT_MAX);
if w_max > 0
&& h_max > 0
&& ((config.width as i32) > w_max || (config.height as i32) > h_max)
{
Some(format!(
"NVENC AV1 on GPU {gpu_index} maxes at {w_max}x{h_max}, \
requested {}x{}",
config.width, config.height
))
} else if config.pixel_format == PixelFormat::Yuv420p10le
&& query(NV_ENC_CAPS_SUPPORT_10BIT_ENCODE) == 0
{
Some(format!(
"NVENC on GPU {gpu_index} does not support 10-bit AV1 encode"
))
} else {
tracing::info!(
gpu_index,
av1 = true,
w_max,
h_max,
ten_bit = config.pixel_format == PixelFormat::Yuv420p10le,
"NVENC AV1 capability validated"
);
None
}
}
}
};
if let Some(msg) = cap_err {
fn_destroy_encoder(encoder);
(*fn_cu_ctx_destroy)(cuda_ctx);
bail!("{msg}");
}
let tp =
tuning::nvenc_av1_params(config.target, config.tier, config.width, config.height);
let nvenc_cq = if config.quality == AUTO_FROM_TARGET {
tp.cq
} else {
config.quality.min(63)
};
let preset_guid = guid_from_bytes(tp.preset_guid);
#[repr(C)]
struct NvEncPresetConfigPadded {
base: NvEncPresetConfig,
_overflow_pad: [u8; 16384],
}
let mut padded: NvEncPresetConfigPadded = std::mem::zeroed();
padded.base.version = NV_ENC_PRESET_CONFIG_VER;
padded.base.preset_cfg.version = NV_ENC_CONFIG_VER;
tracing::info!(
event = "nvenc.ffi.call",
fn_name = "NvEncGetEncodePresetConfigEx",
gpu_index,
width = config.width,
height = config.height,
buffer_size = std::mem::size_of::<NvEncPresetConfigPadded>(),
"calling NvEncGetEncodePresetConfigEx (16 KiB over-allocated buffer)"
);
let rc = fn_get_preset_config_ex(
encoder,
NV_ENC_CODEC_AV1_GUID,
preset_guid,
tp.tuning_info,
&mut padded.base,
);
let preset_cfg = &padded.base;
if rc != NV_ENC_SUCCESS {
tracing::error!(
event = "nvenc.ffi.error",
fn_name = "NvEncGetEncodePresetConfigEx",
rc,
gpu_index,
width = config.width,
height = config.height,
"NvEncGetEncodePresetConfigEx failed"
);
(fn_destroy_encoder)(encoder);
(*fn_cu_ctx_destroy)(cuda_ctx);
bail!("NvEncGetEncodePresetConfigEx failed: {rc}");
}
tracing::info!(
event = "nvenc.ffi.ok",
fn_name = "NvEncGetEncodePresetConfigEx",
gpu_index,
width = config.width,
height = config.height,
"NvEncGetEncodePresetConfigEx OK"
);
let mut enc_config: NvEncConfig = std::ptr::read(&preset_cfg.preset_cfg);
enc_config.version = NV_ENC_CONFIG_VER;
enc_config.gop_length = config.keyframe_interval;
enc_config.frame_interval_p = 1; enc_config.mv_precision = 3;
enc_config.rc_params.version = struct_version(1);
match config.target {
QualityTarget::VisuallyLossless => {
let low = (nvenc_cq as u32).clamp(8, 12);
enc_config.rc_params.rate_control_mode = NV_ENC_PARAMS_RC_CONSTQP;
enc_config.rc_params.const_qp_intra = low;
enc_config.rc_params.const_qp_inter_p = low.saturating_add(1);
enc_config.rc_params.const_qp_inter_b = low.saturating_add(2);
enc_config.rc_params.target_quality = low as u8;
}
_ => {
let rc_mode = match tp.rc_mode {
NvencRateControl::ConstQp => NV_ENC_PARAMS_RC_CONSTQP,
NvencRateControl::VbrTargetQuality => NV_ENC_PARAMS_RC_VBR,
};
enc_config.rc_params.rate_control_mode = rc_mode;
enc_config.rc_params.target_quality = nvenc_cq.min(51);
enc_config.rc_params.target_quality_lsb = 0;
enc_config.rc_params.const_qp_intra = nvenc_cq as u32;
enc_config.rc_params.const_qp_inter_p = (nvenc_cq as u32).saturating_add(2);
enc_config.rc_params.const_qp_inter_b = (nvenc_cq as u32).saturating_add(4);
}
}
if config.constant_qp {
let q = nvenc_cq as u32;
enc_config.rc_params.rate_control_mode = NV_ENC_PARAMS_RC_CONSTQP;
enc_config.rc_params.const_qp_intra = q;
enc_config.rc_params.const_qp_inter_p = q.saturating_add(1);
enc_config.rc_params.const_qp_inter_b = q.saturating_add(2);
enc_config.rc_params.target_quality = q.min(255) as u8;
}
let buffer_format = nvenc_buffer_format_for(config.pixel_format)?;
let bit_depth_minus8 = pixel_bit_depth_minus8_for(config.pixel_format);
let bit_depth_enum = if bit_depth_minus8 == 0 { 0 } else { 1 };
enc_config.codec_config_av1.flags = AV1_BIT_REPEAT_SEQ_HDR | AV1_CHROMA_FORMAT_IDC_420;
enc_config.codec_config_av1.idr_period = config.keyframe_interval;
enc_config.codec_config_av1.max_num_ref_frames_in_dpb = 4;
enc_config.codec_config_av1.num_tile_columns = tp.num_tile_columns;
enc_config.codec_config_av1.num_tile_rows = tp.num_tile_rows;
enc_config.codec_config_av1.output_bit_depth = bit_depth_enum;
enc_config.codec_config_av1.input_bit_depth = bit_depth_enum;
let cm = &config.color_metadata;
enc_config.codec_config_av1.color_primaries = cm.colour_primaries as u32;
enc_config.codec_config_av1.transfer_characteristics = transfer_to_h273(cm.transfer);
enc_config.codec_config_av1.matrix_coefficients = cm.matrix_coefficients as u32;
enc_config.codec_config_av1.color_range = cm.full_range as u32;
let mut init_params: NvEncInitializeParams = std::mem::zeroed();
init_params.version = NV_ENC_INITIALIZE_PARAMS_VER;
init_params.encode_guid = NV_ENC_CODEC_AV1_GUID;
init_params.preset_guid = preset_guid;
init_params.encode_width = config.width;
init_params.encode_height = config.height;
init_params.dar_width = config.width;
init_params.dar_height = config.height;
let (num, den) = fps_to_rational(config.frame_rate);
init_params.frame_rate_num = num;
init_params.frame_rate_den = den;
init_params.enable_encode_async = 0;
init_params.enable_ptd = 1;
init_params.max_encode_width = config.width;
init_params.max_encode_height = config.height;
init_params.tuning_info = tp.tuning_info;
init_params.buffer_format = buffer_format;
init_params.encode_config = (&mut enc_config) as *mut NvEncConfig as *mut c_void;
tracing::info!(
width = config.width,
height = config.height,
target = ?config.target,
tier = ?config.tier,
cq = nvenc_cq,
rc_mode = enc_config.rc_params.rate_control_mode,
tile_cols = tp.num_tile_columns,
tile_rows = tp.num_tile_rows,
frame_rate_num = num,
frame_rate_den = den,
"NVENC AV1 tuning applied"
);
tracing::info!(
event = "nvenc.ffi.call",
fn_name = "NvEncInitializeEncoder",
width = config.width,
height = config.height,
gpu_index,
"calling NvEncInitializeEncoder (4K segfault candidate)"
);
let rc = fn_initialize_encoder(encoder, &mut init_params);
if rc != NV_ENC_SUCCESS {
tracing::error!(
event = "nvenc.ffi.error",
fn_name = "NvEncInitializeEncoder",
rc,
width = config.width,
height = config.height,
gpu_index,
"NvEncInitializeEncoder failed"
);
(fn_destroy_encoder)(encoder);
(*fn_cu_ctx_destroy)(cuda_ctx);
bail!("NvEncInitializeEncoder failed: {rc}");
}
tracing::info!(
event = "nvenc.ffi.ok",
fn_name = "NvEncInitializeEncoder",
width = config.width,
height = config.height,
"NvEncInitializeEncoder OK"
);
let mut input_buffers: [*mut c_void; RING_SIZE] = [ptr::null_mut(); RING_SIZE];
let mut bitstream_buffers: [*mut c_void; RING_SIZE] = [ptr::null_mut(); RING_SIZE];
let cleanup_partial =
|allocated: usize,
inputs: &[*mut c_void; RING_SIZE],
outputs: &[*mut c_void; RING_SIZE]| {
for i in (0..allocated).rev() {
if !inputs[i].is_null() {
(fn_destroy_input_buffer)(encoder, inputs[i]);
}
if !outputs[i].is_null() {
(fn_destroy_bitstream_buffer)(encoder, outputs[i]);
}
}
};
for i in 0..RING_SIZE {
let mut input_desc: NvEncCreateInputBuffer = std::mem::zeroed();
input_desc.version = NV_ENC_CREATE_INPUT_BUFFER_VER;
input_desc.width = config.width;
input_desc.height = config.height;
input_desc.buffer_fmt = buffer_format;
let rc = fn_create_input_buffer(encoder, &mut input_desc);
if rc != NV_ENC_SUCCESS {
tracing::error!(
event = "nvenc.ffi.error",
fn_name = "NvEncCreateInputBuffer",
slot = i,
rc,
width = config.width,
height = config.height,
"NvEncCreateInputBuffer failed"
);
cleanup_partial(i, &input_buffers, &bitstream_buffers);
(fn_destroy_encoder)(encoder);
(*fn_cu_ctx_destroy)(cuda_ctx);
bail!("NvEncCreateInputBuffer (slot {i}) failed: {rc}");
}
input_buffers[i] = input_desc.input_buffer;
let mut bitstream_desc: NvEncCreateBitstreamBuffer = std::mem::zeroed();
bitstream_desc.version = NV_ENC_CREATE_BITSTREAM_BUFFER_VER;
bitstream_desc.size = 16 * 1024 * 1024;
let rc = fn_create_bitstream_buffer(encoder, &mut bitstream_desc);
if rc != NV_ENC_SUCCESS {
tracing::error!(
event = "nvenc.ffi.error",
fn_name = "NvEncCreateBitstreamBuffer",
slot = i,
rc,
width = config.width,
height = config.height,
"NvEncCreateBitstreamBuffer failed"
);
cleanup_partial(i + 1, &input_buffers, &bitstream_buffers);
(fn_destroy_encoder)(encoder);
(*fn_cu_ctx_destroy)(cuda_ctx);
bail!("NvEncCreateBitstreamBuffer (slot {i}) failed: {rc}");
}
bitstream_buffers[i] = bitstream_desc.bitstream_buffer;
}
tracing::info!(
event = "nvenc.init.complete",
gpu_index,
width = config.width,
height = config.height,
ring_size = RING_SIZE,
"NVENC encoder ready (init complete)"
);
let session = EncodeSession {
encoder,
input_buffers,
bitstream_buffers,
cuda_ctx,
width: config.width,
height: config.height,
buffer_format,
fn_destroy_input_buffer,
fn_destroy_bitstream_buffer,
fn_lock_input_buffer,
fn_unlock_input_buffer,
fn_encode_picture,
fn_lock_bitstream,
fn_unlock_bitstream,
fn_destroy_encoder,
fn_cu_ctx_destroy: *fn_cu_ctx_destroy,
fn_cu_ctx_push: *fn_cu_ctx_push,
fn_cu_ctx_pop: *fn_cu_ctx_pop,
};
tracing::info!(
width = config.width,
height = config.height,
quality = config.quality,
gpu = gpu_index,
ring_size = RING_SIZE,
"NVENC AV1 encoder ready"
);
Ok(Self {
config,
session: Some(session),
pending_frames: Vec::new(),
encoded_packets: Vec::new(),
flushed: false,
packet_cursor: 0,
frame_counter: 0,
ring_idx: 0,
last_drained_frame_idx: [-1; RING_SIZE],
_encode_lib: encode_lib,
_cuda_lib: cuda_lib,
})
}
}
/// Uploads one 8-bit planar 4:2:0 frame into the NVENC input buffer of ring `slot`.
///
/// `frame.data` is read as three tightly packed planes: Y at `width` bytes per row,
/// followed by U and then V at `ceil(width / 2)` bytes per row. The planes are copied
/// row by row into the locked buffer: luma rows are spaced by the pitch the lock
/// reports, chroma rows by half that pitch, with U placed directly after the luma rows
/// and V directly after the U rows.
///
/// Returns the pitch reported by `NvEncLockInputBuffer`.
///
/// # Errors
/// Fails when the frame payload is shorter than the three planes require, when the
/// lock or unlock call returns anything other than `NV_ENC_SUCCESS`, or when the lock
/// succeeds but hands back a null pointer or a pitch too small for one row.
///
/// # Safety
/// `session` must describe a live encoder: its `encoder` handle, function pointers and
/// `input_buffers[slot]` must all be valid for the FFI calls made here.
unsafe fn upload_frame(
    session: &EncodeSession,
    frame: &VideoFrame,
    slot: usize,
) -> Result<u32> {
    /// Copies `rows` rows of `row_bytes` bytes from a tightly packed source plane into
    /// a destination plane whose rows start `dst_pitch` bytes apart.
    unsafe fn copy_plane(
        src: *const u8,
        dst: *mut u8,
        row_bytes: usize,
        rows: usize,
        dst_pitch: usize,
    ) {
        for row in 0..rows {
            // SAFETY: the caller guarantees both planes cover `rows` full rows and
            // that source and destination do not overlap.
            unsafe {
                ptr::copy_nonoverlapping(
                    src.add(row * row_bytes),
                    dst.add(row * dst_pitch),
                    row_bytes,
                );
            }
        }
    }

    let w = session.width as usize;
    let h = session.height as usize;
    let cw = w.div_ceil(2);
    let ch = h.div_ceil(2);
    let y_size = w * h;
    let uv_size = cw * ch;
    // Validate the payload before touching the driver so a short frame never costs a
    // lock/unlock round trip.
    if frame.data.len() < y_size + 2 * uv_size {
        bail!("frame data too small for {}x{} YUV420p", w, h);
    }

    unsafe {
        let input_buffer = session.input_buffers[slot];
        let mut lock: NvEncLockInputBuffer = std::mem::zeroed();
        lock.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
        lock.input_buffer = input_buffer;
        let rc = (session.fn_lock_input_buffer)(session.encoder, &mut lock);
        if rc != NV_ENC_SUCCESS {
            bail!("NvEncLockInputBuffer failed: {rc}");
        }

        let pitch = lock.pitch as usize;
        let chroma_pitch = pitch / 2;
        let dst = lock.buffer_data_ptr as *mut u8;
        // A null mapping or a pitch narrower than one row would turn the copies below
        // into out-of-bounds or overlapping writes; release the lock and report it.
        if dst.is_null() || pitch < w || chroma_pitch < cw {
            let _ = (session.fn_unlock_input_buffer)(session.encoder, input_buffer);
            bail!(
                "NvEncLockInputBuffer returned an unusable mapping for {}x{} YUV420p \
                 (null pointer: {}, pitch: {})",
                w,
                h,
                dst.is_null(),
                pitch
            );
        }

        let y_src = frame.data.as_ptr();
        let u_src = y_src.add(y_size);
        let v_src = u_src.add(uv_size);
        let u_dst = dst.add(pitch * h);
        let v_dst = u_dst.add(chroma_pitch * ch);
        copy_plane(y_src, dst, w, h, pitch);
        copy_plane(u_src, u_dst, cw, ch, chroma_pitch);
        copy_plane(v_src, v_dst, cw, ch, chroma_pitch);

        let rc = (session.fn_unlock_input_buffer)(session.encoder, input_buffer);
        if rc != NV_ENC_SUCCESS {
            bail!("NvEncUnlockInputBuffer failed: {rc}");
        }
        Ok(lock.pitch)
    }
}
/// Uploads one 10-bit planar 4:2:0 frame into the NVENC input buffer of ring `slot`.
///
/// `frame.data` is read as three tightly packed planes of 16-bit little-endian samples
/// (Y at `width` samples per row, then U and V at `ceil(width / 2)` samples per row).
/// Each sample is masked to its low 10 bits and shifted left by 6 before being stored,
/// so the value ends up in the most significant bits of the 16-bit destination word.
/// Luma rows are spaced by the pitch the lock reports, chroma rows by half that pitch,
/// with U placed directly after the luma rows and V directly after the U rows.
///
/// Returns the pitch reported by `NvEncLockInputBuffer`.
///
/// # Errors
/// Fails when the frame payload is shorter than the three planes require, when the
/// lock or unlock call returns anything other than `NV_ENC_SUCCESS`, or when the lock
/// succeeds but hands back a null pointer or a pitch too small for one row.
///
/// # Safety
/// `session` must describe a live encoder: its `encoder` handle, function pointers and
/// `input_buffers[slot]` must all be valid for the FFI calls made here.
unsafe fn upload_frame_10bit(
    session: &EncodeSession,
    frame: &VideoFrame,
    slot: usize,
) -> Result<u32> {
    /// Converts `rows` rows of `cols` little-endian 16-bit samples from a tightly
    /// packed source plane into MSB-aligned samples in a destination plane whose rows
    /// start `dst_pitch` bytes apart.
    unsafe fn pack_plane(
        src: *const u8,
        dst: *mut u8,
        dst_pitch: usize,
        cols: usize,
        rows: usize,
    ) {
        for row in 0..rows {
            for col in 0..cols {
                // SAFETY: the caller guarantees both planes cover `rows` x `cols`
                // samples. The byte buffer carries no 2-byte alignment guarantee, so
                // every access goes through the unaligned read/write primitives.
                unsafe {
                    let raw = src.add((row * cols + col) * 2).cast::<u16>().read_unaligned();
                    let sample = u16::from_le(raw) & 0x03FF;
                    dst.add(row * dst_pitch + col * 2)
                        .cast::<u16>()
                        .write_unaligned(sample << 6);
                }
            }
        }
    }

    let w = session.width as usize;
    let h = session.height as usize;
    let cw = w.div_ceil(2);
    let ch = h.div_ceil(2);
    let y_bytes = w * h * 2;
    let uv_bytes = cw * ch * 2;
    // Validate the payload before touching the driver so a short frame never costs a
    // lock/unlock round trip.
    if frame.data.len() < y_bytes + 2 * uv_bytes {
        bail!(
            "frame data too small for {}x{} Yuv420p10le: need {} bytes, got {}",
            w,
            h,
            y_bytes + 2 * uv_bytes,
            frame.data.len()
        );
    }

    unsafe {
        let input_buffer = session.input_buffers[slot];
        let mut lock: NvEncLockInputBuffer = std::mem::zeroed();
        lock.version = NV_ENC_LOCK_INPUT_BUFFER_VER;
        lock.input_buffer = input_buffer;
        let rc = (session.fn_lock_input_buffer)(session.encoder, &mut lock);
        if rc != NV_ENC_SUCCESS {
            bail!("NvEncLockInputBuffer failed: {rc}");
        }

        let pitch_bytes = lock.pitch as usize;
        let chroma_pitch_bytes = pitch_bytes / 2;
        let dst = lock.buffer_data_ptr as *mut u8;
        // A null mapping or a pitch narrower than one row would turn the writes below
        // into out-of-bounds or overlapping stores; release the lock and report it.
        if dst.is_null() || pitch_bytes < w * 2 || chroma_pitch_bytes < cw * 2 {
            let _ = (session.fn_unlock_input_buffer)(session.encoder, input_buffer);
            bail!(
                "NvEncLockInputBuffer returned an unusable mapping for {}x{} Yuv420p10le \
                 (null pointer: {}, pitch: {})",
                w,
                h,
                dst.is_null(),
                pitch_bytes
            );
        }

        let y_src = frame.data.as_ptr();
        let u_src = y_src.add(y_bytes);
        let v_src = u_src.add(uv_bytes);
        let u_dst = dst.add(pitch_bytes * h);
        let v_dst = u_dst.add(chroma_pitch_bytes * ch);
        pack_plane(y_src, dst, pitch_bytes, w, h);
        pack_plane(u_src, u_dst, chroma_pitch_bytes, cw, ch);
        pack_plane(v_src, v_dst, chroma_pitch_bytes, cw, ch);

        let rc = (session.fn_unlock_input_buffer)(session.encoder, input_buffer);
        if rc != NV_ENC_SUCCESS {
            bail!("NvEncUnlockInputBuffer failed: {rc}");
        }
        Ok(lock.pitch)
    }
}
/// Locks the bitstream buffer of ring `slot`, copies its contents out and unlocks it.
///
/// Returns `Ok(Some((frame_idx, packet)))` when the lock yields a non-empty payload,
/// where `frame_idx` is the value the lock reports. Returns `Ok(None)` when the lock
/// reports zero bytes, or when it returns `NV_ENC_ERR_NEED_MORE_INPUT`,
/// `NV_ENC_ERR_LOCK_BUSY`, `NV_ENC_ERR_ENCODER_BUSY` or `NV_ENC_ERR_INVALID_PARAM`
/// (all four are treated as "nothing to drain from this slot").
///
/// The packet is flagged as a keyframe when the reported picture type is IDR or I,
/// and its `pts` is the output timestamp reported by the lock.
///
/// # Errors
/// Fails on any other lock status, on a reported size above 16 MiB, on a non-zero
/// size paired with a null data pointer, or when the unlock call fails. The buffer is
/// unlocked before returning the size and null-pointer errors.
///
/// # Safety
/// `session` must describe a live encoder: its `encoder` handle, function pointers and
/// `bitstream_buffers[slot]` must all be valid for the FFI calls made here.
unsafe fn drain_bitstream(
    session: &EncodeSession,
    slot: usize,
) -> Result<Option<(u32, EncodedPacket)>> {
    // Upper bound for a plausible payload; anything larger is treated as a sign that
    // the lock struct was misread.
    const MAX_BITSTREAM_BYTES: usize = 16 * 1024 * 1024;

    unsafe {
        let bitstream_buffer = session.bitstream_buffers[slot];
        let mut lock: NvEncLockBitstream = std::mem::zeroed();
        lock.version = NV_ENC_LOCK_BITSTREAM_VER;
        lock.output_bitstream = bitstream_buffer;
        let rc = (session.fn_lock_bitstream)(session.encoder, &mut lock);
        match rc {
            NV_ENC_SUCCESS => {}
            NV_ENC_ERR_NEED_MORE_INPUT
            | NV_ENC_ERR_LOCK_BUSY
            | NV_ENC_ERR_ENCODER_BUSY
            | NV_ENC_ERR_INVALID_PARAM => {
                return Ok(None);
            }
            NV_ENC_ERR_INVALID_PTR | NV_ENC_ERR_ENCODER_NOT_INITIALIZED => {
                bail!("NvEncLockBitstream failed (fatal): {rc}")
            }
            other => bail!("NvEncLockBitstream failed: {other}"),
        }

        let size = lock.bitstream_size_in_bytes as usize;
        if size > MAX_BITSTREAM_BYTES {
            let _ = (session.fn_unlock_bitstream)(session.encoder, bitstream_buffer);
            bail!(
                "NvEncLockBitstream returned implausible size {} bytes (max {}) — \
                 likely NV_ENC_LOCK_BITSTREAM struct layout drift",
                size,
                MAX_BITSTREAM_BYTES
            );
        }
        // A non-zero size with no data pointer cannot be turned into a packet;
        // surfacing it beats emitting a packet with an empty payload.
        if size > 0 && lock.bitstream_buffer_ptr.is_null() {
            let _ = (session.fn_unlock_bitstream)(session.encoder, bitstream_buffer);
            bail!(
                "NvEncLockBitstream reported {} bytes but returned a null bitstream pointer",
                size
            );
        }

        // Copy everything needed out of the mapping before unlocking it.
        let data = if size > 0 {
            let slice = std::slice::from_raw_parts(lock.bitstream_buffer_ptr as *const u8, size);
            Bytes::copy_from_slice(slice)
        } else {
            Bytes::new()
        };
        let is_keyframe = matches!(lock.picture_type, NV_ENC_PIC_TYPE_IDR | NV_ENC_PIC_TYPE_I);
        let pts = lock.output_time_stamp;

        let unlock_rc = (session.fn_unlock_bitstream)(session.encoder, bitstream_buffer);
        if unlock_rc != NV_ENC_SUCCESS {
            bail!("NvEncUnlockBitstream failed: {unlock_rc}");
        }
        if size == 0 {
            return Ok(None);
        }
        Ok(Some((
            lock.frame_idx,
            EncodedPacket {
                data,
                pts,
                is_keyframe,
            },
        )))
    }
}
fn encode_pending(&mut self) -> Result<()> {
if self.pending_frames.is_empty() {
return Ok(());
}
let Some(session) = &self.session else {
bail!("encode_pending called without live session");
};
let _scope = unsafe { session.ctx_scope()? };
let pending = std::mem::take(&mut self.pending_frames);
for frame in pending {
if frame.format != self.config.pixel_format {
bail!(
"NVENC session was initialized with {:?} but frame is {:?} \
— pipeline must reinit the encoder if pixel format changes",
self.config.pixel_format,
frame.format
);
}
let slot = self.ring_idx;
unsafe {
let pitch = match frame.format {
PixelFormat::Yuv420p10le => Self::upload_frame_10bit(session, &frame, slot)?,
_ => Self::upload_frame(session, &frame, slot)?,
};
let mut pic: NvEncPicParams = std::mem::zeroed();
pic.version = NV_ENC_PIC_PARAMS_VER;
pic.input_width = session.width;
pic.input_height = session.height;
pic.input_pitch = pitch;
pic.input_buffer = session.input_buffers[slot];
pic.output_bitstream = session.bitstream_buffers[slot];
pic.buffer_fmt = session.buffer_format;
pic.frame_idx = self.frame_counter;
pic.input_timestamp = frame.pts;
pic.picture_struct = 1;
let is_idr = self
.frame_counter
.is_multiple_of(self.config.keyframe_interval);
pic.picture_type = if is_idr {
NV_ENC_PIC_TYPE_IDR
} else {
NV_ENC_PIC_TYPE_P
};
if is_idr {
pic.encode_pic_flags |= NV_ENC_PIC_FLAG_FORCEIDR;
}
let rc = (session.fn_encode_picture)(session.encoder, &mut pic);
self.frame_counter += 1;
match rc {
NV_ENC_SUCCESS => {
if let Some((frame_idx, pkt)) = Self::drain_bitstream(session, slot)? {
self.last_drained_frame_idx[slot] = frame_idx as i64;
self.encoded_packets.push(pkt);
}
}
NV_ENC_ERR_NEED_MORE_INPUT => {
}
other => bail!("NvEncEncodePicture failed: {other}"),
}
}
self.ring_idx = (self.ring_idx + 1) % RING_SIZE;
}
Ok(())
}
/// Sends an end-of-stream picture and collects whatever output the ring still holds.
///
/// The EOS picture carries no input buffer and names the current ring slot's
/// bitstream buffer as its output; its return code is deliberately ignored. Every
/// slot is then drained once, starting at the current ring index, and a packet is
/// kept only when its frame index is newer than the last one recorded for that slot.
///
/// Returns `Ok(())` without doing anything when there is no live session.
///
/// # Errors
/// Fails when the context scope cannot be entered or when draining a slot fails.
fn flush_eos(&mut self) -> Result<()> {
    let session = match self.session.as_ref() {
        Some(live) => live,
        None => return Ok(()),
    };
    unsafe {
        let _scope = session.ctx_scope()?;
        let first_slot = self.ring_idx;

        let mut eos: NvEncPicParams = std::mem::zeroed();
        eos.version = NV_ENC_PIC_PARAMS_VER;
        eos.encode_pic_flags = NV_ENC_PIC_FLAG_EOS;
        eos.input_buffer = ptr::null_mut();
        eos.output_bitstream = session.bitstream_buffers[first_slot];
        eos.buffer_fmt = session.buffer_format;
        let _ = (session.fn_encode_picture)(session.encoder, &mut eos);

        for slot in (0..RING_SIZE).map(|step| (first_slot + step) % RING_SIZE) {
            let Some((frame_idx, pkt)) = Self::drain_bitstream(session, slot)? else {
                continue;
            };
            let drained = frame_idx as i64;
            // Skip output that was already collected for this slot.
            if drained > self.last_drained_frame_idx[slot] {
                self.last_drained_frame_idx[slot] = drained;
                self.encoded_packets.push(pkt);
            }
        }
    }
    Ok(())
}
}
impl Encoder for NvencEncoder {
    /// Queues a copy of `frame` and encodes the queue straight away.
    ///
    /// # Errors
    /// Fails when the frame's pixel format differs from the configured one, or when
    /// encoding the queued frames fails.
    fn send_frame(&mut self, frame: &VideoFrame) -> Result<()> {
        if frame.format != self.config.pixel_format {
            bail!(
                "NVENC session was initialized with {:?} but frame is {:?}",
                self.config.pixel_format,
                frame.format
            );
        }
        self.pending_frames.push(frame.clone());
        self.encode_pending()
    }

    /// Encodes anything still queued, then sends end-of-stream exactly once.
    ///
    /// Later calls still encode queued frames but do not send end-of-stream again.
    fn flush(&mut self) -> Result<()> {
        self.encode_pending()?;
        if !self.flushed {
            self.flush_eos()?;
            self.flushed = true;
        }
        Ok(())
    }

    /// Hands out the next encoded packet, or `None` when none is waiting.
    ///
    /// Once every queued packet has been delivered the queue is emptied and the
    /// cursor reset, so delivered packets are not retained for the lifetime of the
    /// encoder.
    fn receive_packet(&mut self) -> Result<Option<EncodedPacket>> {
        let next = self.encoded_packets.get(self.packet_cursor).cloned();
        if next.is_some() {
            self.packet_cursor += 1;
        }
        // Release delivered packets as soon as the consumer has caught up; `clear`
        // keeps the allocation, so the queue does not reallocate on the next frame.
        if self.packet_cursor >= self.encoded_packets.len() {
            self.encoded_packets.clear();
            self.packet_cursor = 0;
        }
        Ok(next)
    }
}
const _: () = assert!(std::mem::size_of::<NvEncOpenEncodeSessionExParams>() == 1552);
const _: () = assert!(std::mem::size_of::<NvEncInitializeParams>() == 1800);
const _: () = assert!(std::mem::size_of::<NvEncRcParams>() == 128);
const _: () = assert!(std::mem::size_of::<NvEncConfigAv1>() == 1552);
const _: () = assert!(std::mem::size_of::<NvEncConfig>() == 3584);
const _: () = assert!(std::mem::size_of::<NvEncPresetConfig>() == 5128);
const _: () = assert!(std::mem::size_of::<NvEncCreateInputBuffer>() == 776);
const _: () = assert!(std::mem::size_of::<NvEncCreateBitstreamBuffer>() == 776);
const _: () = assert!(std::mem::size_of::<NvEncLockBitstream>() == 1544);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, version) == 0);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, output_bitstream) == 8);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, slice_offsets) == 16);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, frame_idx) == 24);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, bitstream_size_in_bytes) == 36);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, output_time_stamp) == 40);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, bitstream_buffer_ptr) == 56);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, picture_type) == 64);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, ltr_frame_bitmap) == 84);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, temporal_id) == 88);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, intra_mb_count) == 92);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, alpha_layer_size_in_bytes) == 108);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, output_stats_ptr_size) == 112);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, reserved) == 116);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, output_stats_ptr) == 120);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, frame_idx_display) == 128);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, reserved1) == 132);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, reserved2) == 1008);
const _: () = assert!(std::mem::offset_of!(NvEncLockBitstream, reserved_internal) == 1512);
const _: () = assert!(std::mem::size_of::<NvEncPicParams>() == 3360);
const _: () = assert!(std::mem::size_of::<NvEncFunctionList>() >= 336);
const _: () = assert!(NV_ENC_BUFFER_FORMAT_YUV420_10BIT == 0x00010000);
const _: () = assert!(NV_ENC_BUFFER_FORMAT_IYUV == 0x00000100);
const _: () = assert!(pixel_bit_depth_minus8_for(PixelFormat::Yuv420p10le) == 2);
const _: () = assert!(pixel_bit_depth_minus8_for(PixelFormat::Yuv420p) == 0);
#[cfg(test)]
mod tests {
    use super::*;

    /// Maps a pixel format to the 0 (8-bit) / 1 (10-bit) value the AV1 config tests
    /// store in the bit-depth fields.
    fn bit_depth_enum_for(format: PixelFormat) -> u32 {
        if pixel_bit_depth_minus8_for(format) == 0 {
            0
        } else {
            1
        }
    }

    /// Reads the `u32` stored `offset` bytes into `cfg`'s in-memory representation.
    ///
    /// The value is read in native byte order because the struct is inspected exactly
    /// as it sits in memory; decoding it as little-endian would fail on a big-endian
    /// test host.
    fn read_u32_at(cfg: &NvEncConfigAv1, offset: usize) -> u32 {
        assert!(offset + 4 <= std::mem::size_of::<NvEncConfigAv1>());
        // SAFETY: the range `offset..offset + 4` was just checked to lie inside `cfg`.
        unsafe {
            (cfg as *const NvEncConfigAv1)
                .cast::<u8>()
                .add(offset)
                .cast::<u32>()
                .read_unaligned()
        }
    }

    /// Common broadcast and cinema rates map to their exact rationals.
    #[test]
    fn test_fps_rational_mapping() {
        for (fps, expected) in [
            (23.976, (24_000, 1001)),
            (24.0, (24, 1)),
            (25.0, (25, 1)),
            (29.97, (30_000, 1001)),
            (30.0, (30, 1)),
            (48.0, (48, 1)),
            (50.0, (50, 1)),
            (59.94, (60_000, 1001)),
            (60.0, (60, 1)),
        ] {
            assert_eq!(fps_to_rational(fps), expected, "fps = {fps}");
        }
    }

    /// Full-precision NTSC-family rates are still detected as `n / 1001`.
    #[test]
    fn test_fps_rational_1001_family_detection() {
        for (fps, expected_num) in [
            (23.9760239760, 24_000),
            (29.9700299700, 30_000),
            (59.9400599400, 60_000),
        ] {
            let (n, d) = fps_to_rational(fps);
            assert_eq!(d, 1001, "fps = {fps}");
            assert_eq!(n, expected_num, "fps = {fps}");
        }
    }

    /// Rates outside the known table fall back to a generic rational.
    #[test]
    fn test_fps_rational_generic_fallback() {
        assert_eq!(fps_to_rational(100.0), (100, 1));
        assert_eq!(fps_to_rational(120.0), (120, 1));
        assert_eq!(fps_to_rational(23.5), (23_500, 1000));
    }

    /// Pins the clamp arithmetic (`min(51)`) on sample values.
    #[test]
    fn test_nvenc_cq_clamps_to_51() {
        for (requested, expected) in [(75u8, 51u8), (40, 40), (51, 51)] {
            assert_eq!(requested.min(51), expected);
        }
    }

    /// Advancing the ring index wraps back to zero after `RING_SIZE` steps.
    #[test]
    fn test_ring_buffer_index_cycles() {
        let seen: Vec<usize> = (0..RING_SIZE * 3)
            .scan(0usize, |idx, _| {
                let current = *idx;
                *idx = (*idx + 1) % RING_SIZE;
                Some(current)
            })
            .collect();
        assert_eq!(
            seen,
            vec![0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3],
            "ring index must cycle through 0..RING_SIZE"
        );
    }

    #[test]
    fn test_ring_size_is_four() {
        assert_eq!(RING_SIZE, 4);
    }

    /// 8-bit and 10-bit 4:2:0 select distinct buffer-format constants.
    #[test]
    fn test_nvenc_buffer_format_dispatch_10bit() {
        let fmt_8 = nvenc_buffer_format_for(PixelFormat::Yuv420p).unwrap();
        let fmt_10 = nvenc_buffer_format_for(PixelFormat::Yuv420p10le).unwrap();
        assert_eq!(fmt_8, NV_ENC_BUFFER_FORMAT_IYUV);
        assert_eq!(fmt_10, NV_ENC_BUFFER_FORMAT_YUV420_10BIT);
        assert_ne!(
            fmt_8, fmt_10,
            "10-bit must select a different SDK constant from 8-bit"
        );
    }

    /// Every format other than planar 4:2:0 is refused by the dispatch.
    #[test]
    fn test_nvenc_buffer_format_dispatch_rejects_4_2_2_and_4_4_4() {
        for unsupported in [
            PixelFormat::Yuv422p,
            PixelFormat::Yuv422p10le,
            PixelFormat::Yuv444p,
            PixelFormat::Yuv444p10le,
            PixelFormat::Yuva444p10le,
            PixelFormat::Nv12,
            PixelFormat::Rgb24,
        ] {
            assert!(
                nvenc_buffer_format_for(unsupported).is_err(),
                "{unsupported:?} must be rejected by NVENC dispatch"
            );
        }
    }

    #[test]
    fn test_nvenc_pixel_bit_depth_dispatch() {
        assert_eq!(pixel_bit_depth_minus8_for(PixelFormat::Yuv420p), 0);
        assert_eq!(pixel_bit_depth_minus8_for(PixelFormat::Yuv420p10le), 2);
    }

    /// Transfer functions map to their H.273 code points.
    #[test]
    fn test_nvenc_transfer_to_h273_codes() {
        assert_eq!(transfer_to_h273(TransferFn::Bt709), 1);
        assert_eq!(transfer_to_h273(TransferFn::Bt470Bg), 4);
        assert_eq!(transfer_to_h273(TransferFn::Linear), 8);
        assert_eq!(transfer_to_h273(TransferFn::St2084), 16, "HDR10 PQ");
        assert_eq!(transfer_to_h273(TransferFn::AribStdB67), 18, "HLG");
        assert_eq!(
            transfer_to_h273(TransferFn::Unspecified),
            1,
            "Unspecified collapses to canonical Bt709 — AV1 has no \
             unspecified sentinel for transfer"
        );
    }

    /// A 10-bit HDR configuration lands in the expected fields, both through the
    /// typed struct and when the struct's raw memory is read back.
    #[test]
    fn test_nvenc_av1_config_10bit_hdr_layout() {
        let mut cfg: NvEncConfigAv1 = unsafe { std::mem::zeroed() };
        let bit_depth_enum = bit_depth_enum_for(PixelFormat::Yuv420p10le);
        cfg.output_bit_depth = bit_depth_enum;
        cfg.input_bit_depth = bit_depth_enum;
        cfg.flags |= AV1_CHROMA_FORMAT_IDC_420;
        let cm = ColorMetadata {
            transfer: TransferFn::St2084,
            matrix_coefficients: 9,
            colour_primaries: 9,
            full_range: true,
            mastering_display: None,
            content_light_level: None,
        };
        cfg.color_primaries = cm.colour_primaries as u32;
        cfg.transfer_characteristics = transfer_to_h273(cm.transfer);
        cfg.matrix_coefficients = cm.matrix_coefficients as u32;
        cfg.color_range = cm.full_range as u32;

        assert_eq!(cfg.output_bit_depth, 1, "10-bit enum value");
        assert_eq!(cfg.input_bit_depth, 1, "10-bit input enum value");
        assert_eq!(cfg.color_primaries, 9, "BT.2020");
        assert_eq!(cfg.transfer_characteristics, 16, "ST 2084 / PQ");
        assert_eq!(cfg.matrix_coefficients, 9, "BT.2020 NCL");
        assert_eq!(cfg.color_range, 1, "full range");
        assert_eq!(
            cfg.flags & AV1_CHROMA_FORMAT_IDC_420,
            AV1_CHROMA_FORMAT_IDC_420,
            "chromaFormatIDC=1 (4:2:0) packed into flags bits 7-8"
        );

        assert_eq!(
            read_u32_at(&cfg, std::mem::offset_of!(NvEncConfigAv1, output_bit_depth)),
            1,
            "output_bit_depth must read back as 1 (10-bit) from raw bytes"
        );
        assert_eq!(
            read_u32_at(&cfg, std::mem::offset_of!(NvEncConfigAv1, color_primaries)),
            9,
            "color_primaries=9 (BT.2020) at the expected offset"
        );
        assert_eq!(
            read_u32_at(
                &cfg,
                std::mem::offset_of!(NvEncConfigAv1, transfer_characteristics)
            ),
            16,
            "transfer_characteristics=16 (PQ) at the expected offset"
        );
        assert_eq!(
            read_u32_at(&cfg, std::mem::offset_of!(NvEncConfigAv1, color_range)),
            1,
            "color_range=1 (full) at the expected offset"
        );
    }

    /// An 8-bit configuration with default colour metadata yields BT.709, studio range.
    #[test]
    fn test_nvenc_av1_config_8bit_sdr_layout() {
        let mut cfg: NvEncConfigAv1 = unsafe { std::mem::zeroed() };
        let bit_depth_enum = bit_depth_enum_for(PixelFormat::Yuv420p);
        cfg.output_bit_depth = bit_depth_enum;
        cfg.input_bit_depth = bit_depth_enum;
        cfg.flags |= AV1_CHROMA_FORMAT_IDC_420;
        let cm = ColorMetadata::default();
        cfg.color_primaries = cm.colour_primaries as u32;
        cfg.transfer_characteristics = transfer_to_h273(cm.transfer);
        cfg.matrix_coefficients = cm.matrix_coefficients as u32;
        cfg.color_range = cm.full_range as u32;

        assert_eq!(cfg.output_bit_depth, 0, "8-bit enum value");
        assert_eq!(cfg.color_primaries, 1, "BT.709 default");
        assert_eq!(cfg.transfer_characteristics, 1, "BT.709 default");
        assert_eq!(cfg.matrix_coefficients, 1, "BT.709 default");
        assert_eq!(cfg.color_range, 0, "studio range default");
    }

    /// Building a GUID from its byte form reproduces the P5 preset GUID.
    #[test]
    fn test_guid_roundtrip() {
        let bytes: [u8; 16] = [
            0xb4, 0xe6, 0xc6, 0x21, 0x7a, 0x29, 0xba, 0x4c, 0x99, 0x8f, 0xb6, 0xcb, 0xde, 0x72,
            0xad, 0xe3,
        ];
        let g = guid_from_bytes(bytes);
        assert_eq!(g.data1, NV_ENC_PRESET_P5_GUID.data1);
        assert_eq!(g.data2, NV_ENC_PRESET_P5_GUID.data2);
        assert_eq!(g.data3, NV_ENC_PRESET_P5_GUID.data3);
        assert_eq!(g.data4, NV_ENC_PRESET_P5_GUID.data4);
    }
}