pub mod active_reading;
pub mod asr_backend;
pub mod diarize;
pub mod extract;
pub mod fluidaudio_backend;
pub mod fusion;
pub mod report;
pub mod transcribe;
pub mod vision;
#[cfg(feature = "analyze-sherpa")]
pub mod sherpa_onnx_backend;
#[cfg(feature = "analyze-whisper")]
pub mod whisper_rs_backend;
use std::path::PathBuf;
use std::sync::Arc;
use serde::{Deserialize, Serialize};
use thiserror::Error;
pub use active_reading::{
ActiveReader, ActiveReadingConfig, ActiveReadingError, ActiveReadingMetadata,
ActiveReadingOutput, LlmSampler, LookupResult, Reference, ReferenceKind, UrlFetcher,
};
pub use asr_backend::{
AsrBackend, SpeakerSegment as AsrSpeakerSegment, TranscribeOptions,
TranscriptSegment as AsrTranscriptSegment, TranscriptionResult, WordTiming as AsrWordTiming,
};
pub use fluidaudio_backend::FluidAudioBackend;
pub use diarize::{Diarizer, SpeakerSegment};
pub use extract::{AudioExtractor, ExtractedFrame, FrameExtractor};
pub use fusion::{FusedSegment, FusionEngine};
pub use report::{AnalysisReport, ReportFormat};
pub use transcribe::{TranscriptSegment, TranscriptionBackend, VllmTranscriber, WordTiming};
pub use vision::{VisionAnalyzer, VisionBackend, VisualAnalysis};
/// Selects the best available ASR (speech-to-text) backend for this build.
///
/// Preference order:
/// 1. FluidAudio — only compiled in on macOS/Apple Silicon; used when it
///    reports itself available.
/// 2. sherpa-onnx — requires the `analyze-sherpa` feature.
/// 3. whisper-rs — requires the `analyze-whisper` feature.
/// 4. Fallback on every platform: a FluidAudio backend pointed at a
///    `fluidaudiocli` binary expected on `PATH`, returned even though it
///    was not detected as available.
#[must_use]
pub fn default_backend() -> Arc<dyn AsrBackend> {
    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
    {
        // Prefer the normally-constructed backend; if construction fails,
        // retry pointed at a CLI binary on PATH. Only use it if runnable.
        let fluid = FluidAudioBackend::new()
            .unwrap_or_else(|_| FluidAudioBackend::with_binary(PathBuf::from("fluidaudiocli")));
        if fluid.is_available() {
            return Arc::new(fluid);
        }
    }
    #[cfg(feature = "analyze-sherpa")]
    {
        let sherpa = sherpa_onnx_backend::SherpaOnnxBackend::new();
        if sherpa.is_available() {
            return Arc::new(sherpa);
        }
    }
    #[cfg(feature = "analyze-whisper")]
    {
        let whisper = whisper_rs_backend::WhisperRsBackend::new();
        if whisper.is_available() {
            return Arc::new(whisper);
        }
    }
    // Last-resort fallback. The original had this duplicated under two
    // complementary #[cfg] blocks with identical bodies; since both arms
    // returned the same expression, a single unconditional tail suffices.
    Arc::new(FluidAudioBackend::with_binary(PathBuf::from("fluidaudiocli")))
}
/// Errors produced by the analysis pipeline.
///
/// Subsystem failures (FFmpeg, transcription, diarization, vision) carry a
/// `String` message; library errors (I/O, JSON, formatting, HTTP) convert
/// automatically via `#[from]`, so `?` works on those result types.
#[derive(Error, Debug)]
pub enum AnalysisError {
    /// FFmpeg invocation/processing failure, with its message.
    #[error("FFmpeg error: {0}")]
    Ffmpeg(String),
    /// Whisper transcription failure.
    #[error("Whisper error: {0}")]
    Whisper(String),
    /// Speaker-diarization failure.
    #[error("Diarization error: {0}")]
    Diarization(String),
    /// Vision/frame-analysis failure.
    #[error("Vision analysis error: {0}")]
    Vision(String),
    /// Wrapped standard I/O error.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// Wrapped serde_json (de)serialization error.
    #[error("JSON error: {0}")]
    Json(#[from] serde_json::Error),
    /// A required external tool or model is not installed/found.
    #[error("Missing dependency: {0}")]
    MissingDependency(String),
    /// Input media format that the pipeline cannot handle.
    #[error("Unsupported format: {0}")]
    UnsupportedFormat(String),
    /// Wrapped `std::fmt` error (e.g. from `write!` into a String).
    #[error("Format error: {0}")]
    Format(#[from] std::fmt::Error),
    /// Wrapped reqwest HTTP client error.
    #[error("HTTP error: {0}")]
    Http(#[from] reqwest::Error),
    /// Error reported by a remote transcription API.
    #[error("Transcription API error: {0}")]
    TranscriptionApi(String),
}
pub type Result<T> = std::result::Result<T, AnalysisError>;
/// Emotion classification attached to an [`AnalysisSegment`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmotionAnalysis {
    /// Dominant emotion label (e.g. "happy").
    pub primary: String,
    /// Confidence for `primary`; presumably in [0.0, 1.0] — confirm with producer.
    pub confidence: f32,
    /// Optional secondary emotion; omitted from JSON output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub secondary: Option<String>,
}
/// Visual description of what is happening on screen during a segment.
/// All optional fields are omitted from JSON output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VisualContext {
    /// Primary on-screen action (e.g. "waving").
    pub action: String,
    /// Where the subject is looking (e.g. "camera"), if detected.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gaze: Option<String>,
    /// Notable objects detected in frame, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub objects: Option<Vec<String>>,
    /// Overall scene description, if available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub scene: Option<String>,
}
/// One time-bounded unit of the analysis output, fusing transcript,
/// speaker, emotion, and visual data. Optional fields are omitted from
/// serialized JSON when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisSegment {
    /// Segment start time — presumably seconds; confirm against extractor.
    pub start: f64,
    /// Segment end time, same unit as `start`.
    pub end: f64,
    /// Diarized speaker label (e.g. "Speaker_1"), if known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speaker: Option<String>,
    /// Transcribed speech for this span, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub transcript: Option<String>,
    /// Emotion classification, if computed.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub emotion: Option<EmotionAnalysis>,
    /// Visual context, if computed.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub visual: Option<VisualContext>,
    /// Free-form marker strings; deserializes to empty when the field is absent.
    #[serde(default)]
    pub flags: Vec<String>,
}
/// Top-level result of analyzing a video: the ordered segments plus
/// optional source-video metadata (omitted from JSON when `None`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalysisOutput {
    /// All analyzed segments.
    pub segments: Vec<AnalysisSegment>,
    /// Metadata about the analyzed video, when available.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<VideoMetadata>,
}
/// Basic properties of the analyzed video file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoMetadata {
    /// Total duration — presumably seconds; confirm against probe code.
    pub duration: f64,
    /// Frame width in pixels.
    pub width: u32,
    /// Frame height in pixels.
    pub height: u32,
    /// Frames per second.
    pub fps: f32,
    /// Audio channel count, if the file has an audio stream.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_channels: Option<u32>,
    /// Audio sample rate in Hz, if known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_sample_rate: Option<u32>,
}
/// Tunable settings for the analysis pipeline. See the `Default` impl
/// for the default values.
#[derive(Debug, Clone)]
pub struct PipelineConfig {
    /// Scene-change detection threshold (default 0.3).
    pub scene_threshold: f32,
    /// Maximum number of frames to extract for vision analysis.
    pub max_frames: usize,
    /// Whisper model name (e.g. "base").
    pub whisper_model: String,
    /// Whether to run speaker diarization.
    pub enable_diarization: bool,
    /// Which vision backend to use for frame analysis.
    pub vision_backend: VisionBackend,
    /// Scratch directory for intermediate files.
    pub work_dir: PathBuf,
    /// Optional DGX host for remote processing; `None` runs locally — confirm semantics with callers.
    pub dgx_host: Option<String>,
}
impl Default for PipelineConfig {
fn default() -> Self {
Self {
scene_threshold: 0.3,
max_frames: 100,
whisper_model: "base".to_string(),
enable_diarization: true,
vision_backend: VisionBackend::Local,
work_dir: std::env::temp_dir().join("nab_analyze"),
dgx_host: None,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default config must match the values documented on `Default`.
    #[test]
    fn test_config_default() {
        let cfg = PipelineConfig::default();
        assert!((cfg.scene_threshold - 0.3).abs() < f32::EPSILON);
        assert_eq!(cfg.whisper_model, "base");
        assert!(cfg.enable_diarization);
    }

    /// A fully populated segment serializes, and key field values appear
    /// in the rendered JSON.
    #[test]
    fn test_segment_serialization() {
        let emotion = EmotionAnalysis {
            primary: "happy".to_string(),
            confidence: 0.85,
            secondary: None,
        };
        let visual = VisualContext {
            action: "waving".to_string(),
            gaze: Some("camera".to_string()),
            objects: None,
            scene: None,
        };
        let segment = AnalysisSegment {
            start: 0.0,
            end: 5.2,
            speaker: Some("Speaker_1".to_string()),
            transcript: Some("Hello, welcome to the show".to_string()),
            emotion: Some(emotion),
            visual: Some(visual),
            flags: Vec::new(),
        };

        let rendered = serde_json::to_string_pretty(&segment).unwrap();
        assert!(rendered.contains("Speaker_1"));
        assert!(rendered.contains("waving"));
    }
}