use crate::config::{MediaPipelineConfig, TranscriptionConfig};
/// Broad category of a media attachment, as computed by [`MediaAttachment::kind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MediaKind {
    /// `audio/*` MIME type, or a recognized audio file extension.
    Audio,
    /// `image/*` MIME type, or a recognized image file extension.
    Image,
    /// `video/*` MIME type, or a recognized video file extension.
    Video,
    /// Neither the MIME type nor the file extension was recognized.
    Unknown,
}

/// A named blob of media bytes with an optional declared MIME type.
#[derive(Debug, Clone)]
pub struct MediaAttachment {
    /// File name; its extension is used for classification when the MIME type does not decide.
    pub file_name: String,
    /// Raw bytes of the attachment.
    pub data: Vec<u8>,
    /// Declared MIME type, if one is known.
    pub mime_type: Option<String>,
}

impl MediaAttachment {
    /// Classifies the attachment as audio, image, video or unknown.
    ///
    /// The MIME type is consulted first: an `audio/`, `image/` or `video/` prefix
    /// (ASCII case-insensitive, surrounding whitespace ignored) decides the kind.
    /// Any other MIME type, or none at all, falls back to the text after the last
    /// `.` in `file_name`, also compared case-insensitively.
    pub fn kind(&self) -> MediaKind {
        self.mime_type
            .as_deref()
            .and_then(Self::kind_from_mime)
            .unwrap_or_else(|| Self::kind_from_file_name(&self.file_name))
    }

    /// Maps a MIME type to a kind by its top-level type, or `None` if it is not a media type.
    fn kind_from_mime(mime: &str) -> Option<MediaKind> {
        const PREFIXES: [(&str, MediaKind); 3] = [
            ("audio/", MediaKind::Audio),
            ("image/", MediaKind::Image),
            ("video/", MediaKind::Video),
        ];

        let mime = mime.trim();
        PREFIXES.iter().find_map(|&(prefix, kind)| {
            // `get` yields `None` when the MIME is shorter than the prefix or the cut
            // would land inside a multi-byte character, so no slicing panic is possible
            // and no lowercase copy of the MIME string has to be allocated.
            mime.get(..prefix.len())
                .filter(|head| head.eq_ignore_ascii_case(prefix))
                .map(|_| kind)
        })
    }

    /// Maps a file name to a kind using the text after its last `.`.
    fn kind_from_file_name(file_name: &str) -> MediaKind {
        let ext = file_name
            .rsplit_once('.')
            .map(|(_, e)| e.to_ascii_lowercase())
            .unwrap_or_default();

        match ext.as_str() {
            // NOTE(review): `webm` is listed as audio here although a `video/webm` MIME
            // type classifies as video above; presumably deliberate — confirm.
            "flac" | "mp3" | "mpeg" | "mpga" | "m4a" | "ogg" | "oga" | "opus" | "wav" | "webm" => {
                MediaKind::Audio
            }
            "png" | "jpg" | "jpeg" | "gif" | "bmp" | "webp" | "heic" | "tiff" | "svg" => {
                MediaKind::Image
            }
            "mp4" | "mkv" | "avi" | "mov" | "wmv" | "flv" => MediaKind::Video,
            _ => MediaKind::Unknown,
        }
    }
}
/// Turns media attachments into bracketed text annotations that are placed
/// ahead of the accompanying message text.
pub struct MediaPipeline<'a> {
    /// Master switch plus the per-kind toggles checked in [`MediaPipeline::process`].
    config: &'a MediaPipelineConfig,
    /// Settings forwarded to the transcription call for audio attachments.
    transcription_config: &'a TranscriptionConfig,
    /// Only affects the wording of image annotations.
    vision_available: bool,
}

impl<'a> MediaPipeline<'a> {
    /// Creates a pipeline that borrows both configuration structs.
    pub fn new(
        config: &'a MediaPipelineConfig,
        transcription_config: &'a TranscriptionConfig,
        vision_available: bool,
    ) -> Self {
        Self {
            vision_available,
            transcription_config,
            config,
        }
    }

    /// Prefixes `original_text` with one annotation line per handled attachment.
    ///
    /// Returns a copy of `original_text` untouched when the pipeline is disabled,
    /// when `attachments` is empty, or when no attachment produced an annotation
    /// (unknown kind, or the matching per-kind toggle is off). Otherwise the
    /// annotations are emitted one per line in attachment order, followed by a
    /// blank line and the original text if it is non-empty; the assembled string
    /// is trimmed of surrounding whitespace.
    pub async fn process(&self, original_text: &str, attachments: &[MediaAttachment]) -> String {
        if attachments.is_empty() || !self.config.enabled {
            return original_text.to_string();
        }

        let mut notes: Vec<String> = Vec::new();
        for item in attachments {
            let note = match item.kind() {
                MediaKind::Audio if self.config.transcribe_audio => self.process_audio(item).await,
                MediaKind::Image if self.config.describe_images => self.process_image(item),
                MediaKind::Video if self.config.summarize_video => self.process_video(item),
                // Unknown kinds and switched-off features contribute nothing.
                _ => continue,
            };
            notes.push(note);
        }

        if notes.is_empty() {
            return original_text.to_string();
        }

        let mut combined = notes.join("\n");
        if !original_text.is_empty() {
            combined.push_str("\n\n");
            combined.push_str(original_text);
        }
        combined.trim().to_string()
    }

    /// Produces the annotation for an audio attachment.
    ///
    /// With transcription disabled this is a fixed placeholder. Otherwise a copy
    /// of the attachment bytes is passed to the transcriber; a failure is logged
    /// as a warning and reported through a fixed "failed" annotation rather than
    /// propagated to the caller.
    async fn process_audio(&self, attachment: &MediaAttachment) -> String {
        if !self.transcription_config.enabled {
            return "[Audio: attached]".to_string();
        }

        let outcome = super::transcription::transcribe_audio(
            attachment.data.clone(),
            &attachment.file_name,
            self.transcription_config,
        )
        .await;

        let text = match outcome {
            Ok(text) => text,
            Err(err) => {
                tracing::warn!(
                    file = %attachment.file_name,
                    error = %err,
                    "Media pipeline: audio transcription failed"
                );
                return "[Audio: transcription failed]".to_string();
            }
        };

        match text.trim() {
            "" => "[Audio transcription: (empty)]".to_string(),
            trimmed => format!("[Audio transcription: {trimmed}]"),
        }
    }

    /// Produces the annotation for an image; the wording depends on `vision_available`.
    fn process_image(&self, attachment: &MediaAttachment) -> String {
        let name = &attachment.file_name;
        if !self.vision_available {
            return format!("[Image: {} attached]", name);
        }
        format!(
            "[Image: {} attached, will be processed by vision model]",
            name
        )
    }

    /// Produces the annotation for a video, naming the attached file.
    fn process_video(&self, attachment: &MediaAttachment) -> String {
        let name = &attachment.file_name;
        format!("[Video: {} attached]", name)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Pipeline config with every per-kind feature on; only the master switch varies.
    fn default_pipeline_config(enabled: bool) -> MediaPipelineConfig {
        MediaPipelineConfig {
            enabled,
            transcribe_audio: true,
            describe_images: true,
            summarize_video: true,
        }
    }

    /// Transcription config with `enabled` off, so `process_audio` returns its
    /// placeholder without calling the transcriber.
    fn transcription_disabled() -> TranscriptionConfig {
        let mut tc = TranscriptionConfig::default();
        tc.enabled = false;
        tc
    }

    /// Builds an attachment holding `len` zero bytes.
    fn attachment(file_name: &str, mime_type: Option<&str>, len: usize) -> MediaAttachment {
        MediaAttachment {
            file_name: file_name.to_string(),
            data: vec![0u8; len],
            mime_type: mime_type.map(str::to_string),
        }
    }

    fn sample_audio() -> MediaAttachment {
        attachment("voice.ogg", Some("audio/ogg"), 100)
    }

    fn sample_image() -> MediaAttachment {
        attachment("photo.jpg", Some("image/jpeg"), 50)
    }

    fn sample_video() -> MediaAttachment {
        attachment("clip.mp4", Some("video/mp4"), 200)
    }

    /// Runs a freshly built pipeline over `text` and `attachments`.
    async fn run_pipeline(
        config: &MediaPipelineConfig,
        tc: &TranscriptionConfig,
        vision_available: bool,
        text: &str,
        attachments: &[MediaAttachment],
    ) -> String {
        let pipeline = MediaPipeline::new(config, tc, vision_available);
        pipeline.process(text, attachments).await
    }

    #[test]
    fn media_kind_from_mime() {
        let cases = [
            ("audio/ogg", MediaKind::Audio),
            ("image/png", MediaKind::Image),
            ("video/mp4", MediaKind::Video),
        ];
        for (mime, expected) in cases {
            assert_eq!(
                attachment("file", Some(mime), 0).kind(),
                expected,
                "mime: {mime}"
            );
        }
    }

    #[test]
    fn media_kind_from_extension() {
        let cases = [
            ("voice.ogg", MediaKind::Audio),
            ("photo.png", MediaKind::Image),
            ("clip.mp4", MediaKind::Video),
            ("data.bin", MediaKind::Unknown),
        ];
        for (file_name, expected) in cases {
            assert_eq!(
                attachment(file_name, None, 0).kind(),
                expected,
                "file name: {file_name}"
            );
        }
    }

    #[test]
    fn media_kind_is_case_insensitive() {
        assert_eq!(
            attachment("file", Some("AUDIO/OGG"), 0).kind(),
            MediaKind::Audio
        );
        assert_eq!(attachment("PHOTO.JPG", None, 0).kind(), MediaKind::Image);
    }

    #[test]
    fn mime_takes_precedence_and_falls_back_to_extension() {
        // A recognized MIME prefix wins over a conflicting extension.
        assert_eq!(
            attachment("clip.mp4", Some("audio/mp4"), 0).kind(),
            MediaKind::Audio
        );
        // An unrecognized MIME type defers to the extension.
        assert_eq!(
            attachment("voice.ogg", Some("application/octet-stream"), 0).kind(),
            MediaKind::Audio
        );
        // No extension and no usable MIME type.
        assert_eq!(attachment("noext", None, 0).kind(), MediaKind::Unknown);
    }

    #[tokio::test]
    async fn disabled_pipeline_returns_original_text() {
        let config = default_pipeline_config(false);
        let tc = TranscriptionConfig::default();
        let result = run_pipeline(&config, &tc, false, "hello", &[sample_audio()]).await;
        assert_eq!(result, "hello");
    }

    #[tokio::test]
    async fn empty_attachments_returns_original_text() {
        let config = default_pipeline_config(true);
        let tc = TranscriptionConfig::default();
        let result = run_pipeline(&config, &tc, false, "hello", &[]).await;
        assert_eq!(result, "hello");
    }

    #[tokio::test]
    async fn image_annotation_with_vision() {
        let config = default_pipeline_config(true);
        let tc = TranscriptionConfig::default();
        let result = run_pipeline(&config, &tc, true, "check this", &[sample_image()]).await;
        assert!(
            result.contains("[Image: photo.jpg attached, will be processed by vision model]"),
            "expected vision annotation, got: {result}"
        );
        assert!(result.contains("check this"));
    }

    #[tokio::test]
    async fn image_annotation_without_vision() {
        let config = default_pipeline_config(true);
        let tc = TranscriptionConfig::default();
        let result = run_pipeline(&config, &tc, false, "check this", &[sample_image()]).await;
        assert!(
            result.contains("[Image: photo.jpg attached]"),
            "expected basic image annotation, got: {result}"
        );
    }

    #[tokio::test]
    async fn video_annotation() {
        let config = default_pipeline_config(true);
        let tc = TranscriptionConfig::default();
        let result = run_pipeline(&config, &tc, false, "watch", &[sample_video()]).await;
        assert!(
            result.contains("[Video: clip.mp4 attached]"),
            "expected video annotation, got: {result}"
        );
    }

    #[tokio::test]
    async fn audio_without_transcription_enabled() {
        let config = default_pipeline_config(true);
        let tc = transcription_disabled();
        let result = run_pipeline(&config, &tc, false, "", &[sample_audio()]).await;
        assert_eq!(result, "[Audio: attached]");
    }

    #[tokio::test]
    async fn multiple_attachments_produce_multiple_annotations() {
        let config = default_pipeline_config(true);
        let tc = transcription_disabled();
        let attachments = vec![sample_audio(), sample_image(), sample_video()];
        let result = run_pipeline(&config, &tc, false, "context", &attachments).await;
        assert!(
            result.contains("[Audio: attached]"),
            "missing audio annotation"
        );
        assert!(
            result.contains("[Image: photo.jpg attached]"),
            "missing image annotation"
        );
        assert!(
            result.contains("[Video: clip.mp4 attached]"),
            "missing video annotation"
        );
        assert!(result.contains("context"), "missing original text");
    }

    #[tokio::test]
    async fn annotations_precede_text_in_attachment_order() {
        // Pins the exact layout: one annotation per line, a blank line, then the text.
        let config = default_pipeline_config(true);
        let tc = transcription_disabled();
        let attachments = vec![sample_image(), sample_video()];
        let result = run_pipeline(&config, &tc, false, "context", &attachments).await;
        assert_eq!(
            result,
            "[Image: photo.jpg attached]\n[Video: clip.mp4 attached]\n\ncontext"
        );
    }

    #[tokio::test]
    async fn disabled_sub_features_skip_processing() {
        let config = MediaPipelineConfig {
            enabled: true,
            transcribe_audio: false,
            describe_images: false,
            summarize_video: false,
        };
        let tc = TranscriptionConfig::default();
        let attachments = vec![sample_audio(), sample_image(), sample_video()];
        let result = run_pipeline(&config, &tc, false, "hello", &attachments).await;
        assert_eq!(result, "hello");
    }
}