#![allow(
clippy::uninlined_format_args,
clippy::needless_borrows_for_generic_args,
clippy::unnecessary_literal_unwrap
)]
use std::path::{Path, PathBuf};
use std::process::Command;
use async_trait::async_trait;
use tempfile::NamedTempFile;
use tokio::fs;
use super::traits::{ConverterMetadata, DocumentConverter};
use crate::Result;
use crate::types::{
ConversionOptions, ConversionOutput, ConversionResult, FileFormat, OutputFormat, OutputMetadata,
};
#[derive(Debug)]
pub struct VideoConverter;
impl VideoConverter {
pub fn new() -> Self {
Self
}
fn check_ffmpeg() -> bool {
Command::new("ffmpeg").arg("-version").output().is_ok()
}
fn check_whisper() -> bool {
if Command::new("whisper").arg("--help").output().is_ok() {
return true;
}
let paths = vec![
format!(
"{}/.local/bin/whisper",
std::env::var("HOME").unwrap_or_default()
),
"/usr/local/bin/whisper".to_string(),
"/usr/bin/whisper".to_string(),
];
for path in paths {
if std::path::Path::new(&path).exists() {
return true;
}
}
false
}
fn get_whisper_cmd() -> String {
let paths = vec![
format!(
"{}/.local/bin/whisper",
std::env::var("HOME").unwrap_or_default()
),
"/usr/local/bin/whisper".to_string(),
"/usr/bin/whisper".to_string(),
"whisper".to_string(),
];
for path in &paths {
if std::path::Path::new(path).exists() || path == "whisper" {
return path.clone();
}
}
"whisper".to_string()
}
async fn extract_audio(&self, video_path: &Path) -> Result<PathBuf> {
if !Self::check_ffmpeg() {
return Err(crate::TransmutationError::conversion_failed(
"FFmpeg not found. Install: sudo apt-get install ffmpeg",
));
}
let temp_audio = NamedTempFile::new().map_err(|e| {
crate::TransmutationError::conversion_failed(&format!(
"Failed to create temp file: {}",
e
))
})?;
let audio_path = temp_audio.path().with_extension("wav");
eprintln!("🎬 Extracting audio with FFmpeg...");
let output = Command::new("ffmpeg")
.arg("-i")
.arg(video_path)
.arg("-vn") .arg("-acodec")
.arg("pcm_s16le") .arg("-ar")
.arg("16000") .arg("-ac")
.arg("1") .arg("-y") .arg(&audio_path)
.output()
.map_err(|e| {
crate::TransmutationError::conversion_failed(&format!(
"FFmpeg execution failed: {}",
e
))
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(crate::TransmutationError::conversion_failed(&format!(
"FFmpeg failed: {}",
stderr
)));
}
Ok(audio_path)
}
async fn transcribe_audio(&self, audio_path: &Path, language: Option<&str>) -> Result<String> {
if !Self::check_whisper() {
return Err(crate::TransmutationError::conversion_failed(
"Whisper not found. Install: pip install openai-whisper",
));
}
eprintln!("🎤 Running Whisper transcription...");
let whisper_cmd = Self::get_whisper_cmd();
let mut cmd = Command::new(&whisper_cmd);
cmd.arg(audio_path);
cmd.arg("--model").arg("base"); cmd.arg("--output_format").arg("txt");
cmd.arg("--output_dir").arg("/tmp");
if let Some(lang) = language {
cmd.arg("--language").arg(lang);
}
let output = cmd.output().map_err(|e| {
crate::TransmutationError::conversion_failed(&format!(
"Whisper execution failed: {}",
e
))
})?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
return Err(crate::TransmutationError::conversion_failed(&format!(
"Whisper failed: {}",
stderr
)));
}
let audio_name = audio_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("audio");
let transcript_path = format!("/tmp/{}.txt", audio_name);
let transcript = tokio::fs::read_to_string(&transcript_path)
.await
.map_err(|e| {
crate::TransmutationError::conversion_failed(&format!(
"Failed to read transcript: {}",
e
))
})?;
let _ = tokio::fs::remove_file(&transcript_path).await;
Ok(transcript)
}
async fn video_to_markdown(&self, video_path: &Path, language: Option<&str>) -> Result<String> {
let audio_path = self.extract_audio(video_path).await?;
let transcript = self.transcribe_audio(&audio_path, language).await?;
let _ = tokio::fs::remove_file(&audio_path).await;
let mut markdown = String::new();
markdown.push_str("# Video Transcription\n\n");
if let Some(lang) = language {
markdown.push_str(&format!("**Language**: {}\n\n", lang));
}
markdown.push_str("## Transcript\n\n");
markdown.push_str(&transcript);
markdown.push('\n');
Ok(markdown)
}
}
impl Default for VideoConverter {
fn default() -> Self {
Self::new()
}
}
#[async_trait]
impl DocumentConverter for VideoConverter {
fn supported_formats(&self) -> Vec<FileFormat> {
vec![
FileFormat::Mp4,
FileFormat::Avi,
FileFormat::Mkv,
FileFormat::Mov,
FileFormat::Webm,
]
}
fn output_formats(&self) -> Vec<OutputFormat> {
vec![
OutputFormat::Markdown {
split_pages: false,
optimize_for_llm: true,
},
OutputFormat::Json {
structured: true,
include_metadata: true,
},
]
}
async fn convert(
&self,
input: &Path,
output_format: OutputFormat,
_options: ConversionOptions,
) -> Result<ConversionResult> {
eprintln!("🔄 Video Transcription (FFmpeg + Whisper)");
eprintln!(" Video → Audio → Whisper → {:?}", output_format);
eprintln!();
let language = None;
let markdown = self.video_to_markdown(input, language).await?;
let output_data = match output_format {
OutputFormat::Markdown { .. } => {
eprintln!("✅ Transcription complete!");
markdown.into_bytes()
}
OutputFormat::Json { .. } => {
eprintln!("📝 Converting to JSON...");
let json = serde_json::json!({
"transcription": {
"text": markdown,
"language": language.unwrap_or("auto"),
"source": "video",
}
});
serde_json::to_string_pretty(&json)?.into_bytes()
}
_ => {
return Err(crate::TransmutationError::UnsupportedFormat(format!(
"Output format {:?} not supported for video",
output_format
)));
}
};
let output_size = output_data.len() as u64;
let input_size = fs::metadata(input).await?.len();
Ok(ConversionResult {
input_path: input.to_path_buf(),
input_format: crate::utils::file_detect::detect_format(input).await?,
output_format,
content: vec![ConversionOutput {
page_number: 1,
data: output_data,
metadata: OutputMetadata {
size_bytes: output_size,
chunk_count: 1,
token_count: None,
},
}],
metadata: crate::types::DocumentMetadata {
title: None,
author: None,
created: None,
modified: None,
page_count: 1,
language: language.map(|s| s.to_string()),
custom: std::collections::HashMap::new(),
},
statistics: crate::types::ConversionStatistics {
input_size_bytes: input_size,
output_size_bytes: output_size,
duration: std::time::Duration::from_secs(0),
pages_processed: 1,
tables_extracted: 0,
images_extracted: 0,
cache_hit: false,
},
})
}
fn metadata(&self) -> ConverterMetadata {
ConverterMetadata {
name: "Video Transcription Converter".to_string(),
version: env!("CARGO_PKG_VERSION").to_string(),
description: "Video to text converter using FFmpeg + Whisper ASR".to_string(),
external_deps: vec!["ffmpeg".to_string(), "whisper".to_string()],
}
}
}