use std::path::Path;
use std::process::Command;
use anyhow::{Result, bail, Context};
use image::GrayImage;
use rustfft::{FftPlanner, num_complex::Complex};
use tracing::{debug, info, warn};
use crate::types::*;
#[derive(Debug, Clone)]
pub struct ThumbnailConfig {
pub num_candidates: usize,
pub skip_start_secs: f64,
pub skip_end_secs: f64,
pub min_sharpness: f32,
pub sharpness_weight: f32,
pub contrast_weight: f32,
pub audio_weight: f32,
pub output_width: u32,
pub output_height: u32,
}
impl Default for ThumbnailConfig {
fn default() -> Self {
Self {
num_candidates: 30,
skip_start_secs: 2.0,
skip_end_secs: 2.0,
min_sharpness: 0.3,
sharpness_weight: 0.4,
contrast_weight: 0.3,
audio_weight: 0.3,
output_width: 1280,
output_height: 720,
}
}
}
pub struct ThumbnailSelector {
config: ThumbnailConfig,
}
impl ThumbnailSelector {
pub fn new() -> Self {
Self::with_config(ThumbnailConfig::default())
}
pub fn with_config(config: ThumbnailConfig) -> Self {
Self { config }
}
pub fn find_best_timestamp(
&self,
video_path: impl AsRef<Path>,
audio: &AudioData,
) -> Result<f64> {
let video_path = video_path.as_ref();
info!("Finding best thumbnail timestamp for: {}", video_path.display());
let duration = self.get_video_duration(video_path)?;
debug!("Video duration: {:.2}s", duration);
let start_time = self.config.skip_start_secs;
let end_time = (duration - self.config.skip_end_secs).max(start_time + 1.0);
let step = (end_time - start_time) / self.config.num_candidates as f64;
let timestamps: Vec<f64> = (0..self.config.num_candidates)
.map(|i| start_time + i as f64 * step)
.collect();
let audio_energies = self.compute_audio_energies(audio, ×tamps);
let mut candidates: Vec<(f64, f32)> = Vec::new();
for (i, ×tamp) in timestamps.iter().enumerate() {
match self.extract_frame(video_path, timestamp) {
Ok(frame) => {
let quality = self.analyze_frame_quality(&frame);
let audio_score = audio_energies.get(i).copied().unwrap_or(0.5);
let total_score = quality.sharpness * self.config.sharpness_weight
+ quality.contrast * self.config.contrast_weight
+ audio_score * self.config.audio_weight;
if quality.sharpness >= self.config.min_sharpness {
candidates.push((timestamp, total_score));
debug!(
"Frame at {:.2}s: sharpness={:.3}, contrast={:.3}, audio={:.3}, total={:.3}",
timestamp, quality.sharpness, quality.contrast, audio_score, total_score
);
}
}
Err(e) => {
warn!("Failed to extract frame at {:.2}s: {}", timestamp, e);
}
}
}
candidates.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
if let Some((best_timestamp, best_score)) = candidates.first() {
info!("Best thumbnail at {:.2}s with score {:.3}", best_timestamp, best_score);
Ok(*best_timestamp)
} else {
let fallback = (start_time + end_time) / 2.0;
warn!("No suitable frames found, using fallback at {:.2}s", fallback);
Ok(fallback)
}
}
pub fn find_candidates(
&self,
video_path: impl AsRef<Path>,
audio: &AudioData,
num_results: usize,
) -> Result<Vec<ThumbnailCandidate>> {
let video_path = video_path.as_ref();
let duration = self.get_video_duration(video_path)?;
let start_time = self.config.skip_start_secs;
let end_time = (duration - self.config.skip_end_secs).max(start_time + 1.0);
let num_samples = self.config.num_candidates.max(num_results * 3);
let step = (end_time - start_time) / num_samples as f64;
let timestamps: Vec<f64> = (0..num_samples)
.map(|i| start_time + i as f64 * step)
.collect();
let audio_energies = self.compute_audio_energies(audio, ×tamps);
let mut candidates: Vec<ThumbnailCandidate> = Vec::new();
for (i, ×tamp) in timestamps.iter().enumerate() {
if let Ok(frame) = self.extract_frame(video_path, timestamp) {
let quality = self.analyze_frame_quality(&frame);
let audio_score = audio_energies.get(i).copied().unwrap_or(0.5);
let total_score = quality.sharpness * self.config.sharpness_weight
+ quality.contrast * self.config.contrast_weight
+ audio_score * self.config.audio_weight;
candidates.push(ThumbnailCandidate {
timestamp,
sharpness: quality.sharpness,
contrast: quality.contrast,
audio_energy: audio_score,
total_score,
});
}
}
candidates.sort_by(|a, b| b.total_score.partial_cmp(&a.total_score).unwrap_or(std::cmp::Ordering::Equal));
let mut diversified = Vec::new();
let min_gap = (end_time - start_time) / (num_results as f64 * 2.0);
for candidate in candidates {
let too_close = diversified.iter().any(|c: &ThumbnailCandidate| {
(c.timestamp - candidate.timestamp).abs() < min_gap
});
if !too_close {
diversified.push(candidate);
if diversified.len() >= num_results {
break;
}
}
}
Ok(diversified)
}
pub fn extract_thumbnail(
&self,
video_path: impl AsRef<Path>,
timestamp: f64,
output_path: impl AsRef<Path>,
) -> Result<()> {
let video_path = video_path.as_ref();
let output_path = output_path.as_ref();
let output = Command::new("ffmpeg")
.args([
"-ss", &format!("{:.3}", timestamp),
"-i", &video_path.to_string_lossy(),
"-vframes", "1",
"-vf", &format!("scale={}:{}", self.config.output_width, self.config.output_height),
"-y",
&output_path.to_string_lossy(),
])
.output()
.context("FFmpeg not found")?;
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr);
bail!("FFmpeg thumbnail extraction failed: {}", stderr);
}
info!("Extracted thumbnail to: {}", output_path.display());
Ok(())
}
fn get_video_duration(&self, video_path: &Path) -> Result<f64> {
let output = Command::new("ffprobe")
.args([
"-v", "quiet",
"-print_format", "json",
"-show_format",
&video_path.to_string_lossy(),
])
.output()
.context("FFprobe not found")?;
let json: serde_json::Value = serde_json::from_slice(&output.stdout)
.context("Failed to parse ffprobe output")?;
let duration = json["format"]["duration"]
.as_str()
.and_then(|d| d.parse::<f64>().ok())
.ok_or_else(|| anyhow::anyhow!("Could not determine video duration"))?;
Ok(duration)
}
fn extract_frame(&self, video_path: &Path, timestamp: f64) -> Result<GrayImage> {
let output = Command::new("ffmpeg")
.args([
"-ss", &format!("{:.3}", timestamp),
"-i", &video_path.to_string_lossy(),
"-vframes", "1",
"-vf", "scale=320:180,format=gray", "-f", "rawvideo",
"-pix_fmt", "gray",
"pipe:1",
])
.output()
.context("FFmpeg frame extraction failed")?;
if !output.status.success() || output.stdout.is_empty() {
bail!("Failed to extract frame at {:.2}s", timestamp);
}
let width = 320;
let height = 180;
if output.stdout.len() < width * height {
bail!("Incomplete frame data");
}
let img = GrayImage::from_raw(width as u32, height as u32, output.stdout[..width * height].to_vec())
.ok_or_else(|| anyhow::anyhow!("Failed to create image from raw data"))?;
Ok(img)
}
fn analyze_frame_quality(&self, frame: &GrayImage) -> ImageQuality {
let (width, height) = frame.dimensions();
let pixels: Vec<f32> = frame.pixels()
.map(|p| p.0[0] as f32 / 255.0)
.collect();
let sharpness = self.compute_2d_fft_sharpness(&pixels, width as usize, height as usize);
let mean: f32 = pixels.iter().sum::<f32>() / pixels.len() as f32;
let variance: f32 = pixels.iter()
.map(|&p| (p - mean) * (p - mean))
.sum::<f32>() / pixels.len() as f32;
let contrast = variance.sqrt();
let contrast_normalized = (contrast * 4.0).min(1.0);
ImageQuality {
sharpness,
contrast: contrast_normalized,
}
}
fn compute_2d_fft_sharpness(&self, pixels: &[f32], width: usize, height: usize) -> f32 {
let fft_width = width.next_power_of_two();
let fft_height = height.next_power_of_two();
let mut planner = FftPlanner::new();
let row_fft = planner.plan_fft_forward(fft_width);
let mut row_data: Vec<Vec<Complex<f32>>> = (0..height)
.map(|y| {
let mut row: Vec<Complex<f32>> = (0..fft_width)
.map(|x| {
if x < width {
Complex::new(pixels[y * width + x], 0.0)
} else {
Complex::new(0.0, 0.0)
}
})
.collect();
row_fft.process(&mut row);
row
})
.collect();
let col_fft = planner.plan_fft_forward(fft_height);
for x in 0..fft_width {
let mut col: Vec<Complex<f32>> = (0..fft_height)
.map(|y| {
if y < height {
row_data[y][x]
} else {
Complex::new(0.0, 0.0)
}
})
.collect();
col_fft.process(&mut col);
for y in 0..height {
row_data[y][x] = col[y];
}
}
let center_x = fft_width / 2;
let center_y = height / 2;
let radius = (fft_width.min(height) / 4) as f32;
let mut high_freq_energy = 0.0f32;
let mut total_energy = 0.0f32;
for y in 0..height {
for x in 0..fft_width {
let magnitude = (row_data[y][x].re.powi(2) + row_data[y][x].im.powi(2)).sqrt();
total_energy += magnitude;
let dx = (x as i32 - center_x as i32).abs() as f32;
let dy = (y as i32 - center_y as i32).abs() as f32;
let dist = (dx * dx + dy * dy).sqrt();
if dist > radius {
high_freq_energy += magnitude;
}
}
}
if total_energy > 0.0 {
(high_freq_energy / total_energy).min(1.0)
} else {
0.0
}
}
fn compute_audio_energies(&self, audio: &AudioData, timestamps: &[f64]) -> Vec<f32> {
let window_secs = 0.5; let window_samples = (audio.sample_rate as f64 * window_secs) as usize;
let mut energies: Vec<f32> = timestamps.iter()
.map(|&t| {
let center_sample = (t * audio.sample_rate as f64) as usize;
let start = center_sample.saturating_sub(window_samples / 2);
let end = (start + window_samples).min(audio.samples.len());
if start >= audio.samples.len() {
return 0.0;
}
let energy: f32 = audio.samples[start..end]
.iter()
.map(|&s| s * s)
.sum::<f32>() / (end - start) as f32;
energy.sqrt()
})
.collect();
let max_energy = energies.iter().cloned().fold(0.0f32, f32::max);
if max_energy > 0.0 {
for e in &mut energies {
*e /= max_energy;
}
}
energies
}
}
impl Default for ThumbnailSelector {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
struct ImageQuality {
sharpness: f32,
contrast: f32,
}
#[derive(Debug, Clone)]
pub struct ThumbnailCandidate {
pub timestamp: f64,
pub sharpness: f32,
pub contrast: f32,
pub audio_energy: f32,
pub total_score: f32,
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_default_config() {
let config = ThumbnailConfig::default();
assert_eq!(config.num_candidates, 30);
assert_eq!(config.skip_start_secs, 2.0);
assert_eq!(config.output_width, 1280);
}
#[test]
fn test_image_quality_analysis() {
let width = 320;
let height = 180;
let mut pixels = vec![0u8; width * height];
for y in 0..height {
for x in 0..width {
pixels[y * width + x] = if x % 4 < 2 { 255 } else { 0 };
}
}
let img = GrayImage::from_raw(width as u32, height as u32, pixels).unwrap();
let selector = ThumbnailSelector::new();
let quality = selector.analyze_frame_quality(&img);
assert!(quality.sharpness > 0.1);
assert!(quality.contrast > 0.3);
}
#[test]
fn test_audio_energy_computation() {
let sample_rate = 44100;
let duration = 10.0;
let num_samples = (sample_rate as f64 * duration) as usize;
let samples: Vec<f32> = (0..num_samples)
.map(|i| {
let t = i as f64 / sample_rate as f64;
let envelope = (-(t - 5.0).powi(2) / 2.0).exp() as f32;
envelope * (2.0 * std::f32::consts::PI * 440.0 * t as f32).sin()
})
.collect();
let audio = AudioData::new(samples, sample_rate);
let selector = ThumbnailSelector::new();
let timestamps: Vec<f64> = (0..10).map(|i| i as f64 + 0.5).collect();
let energies = selector.compute_audio_energies(&audio, ×tamps);
let max_idx = energies.iter()
.enumerate()
.max_by(|a, b| a.1.partial_cmp(b.1).unwrap())
.map(|(i, _)| i)
.unwrap();
assert!(max_idx >= 3 && max_idx <= 6);
}
}