kaccy_ai/
transcript.rs

1//! Video transcript extraction module
2//!
3//! This module provides capabilities for extracting transcripts from videos
4//! using various services including `YouTube` transcripts and external
5//! transcription APIs.
6
7use async_trait::async_trait;
8use serde::{Deserialize, Serialize};
9use std::time::Duration;
10
11use crate::error::{AiError, Result};
12use crate::llm::{ChatRequest, LlmClient};
13
/// Backend-agnostic interface for anything that can produce a transcript
/// for a video URL.
///
/// Implementors must be `Send + Sync`; [`TranscriptService`] stores them as
/// boxed trait objects (`Box<dyn TranscriptProvider>`).
#[async_trait]
pub trait TranscriptProvider: Send + Sync {
    /// Extract the transcript for the video at `url`.
    ///
    /// # Errors
    ///
    /// Returns an error when no transcript can be produced; the exact
    /// conditions are defined by each implementation.
    async fn extract_transcript(&self, url: &str) -> Result<TranscriptResult>;

    /// Short identifier for this provider (for example `"youtube"`).
    fn name(&self) -> &str;

    /// Report whether this provider is able to handle `url`.
    fn supports_url(&self, url: &str) -> bool;
}
26
/// Outcome of a transcript extraction: the transcript text, its timed
/// segments, and descriptive metadata about the source video.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptResult {
    /// Full transcript text as a single string
    pub text: String,
    /// Transcript segments with timestamps, in playback order
    pub segments: Vec<TranscriptSegment>,
    /// Metadata describing the video the transcript belongs to
    pub metadata: VideoMetadata,
    /// Language of the transcript, when known
    pub language: Option<String>,
    /// `true` for auto-generated captions, `false` for human-created ones
    pub is_auto_generated: bool,
    /// Time the provider spent producing this result, in milliseconds
    pub processing_time_ms: u64,
    /// Name of the provider that produced this result (for example `"youtube"`)
    pub provider: String,
}
45
/// One timed piece of a transcript.
///
/// Times are expressed in seconds; the segment ends at
/// `start_time + duration`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptSegment {
    /// Text of this segment
    pub text: String,
    /// Start time in seconds
    pub start_time: f64,
    /// Duration in seconds
    pub duration: f64,
    /// Speaker identifier (if available)
    pub speaker: Option<String>,
}
58
59impl TranscriptSegment {
60    /// Get end time
61    #[must_use]
62    pub fn end_time(&self) -> f64 {
63        self.start_time + self.duration
64    }
65
66    /// Format start time as HH:MM:SS
67    #[must_use]
68    pub fn formatted_start(&self) -> String {
69        Self::format_time(self.start_time)
70    }
71
72    /// Format time as HH:MM:SS
73    fn format_time(seconds: f64) -> String {
74        let total_secs = seconds as u64;
75        let hours = total_secs / 3600;
76        let mins = (total_secs % 3600) / 60;
77        let secs = total_secs % 60;
78
79        if hours > 0 {
80            format!("{hours:02}:{mins:02}:{secs:02}")
81        } else {
82            format!("{mins:02}:{secs:02}")
83        }
84    }
85}
86
/// Descriptive information about a video.
///
/// Only `platform` and `video_id` are mandatory; every other field is
/// optional because a provider may not be able to determine it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoMetadata {
    /// Video title
    pub title: Option<String>,
    /// Video description
    pub description: Option<String>,
    /// Video duration in seconds
    pub duration_seconds: Option<f64>,
    /// Channel/author name
    pub author: Option<String>,
    /// Video publish date, kept as the string reported by the source
    pub publish_date: Option<String>,
    /// Platform hosting the video
    pub platform: VideoPlatform,
    /// Identifier of the video on `platform`
    pub video_id: String,
    /// Thumbnail URL
    pub thumbnail_url: Option<String>,
}
107
/// Video platforms that [`VideoPlatform::from_url`] can recognise.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum VideoPlatform {
    /// `youtube.com` and `youtu.be` links
    YouTube,
    /// `vimeo.com` links
    Vimeo,
    /// `twitter.com` and `x.com` links
    Twitter,
    /// `tiktok.com` links
    TikTok,
    /// `twitch.tv` links
    Twitch,
    /// Any URL that matches none of the platforms above
    Unknown,
}
118
119impl VideoPlatform {
120    /// Detect platform from URL
121    #[must_use]
122    pub fn from_url(url: &str) -> (Self, Option<String>) {
123        let url_lower = url.to_lowercase();
124
125        // YouTube
126        if url_lower.contains("youtube.com") || url_lower.contains("youtu.be") {
127            let video_id = Self::extract_youtube_id(url);
128            return (VideoPlatform::YouTube, video_id);
129        }
130
131        // Vimeo
132        if url_lower.contains("vimeo.com") {
133            let video_id = Self::extract_vimeo_id(url);
134            return (VideoPlatform::Vimeo, video_id);
135        }
136
137        // Twitter/X
138        if url_lower.contains("twitter.com") || url_lower.contains("x.com") {
139            let video_id = Self::extract_twitter_id(url);
140            return (VideoPlatform::Twitter, video_id);
141        }
142
143        // TikTok
144        if url_lower.contains("tiktok.com") {
145            let video_id = Self::extract_tiktok_id(url);
146            return (VideoPlatform::TikTok, video_id);
147        }
148
149        // Twitch
150        if url_lower.contains("twitch.tv") {
151            let video_id = Self::extract_twitch_id(url);
152            return (VideoPlatform::Twitch, video_id);
153        }
154
155        (VideoPlatform::Unknown, None)
156    }
157
158    fn extract_youtube_id(url: &str) -> Option<String> {
159        // Handle youtu.be/VIDEO_ID
160        if url.contains("youtu.be/") {
161            let parts: Vec<&str> = url.split("youtu.be/").collect();
162            if parts.len() > 1 {
163                let id = parts[1].split(['?', '&', '#']).next()?;
164                return Some(id.to_string());
165            }
166        }
167
168        // Handle youtube.com/watch?v=VIDEO_ID
169        if url.contains("v=") {
170            let parts: Vec<&str> = url.split("v=").collect();
171            if parts.len() > 1 {
172                let id = parts[1].split(['&', '#']).next()?;
173                return Some(id.to_string());
174            }
175        }
176
177        // Handle youtube.com/embed/VIDEO_ID
178        if url.contains("/embed/") {
179            let parts: Vec<&str> = url.split("/embed/").collect();
180            if parts.len() > 1 {
181                let id = parts[1].split(['?', '&', '#', '/']).next()?;
182                return Some(id.to_string());
183            }
184        }
185
186        None
187    }
188
189    fn extract_vimeo_id(url: &str) -> Option<String> {
190        // Handle vimeo.com/VIDEO_ID
191        let re = regex::Regex::new(r"vimeo\.com/(\d+)").ok()?;
192        let caps = re.captures(url)?;
193        Some(caps.get(1)?.as_str().to_string())
194    }
195
196    fn extract_twitter_id(url: &str) -> Option<String> {
197        // Handle twitter.com/user/status/ID
198        let re = regex::Regex::new(r"(?:twitter\.com|x\.com)/\w+/status/(\d+)").ok()?;
199        let caps = re.captures(url)?;
200        Some(caps.get(1)?.as_str().to_string())
201    }
202
203    fn extract_tiktok_id(url: &str) -> Option<String> {
204        // Handle tiktok.com/@user/video/ID
205        let re = regex::Regex::new(r"tiktok\.com/@[\w.]+/video/(\d+)").ok()?;
206        let caps = re.captures(url)?;
207        Some(caps.get(1)?.as_str().to_string())
208    }
209
210    fn extract_twitch_id(url: &str) -> Option<String> {
211        // Handle twitch.tv/videos/ID
212        let re = regex::Regex::new(r"twitch\.tv/videos/(\d+)").ok()?;
213        let caps = re.captures(url)?;
214        Some(caps.get(1)?.as_str().to_string())
215    }
216}
217
/// `YouTube` transcript extractor.
///
/// Implements [`TranscriptProvider`] by downloading the video's watch page
/// and scraping caption and metadata fields out of the returned HTML.
pub struct YouTubeTranscriptProvider {
    /// HTTP client used to download watch pages
    http_client: reqwest::Client,
}
222
223impl YouTubeTranscriptProvider {
224    /// Create a new `YouTube` transcript provider
225    #[must_use]
226    pub fn new() -> Self {
227        Self {
228            http_client: reqwest::Client::builder()
229                .timeout(Duration::from_secs(30))
230                .build()
231                .expect("Failed to create HTTP client"),
232        }
233    }
234
235    /// Extract video ID from `YouTube` URL
236    fn extract_video_id(&self, url: &str) -> Option<String> {
237        let (platform, id) = VideoPlatform::from_url(url);
238        if platform == VideoPlatform::YouTube {
239            id
240        } else {
241            None
242        }
243    }
244
245    /// Fetch `YouTube` page to extract transcript data
246    async fn fetch_video_page(&self, video_id: &str) -> Result<String> {
247        let url = format!("https://www.youtube.com/watch?v={video_id}");
248
249        let response = self
250            .http_client
251            .get(&url)
252            .header(
253                "User-Agent",
254                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
255            )
256            .header("Accept-Language", "en-US,en;q=0.9")
257            .send()
258            .await
259            .map_err(|e| AiError::Validation(format!("Failed to fetch video page: {e}")))?;
260
261        if !response.status().is_success() {
262            return Err(AiError::Validation(format!(
263                "Failed to fetch video page: HTTP {}",
264                response.status()
265            )));
266        }
267
268        response
269            .text()
270            .await
271            .map_err(|e| AiError::Validation(format!("Failed to read response: {e}")))
272    }
273
274    /// Parse transcript from `YouTube` page data
275    fn parse_transcript_from_page(
276        &self,
277        page_html: &str,
278        video_id: &str,
279    ) -> Result<TranscriptResult> {
280        let start_time = std::time::Instant::now();
281
282        // Extract video title
283        let title = self.extract_title(page_html);
284
285        // Extract description
286        let description = self.extract_description(page_html);
287
288        // Try to find captions track
289        let captions_data = self.extract_captions_data(page_html)?;
290
291        // Parse caption segments
292        let segments = self.parse_caption_segments(&captions_data);
293
294        // Combine segments into full text
295        let text = segments
296            .iter()
297            .map(|s| s.text.as_str())
298            .collect::<Vec<_>>()
299            .join(" ");
300
301        let metadata = VideoMetadata {
302            title,
303            description,
304            duration_seconds: segments.last().map(TranscriptSegment::end_time),
305            author: self.extract_author(page_html),
306            publish_date: self.extract_publish_date(page_html),
307            platform: VideoPlatform::YouTube,
308            video_id: video_id.to_string(),
309            thumbnail_url: Some(format!(
310                "https://img.youtube.com/vi/{video_id}/maxresdefault.jpg"
311            )),
312        };
313
314        Ok(TranscriptResult {
315            text,
316            segments,
317            metadata,
318            language: Some("en".to_string()), // Simplified - actual implementation would detect
319            is_auto_generated: true,          // Would need to check actual caption type
320            processing_time_ms: start_time.elapsed().as_millis() as u64,
321            provider: "youtube".to_string(),
322        })
323    }
324
325    fn extract_title(&self, html: &str) -> Option<String> {
326        // Look for <title>...</title>
327        let re = regex::Regex::new(r"<title>([^<]+)</title>").ok()?;
328        let caps = re.captures(html)?;
329        let title = caps.get(1)?.as_str();
330        // Remove " - YouTube" suffix
331        Some(title.trim_end_matches(" - YouTube").to_string())
332    }
333
334    fn extract_description(&self, html: &str) -> Option<String> {
335        // Look for description meta tag
336        let re = regex::Regex::new(r#"<meta name="description" content="([^"]*)"#).ok()?;
337        let caps = re.captures(html)?;
338        Some(caps.get(1)?.as_str().to_string())
339    }
340
341    fn extract_author(&self, html: &str) -> Option<String> {
342        // Look for channel name
343        let re = regex::Regex::new(r#""ownerChannelName":"([^"]+)""#).ok()?;
344        let caps = re.captures(html)?;
345        Some(caps.get(1)?.as_str().to_string())
346    }
347
348    fn extract_publish_date(&self, html: &str) -> Option<String> {
349        // Look for publish date
350        let re = regex::Regex::new(r#""publishDate":"([^"]+)""#).ok()?;
351        let caps = re.captures(html)?;
352        Some(caps.get(1)?.as_str().to_string())
353    }
354
355    fn extract_captions_data(&self, html: &str) -> Result<String> {
356        // Look for caption track URL in page data
357        // This is a simplified extraction - full implementation would use innertube API
358        let re = regex::Regex::new(r#""captionTracks":\s*\[([^\]]+)\]"#)
359            .map_err(|e| AiError::Validation(e.to_string()))?;
360
361        if let Some(caps) = re.captures(html) {
362            return Ok(caps
363                .get(1)
364                .map(|m| m.as_str().to_string())
365                .unwrap_or_default());
366        }
367
368        // Try alternate format
369        let re2 = regex::Regex::new(r#"timedtext[^"]*\?[^"]*"#)
370            .map_err(|e| AiError::Validation(e.to_string()))?;
371
372        if let Some(caps) = re2.find(html) {
373            return Ok(caps.as_str().to_string());
374        }
375
376        Err(AiError::Validation(
377            "No captions found for this video".to_string(),
378        ))
379    }
380
381    fn parse_caption_segments(&self, _data: &str) -> Vec<TranscriptSegment> {
382        // Simplified - actual implementation would parse XML/JSON caption format
383        // For now, return empty and rely on LLM fallback
384        Vec::new()
385    }
386}
387
388impl Default for YouTubeTranscriptProvider {
389    fn default() -> Self {
390        Self::new()
391    }
392}
393
394#[async_trait]
395impl TranscriptProvider for YouTubeTranscriptProvider {
396    async fn extract_transcript(&self, url: &str) -> Result<TranscriptResult> {
397        let video_id = self
398            .extract_video_id(url)
399            .ok_or_else(|| AiError::Validation("Invalid YouTube URL".to_string()))?;
400
401        let page_html = self.fetch_video_page(&video_id).await?;
402
403        self.parse_transcript_from_page(&page_html, &video_id)
404    }
405
406    fn name(&self) -> &'static str {
407        "youtube"
408    }
409
410    fn supports_url(&self, url: &str) -> bool {
411        let url_lower = url.to_lowercase();
412        url_lower.contains("youtube.com") || url_lower.contains("youtu.be")
413    }
414}
415
/// LLM-assisted transcript analyzer.
///
/// Used when native transcripts aren't available. Sends transcript text to
/// an [`LlmClient`] to obtain structured analyses and summaries.
pub struct LlmTranscriptAnalyzer {
    /// Client through which all prompts are sent
    llm: LlmClient,
}
421
422impl LlmTranscriptAnalyzer {
423    #[must_use]
424    pub fn new(llm: LlmClient) -> Self {
425        Self { llm }
426    }
427
428    /// Analyze transcript content for key points
429    pub async fn analyze_transcript(
430        &self,
431        transcript: &TranscriptResult,
432    ) -> Result<TranscriptAnalysis> {
433        let prompt = format!(
434            r#"Analyze the following video transcript and provide:
4351. A brief summary (2-3 sentences)
4362. Main topics covered (bullet points)
4373. Key takeaways
4384. Overall sentiment (positive/neutral/negative)
439
440Video Title: {}
441Transcript:
442{}
443
444Respond in JSON format:
445{{
446    "summary": "<string>",
447    "topics": ["<topic1>", "<topic2>", ...],
448    "key_takeaways": ["<takeaway1>", "<takeaway2>", ...],
449    "sentiment": "<positive|neutral|negative>",
450    "quality_indicators": {{
451        "clarity": <0-100>,
452        "informativeness": <0-100>,
453        "professionalism": <0-100>
454    }}
455}}"#,
456            transcript.metadata.title.as_deref().unwrap_or("Unknown"),
457            &transcript.text[..transcript.text.len().min(10000)]
458        );
459
460        let request = ChatRequest::with_system(
461            "You are an expert content analyst. Analyze video transcripts accurately.",
462            prompt,
463        )
464        .max_tokens(2048)
465        .temperature(0.3);
466
467        let response = self.llm.chat(request).await?;
468
469        self.parse_analysis_response(&response.message.content)
470    }
471
472    fn parse_analysis_response(&self, response: &str) -> Result<TranscriptAnalysis> {
473        // Try to extract JSON
474        let json_str = if let Some(start) = response.find('{') {
475            if let Some(end) = response.rfind('}') {
476                &response[start..=end]
477            } else {
478                response
479            }
480        } else {
481            response
482        };
483
484        serde_json::from_str(json_str)
485            .map_err(|e| AiError::EvaluationFailed(format!("Failed to parse analysis: {e}")))
486    }
487
488    /// Generate a summary of the transcript
489    pub async fn summarize(
490        &self,
491        transcript: &TranscriptResult,
492        max_length: usize,
493    ) -> Result<String> {
494        let prompt = format!(
495            "Summarize the following video transcript in {} words or less. Focus on the main points and conclusions.\n\nTranscript:\n{}",
496            max_length / 5, // Approximate words from char limit
497            &transcript.text[..transcript.text.len().min(15000)]
498        );
499
500        let request = ChatRequest::with_system(
501            "You are a concise summarizer. Create clear, accurate summaries.",
502            prompt,
503        )
504        .max_tokens(512)
505        .temperature(0.3);
506
507        let response = self.llm.chat(request).await?;
508        Ok(response.message.content)
509    }
510}
511
/// Structured analysis of a transcript, deserialized from the JSON the LLM
/// returns.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptAnalysis {
    /// Brief summary
    pub summary: String,
    /// Main topics covered
    pub topics: Vec<String>,
    /// Key takeaways
    pub key_takeaways: Vec<String>,
    /// Overall sentiment; stored as free-form text, no fixed set of values
    /// is enforced by this type
    pub sentiment: String,
    /// Quality indicators
    pub quality_indicators: QualityIndicators,
}
526
/// Quality indicators for transcript content.
///
/// Scores are intended to lie in 0-100; the type itself only requires them
/// to be unsigned integers and does not validate the range.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIndicators {
    /// Clarity score (0-100)
    pub clarity: u32,
    /// Informativeness score (0-100)
    pub informativeness: u32,
    /// Professionalism score (0-100)
    pub professionalism: u32,
}
537
/// Multi-provider transcript service.
///
/// Holds a list of [`TranscriptProvider`] backends and, optionally, an
/// [`LlmTranscriptAnalyzer`] for post-processing extracted transcripts.
pub struct TranscriptService {
    /// Registered backends, consulted in the order they were added
    providers: Vec<Box<dyn TranscriptProvider>>,
    /// Analyzer used by `extract_and_analyze`; `None` disables analysis
    analyzer: Option<LlmTranscriptAnalyzer>,
}
543
544impl TranscriptService {
545    /// Create a new transcript service with default providers
546    #[must_use]
547    pub fn new() -> Self {
548        Self {
549            providers: vec![Box::new(YouTubeTranscriptProvider::new())],
550            analyzer: None,
551        }
552    }
553
554    /// Create with LLM support for analysis
555    #[must_use]
556    pub fn with_llm(llm: LlmClient) -> Self {
557        Self {
558            providers: vec![Box::new(YouTubeTranscriptProvider::new())],
559            analyzer: Some(LlmTranscriptAnalyzer::new(llm)),
560        }
561    }
562
563    /// Add a custom provider
564    #[must_use]
565    pub fn add_provider(mut self, provider: Box<dyn TranscriptProvider>) -> Self {
566        self.providers.push(provider);
567        self
568    }
569
570    /// Extract transcript from any supported URL
571    pub async fn extract(&self, url: &str) -> Result<TranscriptResult> {
572        for provider in &self.providers {
573            if provider.supports_url(url) {
574                return provider.extract_transcript(url).await;
575            }
576        }
577
578        Err(AiError::Validation(format!(
579            "No provider supports URL: {url}"
580        )))
581    }
582
583    /// Extract and analyze transcript
584    pub async fn extract_and_analyze(
585        &self,
586        url: &str,
587    ) -> Result<(TranscriptResult, Option<TranscriptAnalysis>)> {
588        let transcript = self.extract(url).await?;
589
590        let analysis = if let Some(ref analyzer) = self.analyzer {
591            Some(analyzer.analyze_transcript(&transcript).await?)
592        } else {
593            None
594        };
595
596        Ok((transcript, analysis))
597    }
598
599    /// Check if a URL is supported
600    #[must_use]
601    pub fn supports_url(&self, url: &str) -> bool {
602        self.providers.iter().any(|p| p.supports_url(url))
603    }
604
605    /// Get list of supported platforms
606    #[must_use]
607    pub fn supported_platforms(&self) -> Vec<&str> {
608        self.providers.iter().map(|p| p.name()).collect()
609    }
610}
611
612impl Default for TranscriptService {
613    fn default() -> Self {
614        Self::new()
615    }
616}
617
/// Transcript search functionality.
///
/// A stateless namespace: all operations are associated functions that take
/// the transcript to search as an argument.
pub struct TranscriptSearch;
620
621impl TranscriptSearch {
622    /// Search for text within transcript segments
623    #[must_use]
624    pub fn search(transcript: &TranscriptResult, query: &str) -> Vec<SearchResult> {
625        let query_lower = query.to_lowercase();
626        let mut results = Vec::new();
627
628        for (index, segment) in transcript.segments.iter().enumerate() {
629            let text_lower = segment.text.to_lowercase();
630            if text_lower.contains(&query_lower) {
631                results.push(SearchResult {
632                    segment_index: index,
633                    text: segment.text.clone(),
634                    start_time: segment.start_time,
635                    timestamp: segment.formatted_start(),
636                    context: Self::get_context(transcript, index),
637                });
638            }
639        }
640
641        results
642    }
643
644    /// Get context around a segment
645    fn get_context(transcript: &TranscriptResult, index: usize) -> String {
646        let start = index.saturating_sub(1);
647        let end = (index + 2).min(transcript.segments.len());
648
649        transcript.segments[start..end]
650            .iter()
651            .map(|s| s.text.as_str())
652            .collect::<Vec<_>>()
653            .join(" ")
654    }
655
656    /// Find all timestamps where a topic is discussed
657    #[must_use]
658    pub fn find_topic_timestamps(transcript: &TranscriptResult, topic: &str) -> Vec<TopicMention> {
659        let results = Self::search(transcript, topic);
660
661        results
662            .into_iter()
663            .map(|r| TopicMention {
664                timestamp: r.timestamp,
665                start_seconds: r.start_time,
666                context: r.context,
667            })
668            .collect()
669    }
670}
671
/// A single hit produced by [`TranscriptSearch::search`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Position of the matching segment within the transcript's segment list
    pub segment_index: usize,
    /// Full text of the matching segment
    pub text: String,
    /// Start time of the matching segment, in seconds
    pub start_time: f64,
    /// Start time of the matching segment as a formatted clock string
    pub timestamp: String,
    /// Text of the matching segment together with its neighbouring segments
    pub context: String,
}
686
/// A point in the video where a topic is mentioned, as returned by
/// [`TranscriptSearch::find_topic_timestamps`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopicMention {
    /// Start of the mention as a formatted clock string
    pub timestamp: String,
    /// Start of the mention, in seconds
    pub start_seconds: f64,
    /// Text surrounding the mention
    pub context: String,
}
697
#[cfg(test)]
mod tests {
    use super::*;

    /// Each supported `YouTube` URL shape yields the same video ID, and a
    /// Vimeo URL is classified as Vimeo with its own numeric ID.
    #[test]
    fn test_youtube_id_extraction() {
        let expected_id = Some("dQw4w9WgXcQ".to_string());
        let youtube_urls = [
            "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
            "https://youtu.be/dQw4w9WgXcQ",
            "https://www.youtube.com/embed/dQw4w9WgXcQ",
        ];

        for &url in &youtube_urls {
            let (platform, id) = VideoPlatform::from_url(url);
            assert_eq!(platform, VideoPlatform::YouTube);
            assert_eq!(id, expected_id);
        }

        // A Vimeo link must produce the Vimeo ID, not a YouTube one.
        let (platform, id) = VideoPlatform::from_url("https://vimeo.com/123456");
        assert_eq!(platform, VideoPlatform::Vimeo);
        assert_eq!(id, Some("123456".to_string()));
    }

    /// The platform is detected for known hosts and falls back to `Unknown`.
    #[test]
    fn test_platform_detection() {
        let cases = [
            ("https://youtube.com/watch?v=abc", VideoPlatform::YouTube),
            ("https://vimeo.com/123", VideoPlatform::Vimeo),
            ("https://twitter.com/user/status/123", VideoPlatform::Twitter),
            ("https://example.com/video", VideoPlatform::Unknown),
        ];

        for &(url, expected) in &cases {
            assert_eq!(VideoPlatform::from_url(url).0, expected);
        }
    }

    /// Start times past one hour format as `HH:MM:SS`; the end time is
    /// start plus duration.
    #[test]
    fn test_segment_formatting() {
        let segment = TranscriptSegment {
            text: String::from("Hello world"),
            start_time: 3661.5,
            duration: 2.0,
            speaker: None,
        };

        assert_eq!(segment.formatted_start(), "01:01:01");
        assert_eq!(segment.end_time(), 3663.5);
    }
}
755}