Skip to main content

synaptic_loaders/
youtube.rs

1use async_trait::async_trait;
2use serde_json::Value;
3use std::collections::HashMap;
4use synaptic_core::{Document, Loader, SynapticError};
5
6/// Loader for YouTube video transcripts.
7pub struct YoutubeLoader {
8    client: reqwest::Client,
9    video_ids: Vec<String>,
10    language: String,
11}
12
13impl YoutubeLoader {
14    pub fn new(video_ids: Vec<String>) -> Self {
15        Self {
16            client: reqwest::Client::new(),
17            video_ids,
18            language: "en".to_string(),
19        }
20    }
21
22    pub fn with_language(mut self, lang: impl Into<String>) -> Self {
23        self.language = lang.into();
24        self
25    }
26
27    async fn fetch_transcript(&self, video_id: &str) -> Result<String, SynapticError> {
28        let url = format!(
29            "https://www.youtube.com/api/timedtext?v={}&lang={}&fmt=json3",
30            video_id, self.language
31        );
32        let resp = self
33            .client
34            .get(&url)
35            .send()
36            .await
37            .map_err(|e| SynapticError::Loader(format!("YouTube fetch: {e}")))?;
38        let body: Value = resp
39            .json()
40            .await
41            .map_err(|e| SynapticError::Loader(format!("YouTube parse: {e}")))?;
42
43        let text = body["events"]
44            .as_array()
45            .map(|events| {
46                events
47                    .iter()
48                    .filter_map(|event| {
49                        event["segs"].as_array().map(|segs| {
50                            segs.iter()
51                                .filter_map(|seg| seg["utf8"].as_str())
52                                .collect::<Vec<_>>()
53                                .join("")
54                        })
55                    })
56                    .filter(|s| !s.trim().is_empty())
57                    .collect::<Vec<_>>()
58                    .join(" ")
59            })
60            .unwrap_or_default();
61
62        Ok(text)
63    }
64
65    async fn fetch_title(&self, video_id: &str) -> Option<String> {
66        let url = format!(
67            "https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={}&format=json",
68            video_id
69        );
70        self.client
71            .get(&url)
72            .send()
73            .await
74            .ok()?
75            .json::<Value>()
76            .await
77            .ok()?["title"]
78            .as_str()
79            .map(|s| s.to_string())
80    }
81}
82
83#[async_trait]
84impl Loader for YoutubeLoader {
85    async fn load(&self) -> Result<Vec<Document>, SynapticError> {
86        let mut documents = Vec::new();
87        for video_id in &self.video_ids {
88            let content = self.fetch_transcript(video_id).await?;
89            if content.is_empty() {
90                continue;
91            }
92            let title = self.fetch_title(video_id).await;
93            let mut metadata = HashMap::new();
94            metadata.insert(
95                "source".to_string(),
96                Value::String(format!("youtube:{}", video_id)),
97            );
98            metadata.insert(
99                "url".to_string(),
100                Value::String(format!("https://www.youtube.com/watch?v={}", video_id)),
101            );
102            if let Some(t) = title {
103                metadata.insert("title".to_string(), Value::String(t));
104            }
105            documents.push(Document {
106                id: video_id.clone(),
107                content,
108                metadata,
109            });
110        }
111        Ok(documents)
112    }
113}