synaptic_loaders/
youtube.rs1use async_trait::async_trait;
2use serde_json::Value;
3use std::collections::HashMap;
4use synaptic_core::{Document, Loader, SynapticError};
5
6pub struct YoutubeLoader {
8 client: reqwest::Client,
9 video_ids: Vec<String>,
10 language: String,
11}
12
13impl YoutubeLoader {
14 pub fn new(video_ids: Vec<String>) -> Self {
15 Self {
16 client: reqwest::Client::new(),
17 video_ids,
18 language: "en".to_string(),
19 }
20 }
21
22 pub fn with_language(mut self, lang: impl Into<String>) -> Self {
23 self.language = lang.into();
24 self
25 }
26
27 async fn fetch_transcript(&self, video_id: &str) -> Result<String, SynapticError> {
28 let url = format!(
29 "https://www.youtube.com/api/timedtext?v={}&lang={}&fmt=json3",
30 video_id, self.language
31 );
32 let resp = self
33 .client
34 .get(&url)
35 .send()
36 .await
37 .map_err(|e| SynapticError::Loader(format!("YouTube fetch: {e}")))?;
38 let body: Value = resp
39 .json()
40 .await
41 .map_err(|e| SynapticError::Loader(format!("YouTube parse: {e}")))?;
42
43 let text = body["events"]
44 .as_array()
45 .map(|events| {
46 events
47 .iter()
48 .filter_map(|event| {
49 event["segs"].as_array().map(|segs| {
50 segs.iter()
51 .filter_map(|seg| seg["utf8"].as_str())
52 .collect::<Vec<_>>()
53 .join("")
54 })
55 })
56 .filter(|s| !s.trim().is_empty())
57 .collect::<Vec<_>>()
58 .join(" ")
59 })
60 .unwrap_or_default();
61
62 Ok(text)
63 }
64
65 async fn fetch_title(&self, video_id: &str) -> Option<String> {
66 let url = format!(
67 "https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={}&format=json",
68 video_id
69 );
70 self.client
71 .get(&url)
72 .send()
73 .await
74 .ok()?
75 .json::<Value>()
76 .await
77 .ok()?["title"]
78 .as_str()
79 .map(|s| s.to_string())
80 }
81}
82
83#[async_trait]
84impl Loader for YoutubeLoader {
85 async fn load(&self) -> Result<Vec<Document>, SynapticError> {
86 let mut documents = Vec::new();
87 for video_id in &self.video_ids {
88 let content = self.fetch_transcript(video_id).await?;
89 if content.is_empty() {
90 continue;
91 }
92 let title = self.fetch_title(video_id).await;
93 let mut metadata = HashMap::new();
94 metadata.insert(
95 "source".to_string(),
96 Value::String(format!("youtube:{}", video_id)),
97 );
98 metadata.insert(
99 "url".to_string(),
100 Value::String(format!("https://www.youtube.com/watch?v={}", video_id)),
101 );
102 if let Some(t) = title {
103 metadata.insert("title".to_string(), Value::String(t));
104 }
105 documents.push(Document {
106 id: video_id.clone(),
107 content,
108 metadata,
109 });
110 }
111 Ok(documents)
112 }
113}