1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
/// Stateless extractor that turns fetched HTML (or a tweet oEmbed fragment)
/// into a `Preview`.
///
/// The type carries no data, so it is free to clone and can be built with
/// either `MetadataExtractor::new()` or `MetadataExtractor::default()`.
#[derive(Debug, Clone, Default)]
pub struct MetadataExtractor;
9
10impl MetadataExtractor {
11 pub fn new() -> Self {
12 Self
13 }
14
15 pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
16 let document = Html::parse_document(html);
17 if is_twitter_url(url) {
18 if let Some(preview) = self.extract_twitter_metadata(&document, url) {
19 return Ok(preview);
20 }
21 }
22 self.extract_generic_metadata(&document, url)
24 }
25
26 fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
27 let selectors = [
28 ("article[data-testid='tweet']", "Article selector"),
29 ("div[data-testid='tweetText']", "Text selector"),
30 ("div[data-testid='tweetPhoto'] img", "Image selector"),
31 ("div[data-testid='videoPlayer']", "Video selector"),
32 ("div[data-testid='User-Name']", "Username selector"),
33 ];
34
35 for (selector_str, desc) in selectors {
37 if let Ok(selector) = Selector::parse(selector_str) {
38 let count = document.select(&selector).count();
39 debug!("{}: Found {} matches", desc, count);
40 }
41 }
42
43 let og_title = self.extract_title(document);
45 let og_description = self.extract_description(document);
46 let og_image = self.extract_image(document);
47
48 debug!("Basic metadata extraction results:");
49 debug!("Title: {:?}", og_title);
50 debug!("Description: {:?}", og_description);
51 debug!("Image: {:?}", og_image);
52
53 Some(Preview {
55 url: url.to_string(),
56 title: og_title,
57 description: og_description,
58 image_url: og_image,
59 site_name: Some("X (formerly Twitter)".to_string()),
60 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
61 })
62 }
63
64 fn extract_generic_metadata(
65 &self,
66 document: &Html,
67 url: &str,
68 ) -> Result<Preview, PreviewError> {
69 let title = self.extract_title(document);
70 let description = self.extract_description(document);
71 let image_url = self.extract_image(document);
72 let favicon = self.extract_favicon(document);
73 let site_name = self.extract_site_name(document);
74
75 Ok(Preview {
76 url: url.to_string(),
77 title,
78 description,
79 image_url,
80 favicon,
81 site_name,
82 })
83 }
84
85 fn extract_title(&self, document: &Html) -> Option<String> {
86 let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
87 let title_selector = Selector::parse("title").ok()?;
88
89 let og_title = document
90 .select(&og_title_selector)
91 .next()
92 .and_then(|el| el.value().attr("content"))
93 .map(|s| s.to_string());
94
95 og_title
97 .or_else(|| {
98 document
99 .select(&title_selector)
100 .next()
101 .map(|el| el.inner_html())
102 })
103 .map(|s| s.trim().to_string())
104 }
105
106 fn extract_description(&self, document: &Html) -> Option<String> {
107 let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
108 let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
109
110 document
111 .select(&og_desc_selector)
112 .next()
113 .and_then(|el| el.value().attr("content"))
114 .or_else(|| {
115 document
116 .select(&meta_desc_selector)
117 .next()
118 .and_then(|el| el.value().attr("content"))
119 })
120 .map(|s| s.trim().to_string())
121 }
122
123 fn extract_image(&self, document: &Html) -> Option<String> {
124 let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;
125
126 document
127 .select(&og_image_selector)
128 .next()
129 .and_then(|el| el.value().attr("content"))
130 .map(|s| s.trim().to_string())
131 }
132
133 fn extract_favicon(&self, document: &Html) -> Option<String> {
134 let favicon_selector =
135 Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
136
137 document
138 .select(&favicon_selector)
139 .next()
140 .and_then(|el| el.value().attr("href"))
141 .map(|s| s.trim().to_string())
142 }
143
144 fn extract_site_name(&self, document: &Html) -> Option<String> {
145 let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
146
147 document
148 .select(&og_site_selector)
149 .next()
150 .and_then(|el| el.value().attr("content"))
151 .map(|s| s.trim().to_string())
152 }
153
154 pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
158 let document = Html::parse_fragment(oembed);
159
160 let text_selector = Selector::parse("p").ok()?;
161 let link_selector = Selector::parse("a").ok()?;
162
163 let tweet_text = document
164 .select(&text_selector)
165 .next()
166 .map(|el| el.text().collect::<String>())
167 .map(|s| s.trim().to_string());
168
169 let image_link = document
170 .select(&link_selector)
171 .find(|a| {
172 a.value()
173 .attr("href")
174 .map(|href| href.contains("t.co"))
175 .unwrap_or(false)
176 })
177 .and_then(|a| a.value().attr("href"))
178 .map(String::from);
179
180 let time = document
181 .select(&link_selector)
182 .last()
183 .map(|el| el.text().collect::<String>());
184
185 Some(Preview {
186 url: String::new(),
187 title: tweet_text.clone(),
188 description: Some(format!(
189 "{}{}",
190 tweet_text.unwrap_or_default(),
191 time.map(|t| format!(" (Posted: {})", t))
192 .unwrap_or_default()
193 )),
194 image_url: image_link,
195 site_name: Some("X (formerly Twitter)".to_string()),
196 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
197 })
198 }
199}