1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4#[cfg(feature = "logging")]
5use tracing::debug;
6
7use crate::utils;
8
/// Stateless helper that turns an HTML document (or an oEmbed HTML fragment)
/// into a `Preview`.
///
/// The type has no fields, so constructing or cloning it is free.
#[derive(Debug, Clone)]
pub struct MetadataExtractor;
12
13impl Default for MetadataExtractor {
14 fn default() -> Self {
15 Self::new()
16 }
17}
18
19impl MetadataExtractor {
20 pub fn new() -> Self {
21 Self
22 }
23
24 pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
25 let document = Html::parse_document(html);
26 if is_twitter_url(url) {
27 if let Some(preview) = self.extract_twitter_metadata(&document, url) {
28 return Ok(preview);
29 }
30 }
31 self.extract_generic_metadata(&document, url)
33 }
34
35 fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
36 let selectors = [
37 ("article[data-testid='tweet']", "Article selector"),
38 ("div[data-testid='tweetText']", "Text selector"),
39 ("div[data-testid='tweetPhoto'] img", "Image selector"),
40 ("div[data-testid='videoPlayer']", "Video selector"),
41 ("div[data-testid='User-Name']", "Username selector"),
42 ];
43
44 for (selector_str, _desc) in selectors {
46 if let Ok(selector) = Selector::parse(selector_str) {
47 #[cfg(feature = "logging")]
48 {
49 let count = document.select(&selector).count();
50 debug!("{}: Found {} matches", _desc, count);
51 }
52 #[cfg(not(feature = "logging"))]
53 {
54 let _count = document.select(&selector).count();
55 }
56 }
57 }
58
59 let og_title = self.extract_title(document);
61 let og_description = self.extract_description(document);
62 let og_image = self.extract_image(document);
63
64 #[cfg(feature = "logging")]
65 {
66 debug!("Basic metadata extraction results:");
67 debug!("Title: {:?}", og_title);
68 debug!("Description: {:?}", og_description);
69 debug!("Image: {:?}", og_image);
70 }
71
72 Some(Preview {
74 url: url.to_string(),
75 title: og_title,
76 description: og_description,
77 image_url: og_image,
78 site_name: Some("X (formerly Twitter)".to_string()),
79 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
80 })
81 }
82
83 fn extract_generic_metadata(
84 &self,
85 document: &Html,
86 url: &str,
87 ) -> Result<Preview, PreviewError> {
88 let title = self.extract_title(document);
89 let description = self.extract_description(document);
90 let image_url = self.extract_image(document);
91 let favicon = self.extract_favicon(document);
92 let site_name = self.extract_site_name(document);
93
94 let host = utils::pickup_host_from_url(url)?;
95
96 let image_url = format_url(image_url, &host);
97
98 let favicon = format_url(favicon, &host);
99
100 Ok(Preview {
101 url: url.to_string(),
102 title,
103 description,
104 image_url,
105 favicon,
106 site_name,
107 })
108 }
109
110 fn extract_title(&self, document: &Html) -> Option<String> {
111 let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
112 let title_selector = Selector::parse("title").ok()?;
113
114 let og_title = document
115 .select(&og_title_selector)
116 .next()
117 .and_then(|el| el.value().attr("content"))
118 .map(|s| s.to_string());
119
120 og_title
122 .or_else(|| {
123 document
124 .select(&title_selector)
125 .next()
126 .map(|el| el.inner_html())
127 })
128 .map(|s| s.trim().to_string())
129 }
130
131 fn extract_description(&self, document: &Html) -> Option<String> {
132 let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
133 let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
134
135 document
136 .select(&og_desc_selector)
137 .next()
138 .and_then(|el| el.value().attr("content"))
139 .or_else(|| {
140 document
141 .select(&meta_desc_selector)
142 .next()
143 .and_then(|el| el.value().attr("content"))
144 })
145 .map(|s| s.trim().to_string())
146 }
147
148 fn extract_image(&self, document: &Html) -> Option<String> {
149 let og_image_selector =
150 Selector::parse("meta[property='og:image'],meta[itemprop='image']").ok()?;
151
152 document
153 .select(&og_image_selector)
154 .next()
155 .and_then(|el| el.value().attr("content"))
156 .map(|s| s.trim().to_string())
157 }
158
159 fn extract_favicon(&self, document: &Html) -> Option<String> {
160 let favicon_selector =
161 Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
162
163 document
164 .select(&favicon_selector)
165 .next()
166 .and_then(|el| el.value().attr("href"))
167 .map(|s| s.trim().to_string())
168 }
169
170 fn extract_site_name(&self, document: &Html) -> Option<String> {
171 let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
172
173 document
174 .select(&og_site_selector)
175 .next()
176 .and_then(|el| el.value().attr("content"))
177 .map(|s| s.trim().to_string())
178 }
179
180 pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
184 let document = Html::parse_fragment(oembed);
185
186 let text_selector = Selector::parse("p").ok()?;
187 let link_selector = Selector::parse("a").ok()?;
188
189 let tweet_text = document
190 .select(&text_selector)
191 .next()
192 .map(|el| el.text().collect::<String>())
193 .map(|s| s.trim().to_string());
194
195 let image_link = document
196 .select(&link_selector)
197 .find(|a| {
198 a.value()
199 .attr("href")
200 .map(|href| href.contains("t.co"))
201 .unwrap_or(false)
202 })
203 .and_then(|a| a.value().attr("href"))
204 .map(String::from);
205
206 let time = document
207 .select(&link_selector)
208 .next_back()
209 .map(|el| el.text().collect::<String>());
210
211 Some(Preview {
212 url: String::new(),
213 title: tweet_text.clone(),
214 description: Some(format!(
215 "{}{}",
216 tweet_text.unwrap_or_default(),
217 time.map(|t| format!(" (Posted: {t})")).unwrap_or_default()
218 )),
219 image_url: image_link,
220 site_name: Some("X (formerly Twitter)".to_string()),
221 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
222 })
223 }
224}
225
/// Resolves a possibly-relative resource reference against `host`.
///
/// `host` is the base the reference is joined onto. The caller in this file
/// passes the value returned by `utils::pickup_host_from_url` (presumably of
/// the form `scheme://authority`; confirm against that helper).
///
/// Resolution rules:
///
/// * `None` or an empty reference yields `None`.
/// * `http://`, `https://` and `data:` references are returned unchanged.
/// * Protocol-relative references (`//cdn.example.com/x`) inherit the scheme
///   of `host`, falling back to `https` when `host` carries no scheme.
/// * Anything else is appended to `host` with exactly one `/` between them.
fn format_url(url: Option<String>, host: &str) -> Option<String> {
    /// Returns `true` when `url` is already usable without a base.
    fn is_absolute_url(url: &str) -> bool {
        url.starts_with("http://") || url.starts_with("https://") || url.starts_with("data:")
    }

    let url = url?;

    // An empty reference names no resource; joining it would just yield the
    // site root, which is never a valid image or favicon.
    if url.is_empty() {
        return None;
    }

    if is_absolute_url(&url) {
        return Some(url);
    }

    // Must be checked before the single-slash case below, otherwise the
    // reference would be glued onto the host as `https://host//cdn...`.
    if let Some(rest) = url.strip_prefix("//") {
        let scheme = host.split_once("://").map_or("https", |(scheme, _)| scheme);
        return Some(format!("{scheme}://{rest}"));
    }

    // Tolerate a trailing slash on the base so the join never doubles it.
    let base = host.trim_end_matches('/');

    if url.starts_with('/') {
        Some(format!("{base}{url}"))
    } else {
        Some(format!("{base}/{url}"))
    }
}