1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
6use crate::utils;
7
/// Stateless extractor that turns an HTML document (or a Twitter/X oEmbed
/// fragment) into a `Preview`.
///
/// The type has no fields, so cloning it is free and a single instance can be
/// shared by any number of callers.
#[derive(Debug, Clone)]
pub struct MetadataExtractor;
11
12impl Default for MetadataExtractor {
13 fn default() -> Self {
14 Self::new()
15 }
16}
17
18impl MetadataExtractor {
19 pub fn new() -> Self {
20 Self
21 }
22
23 pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
24 let document = Html::parse_document(html);
25 if is_twitter_url(url) {
26 if let Some(preview) = self.extract_twitter_metadata(&document, url) {
27 return Ok(preview);
28 }
29 }
30 self.extract_generic_metadata(&document, url)
32 }
33
34 fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
35 let selectors = [
36 ("article[data-testid='tweet']", "Article selector"),
37 ("div[data-testid='tweetText']", "Text selector"),
38 ("div[data-testid='tweetPhoto'] img", "Image selector"),
39 ("div[data-testid='videoPlayer']", "Video selector"),
40 ("div[data-testid='User-Name']", "Username selector"),
41 ];
42
43 for (selector_str, desc) in selectors {
45 if let Ok(selector) = Selector::parse(selector_str) {
46 let count = document.select(&selector).count();
47 debug!("{}: Found {} matches", desc, count);
48 }
49 }
50
51 let og_title = self.extract_title(document);
53 let og_description = self.extract_description(document);
54 let og_image = self.extract_image(document);
55
56 debug!("Basic metadata extraction results:");
57 debug!("Title: {:?}", og_title);
58 debug!("Description: {:?}", og_description);
59 debug!("Image: {:?}", og_image);
60
61 Some(Preview {
63 url: url.to_string(),
64 title: og_title,
65 description: og_description,
66 image_url: og_image,
67 site_name: Some("X (formerly Twitter)".to_string()),
68 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
69 })
70 }
71
72 fn extract_generic_metadata(
73 &self,
74 document: &Html,
75 url: &str,
76 ) -> Result<Preview, PreviewError> {
77 let title = self.extract_title(document);
78 let description = self.extract_description(document);
79 let image_url = self.extract_image(document);
80 let favicon = self.extract_favicon(document);
81 let site_name = self.extract_site_name(document);
82
83 let host = utils::pickup_host_from_url(url)?;
84
85 let image_url = format_url(image_url, &host);
86
87 let favicon = format_url(favicon, &host);
88
89 Ok(Preview {
90 url: url.to_string(),
91 title,
92 description,
93 image_url,
94 favicon,
95 site_name,
96 })
97 }
98
99 fn extract_title(&self, document: &Html) -> Option<String> {
100 let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
101 let title_selector = Selector::parse("title").ok()?;
102
103 let og_title = document
104 .select(&og_title_selector)
105 .next()
106 .and_then(|el| el.value().attr("content"))
107 .map(|s| s.to_string());
108
109 og_title
111 .or_else(|| {
112 document
113 .select(&title_selector)
114 .next()
115 .map(|el| el.inner_html())
116 })
117 .map(|s| s.trim().to_string())
118 }
119
120 fn extract_description(&self, document: &Html) -> Option<String> {
121 let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
122 let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
123
124 document
125 .select(&og_desc_selector)
126 .next()
127 .and_then(|el| el.value().attr("content"))
128 .or_else(|| {
129 document
130 .select(&meta_desc_selector)
131 .next()
132 .and_then(|el| el.value().attr("content"))
133 })
134 .map(|s| s.trim().to_string())
135 }
136
137 fn extract_image(&self, document: &Html) -> Option<String> {
138 let og_image_selector =
139 Selector::parse("meta[property='og:image'],meta[itemprop='image']").ok()?;
140
141 document
142 .select(&og_image_selector)
143 .next()
144 .and_then(|el| el.value().attr("content"))
145 .map(|s| s.trim().to_string())
146 }
147
148 fn extract_favicon(&self, document: &Html) -> Option<String> {
149 let favicon_selector =
150 Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
151
152 document
153 .select(&favicon_selector)
154 .next()
155 .and_then(|el| el.value().attr("href"))
156 .map(|s| s.trim().to_string())
157 }
158
159 fn extract_site_name(&self, document: &Html) -> Option<String> {
160 let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
161
162 document
163 .select(&og_site_selector)
164 .next()
165 .and_then(|el| el.value().attr("content"))
166 .map(|s| s.trim().to_string())
167 }
168
169 pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
173 let document = Html::parse_fragment(oembed);
174
175 let text_selector = Selector::parse("p").ok()?;
176 let link_selector = Selector::parse("a").ok()?;
177
178 let tweet_text = document
179 .select(&text_selector)
180 .next()
181 .map(|el| el.text().collect::<String>())
182 .map(|s| s.trim().to_string());
183
184 let image_link = document
185 .select(&link_selector)
186 .find(|a| {
187 a.value()
188 .attr("href")
189 .map(|href| href.contains("t.co"))
190 .unwrap_or(false)
191 })
192 .and_then(|a| a.value().attr("href"))
193 .map(String::from);
194
195 let time = document
196 .select(&link_selector)
197 .last()
198 .map(|el| el.text().collect::<String>());
199
200 Some(Preview {
201 url: String::new(),
202 title: tweet_text.clone(),
203 description: Some(format!(
204 "{}{}",
205 tweet_text.unwrap_or_default(),
206 time.map(|t| format!(" (Posted: {})", t))
207 .unwrap_or_default()
208 )),
209 image_url: image_link,
210 site_name: Some("X (formerly Twitter)".to_string()),
211 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
212 })
213 }
214}
215
/// Resolves a URL reference found in a document against the page's `host`.
///
/// `host` is used as a plain prefix, so it is expected to look like
/// `https://example.com` (scheme plus authority, no path) — presumably what
/// `utils::pickup_host_from_url` returns; verify against that helper.
///
/// Resolution rules:
/// * `None`, or a blank string, yields `None`;
/// * `http://` / `https://` URLs (scheme compared case-insensitively) and
///   `data:` URIs are returned unchanged;
/// * protocol-relative references (`//cdn.example.com/x.png`) receive the
///   scheme of `host`, or `https` when `host` carries no scheme;
/// * everything else is appended to `host` with exactly one `/` in between.
fn format_url(url: Option<String>, host: &str) -> Option<String> {
    // ASCII case-insensitive prefix test; `get` keeps it panic-free when the
    // cut would fall outside the string or inside a multi-byte character.
    fn has_prefix_ignore_case(value: &str, prefix: &str) -> bool {
        value
            .get(..prefix.len())
            .map_or(false, |head| head.eq_ignore_ascii_case(prefix))
    }

    // References that already stand on their own and must not be prefixed.
    fn is_standalone(url: &str) -> bool {
        ["http://", "https://", "data:"]
            .iter()
            .any(|prefix| has_prefix_ignore_case(url, prefix))
    }

    let raw = url?;
    let url = raw.trim();
    if url.is_empty() {
        return None;
    }
    if is_standalone(url) {
        return Some(url.to_string());
    }

    // Network-path reference: only the scheme is inherited from the page.
    if let Some(rest) = url.strip_prefix("//") {
        let scheme = host.split_once("://").map_or("https", |(scheme, _)| scheme);
        return Some(format!("{}://{}", scheme, rest));
    }

    // Avoid a doubled slash when `host` already ends with one.
    let base = host.trim_end_matches('/');
    if url.starts_with('/') {
        Some(format!("{}{}", base, url))
    } else {
        Some(format!("{}/{}", base, url))
    }
}