1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
6use crate::utils;
7
/// Stateless extractor that turns HTML metadata into a [`Preview`].
///
/// The type is a unit struct: it holds no configuration or cached state, so
/// cloning it is free and any number of instances behave identically.
#[derive(Debug, Clone)]
pub struct MetadataExtractor;
11
12impl Default for MetadataExtractor {
13 fn default() -> Self {
14 Self::new()
15 }
16}
17
18impl MetadataExtractor {
19 pub fn new() -> Self {
20 Self
21 }
22
23 pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
24 let document = Html::parse_document(html);
25 if is_twitter_url(url) {
26 if let Some(preview) = self.extract_twitter_metadata(&document, url) {
27 return Ok(preview);
28 }
29 }
30 self.extract_generic_metadata(&document, url)
32 }
33
34 fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
35 let selectors = [
36 ("article[data-testid='tweet']", "Article selector"),
37 ("div[data-testid='tweetText']", "Text selector"),
38 ("div[data-testid='tweetPhoto'] img", "Image selector"),
39 ("div[data-testid='videoPlayer']", "Video selector"),
40 ("div[data-testid='User-Name']", "Username selector"),
41 ];
42
43 for (selector_str, desc) in selectors {
45 if let Ok(selector) = Selector::parse(selector_str) {
46 let count = document.select(&selector).count();
47 debug!("{}: Found {} matches", desc, count);
48 }
49 }
50
51 let og_title = self.extract_title(document);
53 let og_description = self.extract_description(document);
54 let og_image = self.extract_image(document);
55
56 debug!("Basic metadata extraction results:");
57 debug!("Title: {:?}", og_title);
58 debug!("Description: {:?}", og_description);
59 debug!("Image: {:?}", og_image);
60
61 Some(Preview {
63 url: url.to_string(),
64 title: og_title,
65 description: og_description,
66 image_url: og_image,
67 site_name: Some("X (formerly Twitter)".to_string()),
68 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
69 })
70 }
71
72 fn extract_generic_metadata(
73 &self,
74 document: &Html,
75 url: &str,
76 ) -> Result<Preview, PreviewError> {
77 let title = self.extract_title(document);
78 let description = self.extract_description(document);
79 let image_url = self.extract_image(document);
80 let favicon = self.extract_favicon(document);
81 let site_name = self.extract_site_name(document);
82
83 let host = utils::pickup_host_from_url(url)?;
84
85 let image_url = if let Some(url) = image_url {
86 if !url.starts_with(&host) {
87 Some(format!("{}{}", host, url))
88 } else {
89 Some(url)
90 }
91 } else {
92 None
93 };
94 let favicon = if let Some(url) = favicon {
95 if !url.starts_with(&host) {
96 Some(format!("{}{}", host, url))
97 } else {
98 Some(url)
99 }
100 } else {
101 None
102 };
103
104 Ok(Preview {
105 url: url.to_string(),
106 title,
107 description,
108 image_url,
109 favicon,
110 site_name,
111 })
112 }
113
114 fn extract_title(&self, document: &Html) -> Option<String> {
115 let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
116 let title_selector = Selector::parse("title").ok()?;
117
118 let og_title = document
119 .select(&og_title_selector)
120 .next()
121 .and_then(|el| el.value().attr("content"))
122 .map(|s| s.to_string());
123
124 og_title
126 .or_else(|| {
127 document
128 .select(&title_selector)
129 .next()
130 .map(|el| el.inner_html())
131 })
132 .map(|s| s.trim().to_string())
133 }
134
135 fn extract_description(&self, document: &Html) -> Option<String> {
136 let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
137 let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
138
139 document
140 .select(&og_desc_selector)
141 .next()
142 .and_then(|el| el.value().attr("content"))
143 .or_else(|| {
144 document
145 .select(&meta_desc_selector)
146 .next()
147 .and_then(|el| el.value().attr("content"))
148 })
149 .map(|s| s.trim().to_string())
150 }
151
152 fn extract_image(&self, document: &Html) -> Option<String> {
153 let og_image_selector =
154 Selector::parse("meta[property='og:image'],meta[itemprop='image']").ok()?;
155
156 document
157 .select(&og_image_selector)
158 .next()
159 .and_then(|el| el.value().attr("content"))
160 .map(|s| s.trim().to_string())
161 }
162
163 fn extract_favicon(&self, document: &Html) -> Option<String> {
164 let favicon_selector =
165 Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
166
167 document
168 .select(&favicon_selector)
169 .next()
170 .and_then(|el| el.value().attr("href"))
171 .map(|s| s.trim().to_string())
172 }
173
174 fn extract_site_name(&self, document: &Html) -> Option<String> {
175 let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
176
177 document
178 .select(&og_site_selector)
179 .next()
180 .and_then(|el| el.value().attr("content"))
181 .map(|s| s.trim().to_string())
182 }
183
184 pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
188 let document = Html::parse_fragment(oembed);
189
190 let text_selector = Selector::parse("p").ok()?;
191 let link_selector = Selector::parse("a").ok()?;
192
193 let tweet_text = document
194 .select(&text_selector)
195 .next()
196 .map(|el| el.text().collect::<String>())
197 .map(|s| s.trim().to_string());
198
199 let image_link = document
200 .select(&link_selector)
201 .find(|a| {
202 a.value()
203 .attr("href")
204 .map(|href| href.contains("t.co"))
205 .unwrap_or(false)
206 })
207 .and_then(|a| a.value().attr("href"))
208 .map(String::from);
209
210 let time = document
211 .select(&link_selector)
212 .last()
213 .map(|el| el.text().collect::<String>());
214
215 Some(Preview {
216 url: String::new(),
217 title: tweet_text.clone(),
218 description: Some(format!(
219 "{}{}",
220 tweet_text.unwrap_or_default(),
221 time.map(|t| format!(" (Posted: {})", t))
222 .unwrap_or_default()
223 )),
224 image_url: image_link,
225 site_name: Some("X (formerly Twitter)".to_string()),
226 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
227 })
228 }
229}