1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
/// Stateless extractor that turns an HTML document (or a tweet oEmbed fragment)
/// into a `Preview`.
///
/// The type carries no data, so cloning it is free and every instance behaves
/// identically.
#[derive(Clone, Debug)]
pub struct MetadataExtractor;
9
10impl Default for MetadataExtractor {
11 fn default() -> Self {
12 Self::new()
13 }
14}
15
16impl MetadataExtractor {
17 pub fn new() -> Self {
18 Self
19 }
20
21 pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
22 let document = Html::parse_document(html);
23 if is_twitter_url(url) {
24 if let Some(preview) = self.extract_twitter_metadata(&document, url) {
25 return Ok(preview);
26 }
27 }
28 self.extract_generic_metadata(&document, url)
30 }
31
32 fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
33 let selectors = [
34 ("article[data-testid='tweet']", "Article selector"),
35 ("div[data-testid='tweetText']", "Text selector"),
36 ("div[data-testid='tweetPhoto'] img", "Image selector"),
37 ("div[data-testid='videoPlayer']", "Video selector"),
38 ("div[data-testid='User-Name']", "Username selector"),
39 ];
40
41 for (selector_str, desc) in selectors {
43 if let Ok(selector) = Selector::parse(selector_str) {
44 let count = document.select(&selector).count();
45 debug!("{}: Found {} matches", desc, count);
46 }
47 }
48
49 let og_title = self.extract_title(document);
51 let og_description = self.extract_description(document);
52 let og_image = self.extract_image(document);
53
54 debug!("Basic metadata extraction results:");
55 debug!("Title: {:?}", og_title);
56 debug!("Description: {:?}", og_description);
57 debug!("Image: {:?}", og_image);
58
59 Some(Preview {
61 url: url.to_string(),
62 title: og_title,
63 description: og_description,
64 image_url: og_image,
65 site_name: Some("X (formerly Twitter)".to_string()),
66 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
67 })
68 }
69
70 fn extract_generic_metadata(
71 &self,
72 document: &Html,
73 url: &str,
74 ) -> Result<Preview, PreviewError> {
75 let title = self.extract_title(document);
76 let description = self.extract_description(document);
77 let image_url = self.extract_image(document);
78 let favicon = self.extract_favicon(document);
79 let site_name = self.extract_site_name(document);
80
81 Ok(Preview {
82 url: url.to_string(),
83 title,
84 description,
85 image_url,
86 favicon,
87 site_name,
88 })
89 }
90
91 fn extract_title(&self, document: &Html) -> Option<String> {
92 let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
93 let title_selector = Selector::parse("title").ok()?;
94
95 let og_title = document
96 .select(&og_title_selector)
97 .next()
98 .and_then(|el| el.value().attr("content"))
99 .map(|s| s.to_string());
100
101 og_title
103 .or_else(|| {
104 document
105 .select(&title_selector)
106 .next()
107 .map(|el| el.inner_html())
108 })
109 .map(|s| s.trim().to_string())
110 }
111
112 fn extract_description(&self, document: &Html) -> Option<String> {
113 let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
114 let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
115
116 document
117 .select(&og_desc_selector)
118 .next()
119 .and_then(|el| el.value().attr("content"))
120 .or_else(|| {
121 document
122 .select(&meta_desc_selector)
123 .next()
124 .and_then(|el| el.value().attr("content"))
125 })
126 .map(|s| s.trim().to_string())
127 }
128
129 fn extract_image(&self, document: &Html) -> Option<String> {
130 let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;
131
132 document
133 .select(&og_image_selector)
134 .next()
135 .and_then(|el| el.value().attr("content"))
136 .map(|s| s.trim().to_string())
137 }
138
139 fn extract_favicon(&self, document: &Html) -> Option<String> {
140 let favicon_selector =
141 Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
142
143 document
144 .select(&favicon_selector)
145 .next()
146 .and_then(|el| el.value().attr("href"))
147 .map(|s| s.trim().to_string())
148 }
149
150 fn extract_site_name(&self, document: &Html) -> Option<String> {
151 let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
152
153 document
154 .select(&og_site_selector)
155 .next()
156 .and_then(|el| el.value().attr("content"))
157 .map(|s| s.trim().to_string())
158 }
159
160 pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
164 let document = Html::parse_fragment(oembed);
165
166 let text_selector = Selector::parse("p").ok()?;
167 let link_selector = Selector::parse("a").ok()?;
168
169 let tweet_text = document
170 .select(&text_selector)
171 .next()
172 .map(|el| el.text().collect::<String>())
173 .map(|s| s.trim().to_string());
174
175 let image_link = document
176 .select(&link_selector)
177 .find(|a| {
178 a.value()
179 .attr("href")
180 .map(|href| href.contains("t.co"))
181 .unwrap_or(false)
182 })
183 .and_then(|a| a.value().attr("href"))
184 .map(String::from);
185
186 let time = document
187 .select(&link_selector)
188 .last()
189 .map(|el| el.text().collect::<String>());
190
191 Some(Preview {
192 url: String::new(),
193 title: tweet_text.clone(),
194 description: Some(format!(
195 "{}{}",
196 tweet_text.unwrap_or_default(),
197 time.map(|t| format!(" (Posted: {})", t))
198 .unwrap_or_default()
199 )),
200 image_url: image_link,
201 site_name: Some("X (formerly Twitter)".to_string()),
202 favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
203 })
204 }
205}