url_preview/
extractor.rs

1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4#[cfg(feature = "logging")]
5use tracing::debug;
6
7use crate::utils;
8
/// Metadata extractor, responsible for extracting preview information from webpage content.
///
/// The type carries no state: it is a unit struct, so constructing and cloning it is free.
/// `Debug` is derived so the extractor can be logged or embedded in other `Debug` types.
#[derive(Clone, Debug)]
pub struct MetadataExtractor;
12
13impl Default for MetadataExtractor {
14    fn default() -> Self {
15        Self::new()
16    }
17}
18
19impl MetadataExtractor {
20    pub fn new() -> Self {
21        Self
22    }
23
24    pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
25        let document = Html::parse_document(html);
26        if is_twitter_url(url) {
27            if let Some(preview) = self.extract_twitter_metadata(&document, url) {
28                return Ok(preview);
29            }
30        }
31        // If not a Twitter URL or Twitter extraction failed, use generic extraction method
32        self.extract_generic_metadata(&document, url)
33    }
34
35    fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
36        let selectors = [
37            ("article[data-testid='tweet']", "Article selector"),
38            ("div[data-testid='tweetText']", "Text selector"),
39            ("div[data-testid='tweetPhoto'] img", "Image selector"),
40            ("div[data-testid='videoPlayer']", "Video selector"),
41            ("div[data-testid='User-Name']", "Username selector"),
42        ];
43
44        // Print matching results for all selectors
45        for (selector_str, _desc) in selectors {
46            if let Ok(selector) = Selector::parse(selector_str) {
47                #[cfg(feature = "logging")]
48                {
49                    let count = document.select(&selector).count();
50                    debug!("{}: Found {} matches", _desc, count);
51                }
52                #[cfg(not(feature = "logging"))]
53                {
54                    let _count = document.select(&selector).count();
55                }
56            }
57        }
58
59        // Try to extract basic metadata
60        let og_title = self.extract_title(document);
61        let og_description = self.extract_description(document);
62        let og_image = self.extract_image(document);
63
64        #[cfg(feature = "logging")]
65        {
66            debug!("Basic metadata extraction results:");
67            debug!("Title: {:?}", og_title);
68            debug!("Description: {:?}", og_description);
69            debug!("Image: {:?}", og_image);
70        }
71
72        // Return basic info even if specific tweet elements not found
73        Some(Preview {
74            url: url.to_string(),
75            title: og_title,
76            description: og_description,
77            image_url: og_image,
78            site_name: Some("X (formerly Twitter)".to_string()),
79            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
80        })
81    }
82
83    fn extract_generic_metadata(
84        &self,
85        document: &Html,
86        url: &str,
87    ) -> Result<Preview, PreviewError> {
88        let title = self.extract_title(document);
89        let description = self.extract_description(document);
90        let image_url = self.extract_image(document);
91        let favicon = self.extract_favicon(document);
92        let site_name = self.extract_site_name(document);
93
94        let host = utils::pickup_host_from_url(url)?;
95
96        let image_url = format_url(image_url, &host);
97
98        let favicon = format_url(favicon, &host);
99
100        Ok(Preview {
101            url: url.to_string(),
102            title,
103            description,
104            image_url,
105            favicon,
106            site_name,
107        })
108    }
109
110    fn extract_title(&self, document: &Html) -> Option<String> {
111        let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
112        let title_selector = Selector::parse("title").ok()?;
113
114        let og_title = document
115            .select(&og_title_selector)
116            .next()
117            .and_then(|el| el.value().attr("content"))
118            .map(|s| s.to_string());
119
120        // If there is no Open Graph title, try to get the regular title
121        og_title
122            .or_else(|| {
123                document
124                    .select(&title_selector)
125                    .next()
126                    .map(|el| el.inner_html())
127            })
128            .map(|s| s.trim().to_string())
129    }
130
131    fn extract_description(&self, document: &Html) -> Option<String> {
132        let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
133        let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
134
135        document
136            .select(&og_desc_selector)
137            .next()
138            .and_then(|el| el.value().attr("content"))
139            .or_else(|| {
140                document
141                    .select(&meta_desc_selector)
142                    .next()
143                    .and_then(|el| el.value().attr("content"))
144            })
145            .map(|s| s.trim().to_string())
146    }
147
148    fn extract_image(&self, document: &Html) -> Option<String> {
149        let og_image_selector =
150            Selector::parse("meta[property='og:image'],meta[itemprop='image']").ok()?;
151
152        document
153            .select(&og_image_selector)
154            .next()
155            .and_then(|el| el.value().attr("content"))
156            .map(|s| s.trim().to_string())
157    }
158
159    fn extract_favicon(&self, document: &Html) -> Option<String> {
160        let favicon_selector =
161            Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
162
163        document
164            .select(&favicon_selector)
165            .next()
166            .and_then(|el| el.value().attr("href"))
167            .map(|s| s.trim().to_string())
168    }
169
170    fn extract_site_name(&self, document: &Html) -> Option<String> {
171        let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
172
173        document
174            .select(&og_site_selector)
175            .next()
176            .and_then(|el| el.value().attr("content"))
177            .map(|s| s.trim().to_string())
178    }
179
180    /// Create a preview from oEmbed data.
181    ///
182    /// Takes oEmbed HTML content as a string and extracts relevant metadata to create a preview.
183    pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
184        let document = Html::parse_fragment(oembed);
185
186        let text_selector = Selector::parse("p").ok()?;
187        let link_selector = Selector::parse("a").ok()?;
188
189        let tweet_text = document
190            .select(&text_selector)
191            .next()
192            .map(|el| el.text().collect::<String>())
193            .map(|s| s.trim().to_string());
194
195        let image_link = document
196            .select(&link_selector)
197            .find(|a| {
198                a.value()
199                    .attr("href")
200                    .map(|href| href.contains("t.co"))
201                    .unwrap_or(false)
202            })
203            .and_then(|a| a.value().attr("href"))
204            .map(String::from);
205
206        let time = document
207            .select(&link_selector)
208            .next_back()
209            .map(|el| el.text().collect::<String>());
210
211        Some(Preview {
212            url: String::new(),
213            title: tweet_text.clone(),
214            description: Some(format!(
215                "{}{}",
216                tweet_text.unwrap_or_default(),
217                time.map(|t| format!(" (Posted: {t})")).unwrap_or_default()
218            )),
219            image_url: image_link,
220            site_name: Some("X (formerly Twitter)".to_string()),
221            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
222        })
223    }
224}
225
/// Resolves a URL found in a page against the page's `host`.
///
/// `host` is concatenated as a prefix, so it is assumed to be an origin such
/// as `https://example.com` (NOTE(review): confirm against
/// `utils::pickup_host_from_url`). A trailing `/` on `host` is ignored.
///
/// * `None` or an empty string yields `None`.
/// * A URL that already has a scheme (`https:`, `http:`, `data:`, ...) is
///   returned unchanged.
/// * A protocol-relative URL (`//cdn.example.com/x.png`) receives the scheme
///   of `host`, or `https` when `host` has none.
/// * Anything else is treated as a path and appended to `host` with exactly
///   one `/` between them.
fn format_url(url: Option<String>, host: &str) -> Option<String> {
    /// True when `url` starts with an RFC 3986 scheme followed by `:`.
    fn is_absolute_url(url: &str) -> bool {
        match url.split_once(':') {
            Some((scheme, _)) => {
                let mut chars = scheme.chars();
                matches!(chars.next(), Some(c) if c.is_ascii_alphabetic())
                    && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
            }
            None => false,
        }
    }

    let url = url?;
    if url.is_empty() {
        // An empty reference points back at the page itself, never at an asset.
        return None;
    }
    if is_absolute_url(&url) {
        return Some(url);
    }

    let host = host.trim_end_matches('/');

    if let Some(rest) = url.strip_prefix("//") {
        // Protocol-relative: keep the authority from `url`, borrow only the scheme.
        let scheme = host.split_once("://").map_or("https", |(scheme, _)| scheme);
        return Some(format!("{scheme}://{rest}"));
    }

    if url.starts_with('/') {
        Some(format!("{host}{url}"))
    } else {
        Some(format!("{host}/{url}"))
    }
}