url_preview/
extractor.rs

1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
6use crate::utils;
7
/// Metadata extractor, responsible for extracting preview information from webpage content.
///
/// The type is a stateless unit struct: it holds no configuration, so cloning it is free and
/// every instance behaves identically.
#[derive(Debug, Clone)]
pub struct MetadataExtractor;
11
12impl Default for MetadataExtractor {
13    fn default() -> Self {
14        Self::new()
15    }
16}
17
18impl MetadataExtractor {
19    pub fn new() -> Self {
20        Self
21    }
22
23    pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
24        let document = Html::parse_document(html);
25        if is_twitter_url(url) {
26            if let Some(preview) = self.extract_twitter_metadata(&document, url) {
27                return Ok(preview);
28            }
29        }
30        // If not a Twitter URL or Twitter extraction failed, use generic extraction method
31        self.extract_generic_metadata(&document, url)
32    }
33
34    fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
35        let selectors = [
36            ("article[data-testid='tweet']", "Article selector"),
37            ("div[data-testid='tweetText']", "Text selector"),
38            ("div[data-testid='tweetPhoto'] img", "Image selector"),
39            ("div[data-testid='videoPlayer']", "Video selector"),
40            ("div[data-testid='User-Name']", "Username selector"),
41        ];
42
43        // Print matching results for all selectors
44        for (selector_str, desc) in selectors {
45            if let Ok(selector) = Selector::parse(selector_str) {
46                let count = document.select(&selector).count();
47                debug!("{}: Found {} matches", desc, count);
48            }
49        }
50
51        // Try to extract basic metadata
52        let og_title = self.extract_title(document);
53        let og_description = self.extract_description(document);
54        let og_image = self.extract_image(document);
55
56        debug!("Basic metadata extraction results:");
57        debug!("Title: {:?}", og_title);
58        debug!("Description: {:?}", og_description);
59        debug!("Image: {:?}", og_image);
60
61        // Return basic info even if specific tweet elements not found
62        Some(Preview {
63            url: url.to_string(),
64            title: og_title,
65            description: og_description,
66            image_url: og_image,
67            site_name: Some("X (formerly Twitter)".to_string()),
68            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
69        })
70    }
71
72    fn extract_generic_metadata(
73        &self,
74        document: &Html,
75        url: &str,
76    ) -> Result<Preview, PreviewError> {
77        let title = self.extract_title(document);
78        let description = self.extract_description(document);
79        let image_url = self.extract_image(document);
80        let favicon = self.extract_favicon(document);
81        let site_name = self.extract_site_name(document);
82
83        let host = utils::pickup_host_from_url(url)?;
84
85        let image_url = format_url(image_url, &host);
86
87        let favicon = format_url(favicon, &host);
88
89        Ok(Preview {
90            url: url.to_string(),
91            title,
92            description,
93            image_url,
94            favicon,
95            site_name,
96        })
97    }
98
99    fn extract_title(&self, document: &Html) -> Option<String> {
100        let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
101        let title_selector = Selector::parse("title").ok()?;
102
103        let og_title = document
104            .select(&og_title_selector)
105            .next()
106            .and_then(|el| el.value().attr("content"))
107            .map(|s| s.to_string());
108
109        // If there is no Open Graph title, try to get the regular title
110        og_title
111            .or_else(|| {
112                document
113                    .select(&title_selector)
114                    .next()
115                    .map(|el| el.inner_html())
116            })
117            .map(|s| s.trim().to_string())
118    }
119
120    fn extract_description(&self, document: &Html) -> Option<String> {
121        let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
122        let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
123
124        document
125            .select(&og_desc_selector)
126            .next()
127            .and_then(|el| el.value().attr("content"))
128            .or_else(|| {
129                document
130                    .select(&meta_desc_selector)
131                    .next()
132                    .and_then(|el| el.value().attr("content"))
133            })
134            .map(|s| s.trim().to_string())
135    }
136
137    fn extract_image(&self, document: &Html) -> Option<String> {
138        let og_image_selector =
139            Selector::parse("meta[property='og:image'],meta[itemprop='image']").ok()?;
140
141        document
142            .select(&og_image_selector)
143            .next()
144            .and_then(|el| el.value().attr("content"))
145            .map(|s| s.trim().to_string())
146    }
147
148    fn extract_favicon(&self, document: &Html) -> Option<String> {
149        let favicon_selector =
150            Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
151
152        document
153            .select(&favicon_selector)
154            .next()
155            .and_then(|el| el.value().attr("href"))
156            .map(|s| s.trim().to_string())
157    }
158
159    fn extract_site_name(&self, document: &Html) -> Option<String> {
160        let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
161
162        document
163            .select(&og_site_selector)
164            .next()
165            .and_then(|el| el.value().attr("content"))
166            .map(|s| s.trim().to_string())
167    }
168
169    /// Create a preview from oEmbed data.
170    ///
171    /// Takes oEmbed HTML content as a string and extracts relevant metadata to create a preview.
172    pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
173        let document = Html::parse_fragment(oembed);
174
175        let text_selector = Selector::parse("p").ok()?;
176        let link_selector = Selector::parse("a").ok()?;
177
178        let tweet_text = document
179            .select(&text_selector)
180            .next()
181            .map(|el| el.text().collect::<String>())
182            .map(|s| s.trim().to_string());
183
184        let image_link = document
185            .select(&link_selector)
186            .find(|a| {
187                a.value()
188                    .attr("href")
189                    .map(|href| href.contains("t.co"))
190                    .unwrap_or(false)
191            })
192            .and_then(|a| a.value().attr("href"))
193            .map(String::from);
194
195        let time = document
196            .select(&link_selector)
197            .last()
198            .map(|el| el.text().collect::<String>());
199
200        Some(Preview {
201            url: String::new(),
202            title: tweet_text.clone(),
203            description: Some(format!(
204                "{}{}",
205                tweet_text.unwrap_or_default(),
206                time.map(|t| format!(" (Posted: {})", t))
207                    .unwrap_or_default()
208            )),
209            image_url: image_link,
210            site_name: Some("X (formerly Twitter)".to_string()),
211            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
212        })
213    }
214}
215
/// Turns a possibly-relative reference into an absolute URL using `host` as the base.
///
/// `host` is expected to look like `scheme://authority` (for example
/// `https://example.com`); a trailing `/` on it is tolerated.
///
/// * `None`, or a reference that is empty after trimming, yields `None`.
/// * A reference that already carries a scheme (`https:`, `http:`, `data:`, …) is returned
///   trimmed but otherwise unchanged.
/// * A protocol-relative reference (`//cdn.example.com/a.png`) is given the scheme of `host`,
///   or `https` when `host` has none.
/// * Anything else is joined to `host` with exactly one `/`. Only the host is known here, so
///   path-relative references are resolved against the site root, not the page's directory.
fn format_url(url: Option<String>, host: &str) -> Option<String> {
    // RFC 3986 scheme syntax: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) followed by ':'.
    fn has_scheme(url: &str) -> bool {
        match url.split_once(':') {
            Some((scheme, _)) => {
                let mut chars = scheme.chars();
                matches!(chars.next(), Some(c) if c.is_ascii_alphabetic())
                    && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
            }
            None => false,
        }
    }

    let url = url?;
    let reference = url.trim();

    if reference.is_empty() {
        return None;
    }

    if has_scheme(reference) {
        return Some(reference.to_string());
    }

    // Protocol-relative: keep the authority from the reference, borrow the page's scheme.
    if let Some(rest) = reference.strip_prefix("//") {
        let scheme = host.split_once("://").map_or("https", |(scheme, _)| scheme);
        return Some(format!("{}://{}", scheme, rest));
    }

    // Normalise the slashes on both sides so the join never produces `//` or omits `/`.
    let base = host.trim_end_matches('/');
    let path = reference.trim_start_matches('/');
    Some(format!("{}/{}", base, path))
}