url_preview/
extractor.rs

1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
6use crate::utils;
7
/// Stateless extractor that turns fetched webpage HTML into link-preview metadata.
///
/// The type has no fields, so cloning it is free and a single value can be reused for
/// any number of extractions.
#[derive(Debug, Clone)]
pub struct MetadataExtractor;
11
12impl Default for MetadataExtractor {
13    fn default() -> Self {
14        Self::new()
15    }
16}
17
18impl MetadataExtractor {
19    pub fn new() -> Self {
20        Self
21    }
22
23    pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
24        let document = Html::parse_document(html);
25        if is_twitter_url(url) {
26            if let Some(preview) = self.extract_twitter_metadata(&document, url) {
27                return Ok(preview);
28            }
29        }
30        // If not a Twitter URL or Twitter extraction failed, use generic extraction method
31        self.extract_generic_metadata(&document, url)
32    }
33
34    fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
35        let selectors = [
36            ("article[data-testid='tweet']", "Article selector"),
37            ("div[data-testid='tweetText']", "Text selector"),
38            ("div[data-testid='tweetPhoto'] img", "Image selector"),
39            ("div[data-testid='videoPlayer']", "Video selector"),
40            ("div[data-testid='User-Name']", "Username selector"),
41        ];
42
43        // Print matching results for all selectors
44        for (selector_str, desc) in selectors {
45            if let Ok(selector) = Selector::parse(selector_str) {
46                let count = document.select(&selector).count();
47                debug!("{}: Found {} matches", desc, count);
48            }
49        }
50
51        // Try to extract basic metadata
52        let og_title = self.extract_title(document);
53        let og_description = self.extract_description(document);
54        let og_image = self.extract_image(document);
55
56        debug!("Basic metadata extraction results:");
57        debug!("Title: {:?}", og_title);
58        debug!("Description: {:?}", og_description);
59        debug!("Image: {:?}", og_image);
60
61        // Return basic info even if specific tweet elements not found
62        Some(Preview {
63            url: url.to_string(),
64            title: og_title,
65            description: og_description,
66            image_url: og_image,
67            site_name: Some("X (formerly Twitter)".to_string()),
68            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
69        })
70    }
71
72    fn extract_generic_metadata(
73        &self,
74        document: &Html,
75        url: &str,
76    ) -> Result<Preview, PreviewError> {
77        let title = self.extract_title(document);
78        let description = self.extract_description(document);
79        let image_url = self.extract_image(document);
80        let favicon = self.extract_favicon(document);
81        let site_name = self.extract_site_name(document);
82
83        let host = utils::pickup_host_from_url(url)?;
84
85        let image_url = if let Some(url) = image_url {
86            if !url.starts_with(&host) {
87                Some(format!("{}{}", host, url))
88            } else {
89                Some(url)
90            }
91        } else {
92            None
93        };
94        let favicon = if let Some(url) = favicon {
95            if !url.starts_with(&host) {
96                Some(format!("{}{}", host, url))
97            } else {
98                Some(url)
99            }
100        } else {
101            None
102        };
103
104        Ok(Preview {
105            url: url.to_string(),
106            title,
107            description,
108            image_url,
109            favicon,
110            site_name,
111        })
112    }
113
114    fn extract_title(&self, document: &Html) -> Option<String> {
115        let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
116        let title_selector = Selector::parse("title").ok()?;
117
118        let og_title = document
119            .select(&og_title_selector)
120            .next()
121            .and_then(|el| el.value().attr("content"))
122            .map(|s| s.to_string());
123
124        // If there is no Open Graph title, try to get the regular title
125        og_title
126            .or_else(|| {
127                document
128                    .select(&title_selector)
129                    .next()
130                    .map(|el| el.inner_html())
131            })
132            .map(|s| s.trim().to_string())
133    }
134
135    fn extract_description(&self, document: &Html) -> Option<String> {
136        let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
137        let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
138
139        document
140            .select(&og_desc_selector)
141            .next()
142            .and_then(|el| el.value().attr("content"))
143            .or_else(|| {
144                document
145                    .select(&meta_desc_selector)
146                    .next()
147                    .and_then(|el| el.value().attr("content"))
148            })
149            .map(|s| s.trim().to_string())
150    }
151
152    fn extract_image(&self, document: &Html) -> Option<String> {
153        let og_image_selector =
154            Selector::parse("meta[property='og:image'],meta[itemprop='image']").ok()?;
155
156        document
157            .select(&og_image_selector)
158            .next()
159            .and_then(|el| el.value().attr("content"))
160            .map(|s| s.trim().to_string())
161    }
162
163    fn extract_favicon(&self, document: &Html) -> Option<String> {
164        let favicon_selector =
165            Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
166
167        document
168            .select(&favicon_selector)
169            .next()
170            .and_then(|el| el.value().attr("href"))
171            .map(|s| s.trim().to_string())
172    }
173
174    fn extract_site_name(&self, document: &Html) -> Option<String> {
175        let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
176
177        document
178            .select(&og_site_selector)
179            .next()
180            .and_then(|el| el.value().attr("content"))
181            .map(|s| s.trim().to_string())
182    }
183
184    /// Create a preview from oEmbed data.
185    ///
186    /// Takes oEmbed HTML content as a string and extracts relevant metadata to create a preview.
187    pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
188        let document = Html::parse_fragment(oembed);
189
190        let text_selector = Selector::parse("p").ok()?;
191        let link_selector = Selector::parse("a").ok()?;
192
193        let tweet_text = document
194            .select(&text_selector)
195            .next()
196            .map(|el| el.text().collect::<String>())
197            .map(|s| s.trim().to_string());
198
199        let image_link = document
200            .select(&link_selector)
201            .find(|a| {
202                a.value()
203                    .attr("href")
204                    .map(|href| href.contains("t.co"))
205                    .unwrap_or(false)
206            })
207            .and_then(|a| a.value().attr("href"))
208            .map(String::from);
209
210        let time = document
211            .select(&link_selector)
212            .last()
213            .map(|el| el.text().collect::<String>());
214
215        Some(Preview {
216            url: String::new(),
217            title: tweet_text.clone(),
218            description: Some(format!(
219                "{}{}",
220                tweet_text.unwrap_or_default(),
221                time.map(|t| format!(" (Posted: {})", t))
222                    .unwrap_or_default()
223            )),
224            image_url: image_link,
225            site_name: Some("X (formerly Twitter)".to_string()),
226            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
227        })
228    }
229}