url_preview/
extractor.rs

1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
/// Metadata extractor, responsible for extracting preview information from webpage content.
///
/// This is a unit struct: it carries no fields, so creating or cloning it costs nothing.
/// `Debug` is derived so the extractor can be logged and embedded in other
/// `#[derive(Debug)]` types.
#[derive(Debug, Clone)]
pub struct MetadataExtractor;
9
10impl Default for MetadataExtractor {
11    fn default() -> Self {
12        Self::new()
13    }
14}
15
16impl MetadataExtractor {
17    pub fn new() -> Self {
18        Self
19    }
20
21    pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
22        let document = Html::parse_document(html);
23        if is_twitter_url(url) {
24            if let Some(preview) = self.extract_twitter_metadata(&document, url) {
25                return Ok(preview);
26            }
27        }
28        // If not a Twitter URL or Twitter extraction failed, use generic extraction method
29        self.extract_generic_metadata(&document, url)
30    }
31
32    fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
33        let selectors = [
34            ("article[data-testid='tweet']", "Article selector"),
35            ("div[data-testid='tweetText']", "Text selector"),
36            ("div[data-testid='tweetPhoto'] img", "Image selector"),
37            ("div[data-testid='videoPlayer']", "Video selector"),
38            ("div[data-testid='User-Name']", "Username selector"),
39        ];
40
41        // Print matching results for all selectors
42        for (selector_str, desc) in selectors {
43            if let Ok(selector) = Selector::parse(selector_str) {
44                let count = document.select(&selector).count();
45                debug!("{}: Found {} matches", desc, count);
46            }
47        }
48
49        // Try to extract basic metadata
50        let og_title = self.extract_title(document);
51        let og_description = self.extract_description(document);
52        let og_image = self.extract_image(document);
53
54        debug!("Basic metadata extraction results:");
55        debug!("Title: {:?}", og_title);
56        debug!("Description: {:?}", og_description);
57        debug!("Image: {:?}", og_image);
58
59        // Return basic info even if specific tweet elements not found
60        Some(Preview {
61            url: url.to_string(),
62            title: og_title,
63            description: og_description,
64            image_url: og_image,
65            site_name: Some("X (formerly Twitter)".to_string()),
66            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
67        })
68    }
69
70    fn extract_generic_metadata(
71        &self,
72        document: &Html,
73        url: &str,
74    ) -> Result<Preview, PreviewError> {
75        let title = self.extract_title(document);
76        let description = self.extract_description(document);
77        let image_url = self.extract_image(document);
78        let favicon = self.extract_favicon(document);
79        let site_name = self.extract_site_name(document);
80
81        Ok(Preview {
82            url: url.to_string(),
83            title,
84            description,
85            image_url,
86            favicon,
87            site_name,
88        })
89    }
90
91    fn extract_title(&self, document: &Html) -> Option<String> {
92        let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
93        let title_selector = Selector::parse("title").ok()?;
94
95        let og_title = document
96            .select(&og_title_selector)
97            .next()
98            .and_then(|el| el.value().attr("content"))
99            .map(|s| s.to_string());
100
101        // If there is no Open Graph title, try to get the regular title
102        og_title
103            .or_else(|| {
104                document
105                    .select(&title_selector)
106                    .next()
107                    .map(|el| el.inner_html())
108            })
109            .map(|s| s.trim().to_string())
110    }
111
112    fn extract_description(&self, document: &Html) -> Option<String> {
113        let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
114        let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
115
116        document
117            .select(&og_desc_selector)
118            .next()
119            .and_then(|el| el.value().attr("content"))
120            .or_else(|| {
121                document
122                    .select(&meta_desc_selector)
123                    .next()
124                    .and_then(|el| el.value().attr("content"))
125            })
126            .map(|s| s.trim().to_string())
127    }
128
129    fn extract_image(&self, document: &Html) -> Option<String> {
130        let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;
131
132        document
133            .select(&og_image_selector)
134            .next()
135            .and_then(|el| el.value().attr("content"))
136            .map(|s| s.trim().to_string())
137    }
138
139    fn extract_favicon(&self, document: &Html) -> Option<String> {
140        let favicon_selector =
141            Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
142
143        document
144            .select(&favicon_selector)
145            .next()
146            .and_then(|el| el.value().attr("href"))
147            .map(|s| s.trim().to_string())
148    }
149
150    fn extract_site_name(&self, document: &Html) -> Option<String> {
151        let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
152
153        document
154            .select(&og_site_selector)
155            .next()
156            .and_then(|el| el.value().attr("content"))
157            .map(|s| s.trim().to_string())
158    }
159
160    /// Create a preview from oEmbed data.
161    ///
162    /// Takes oEmbed HTML content as a string and extracts relevant metadata to create a preview.
163    pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
164        let document = Html::parse_fragment(oembed);
165
166        let text_selector = Selector::parse("p").ok()?;
167        let link_selector = Selector::parse("a").ok()?;
168
169        let tweet_text = document
170            .select(&text_selector)
171            .next()
172            .map(|el| el.text().collect::<String>())
173            .map(|s| s.trim().to_string());
174
175        let image_link = document
176            .select(&link_selector)
177            .find(|a| {
178                a.value()
179                    .attr("href")
180                    .map(|href| href.contains("t.co"))
181                    .unwrap_or(false)
182            })
183            .and_then(|a| a.value().attr("href"))
184            .map(String::from);
185
186        let time = document
187            .select(&link_selector)
188            .last()
189            .map(|el| el.text().collect::<String>());
190
191        Some(Preview {
192            url: String::new(),
193            title: tweet_text.clone(),
194            description: Some(format!(
195                "{}{}",
196                tweet_text.unwrap_or_default(),
197                time.map(|t| format!(" (Posted: {})", t))
198                    .unwrap_or_default()
199            )),
200            image_url: image_link,
201            site_name: Some("X (formerly Twitter)".to_string()),
202            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
203        })
204    }
205}