url_preview/
extractor.rs

1use super::is_twitter_url;
2use crate::{Preview, PreviewError};
3use scraper::{Html, Selector};
4use tracing::debug;
5
/// Extracts link-preview metadata (title, description, image, favicon, site name)
/// from HTML content.
///
/// The type is a stateless unit struct: it has no fields, so creating, cloning,
/// or defaulting it does no work.
#[derive(Clone, Debug, Default)]
pub struct MetadataExtractor;
9
10impl MetadataExtractor {
11    pub fn new() -> Self {
12        Self
13    }
14
15    pub fn extract(&self, html: &str, url: &str) -> Result<Preview, PreviewError> {
16        let document = Html::parse_document(html);
17        if is_twitter_url(url) {
18            if let Some(preview) = self.extract_twitter_metadata(&document, url) {
19                return Ok(preview);
20            }
21        }
22        // If not a Twitter URL or Twitter extraction failed, use generic extraction method
23        self.extract_generic_metadata(&document, url)
24    }
25
26    fn extract_twitter_metadata(&self, document: &Html, url: &str) -> Option<Preview> {
27        let selectors = [
28            ("article[data-testid='tweet']", "Article selector"),
29            ("div[data-testid='tweetText']", "Text selector"),
30            ("div[data-testid='tweetPhoto'] img", "Image selector"),
31            ("div[data-testid='videoPlayer']", "Video selector"),
32            ("div[data-testid='User-Name']", "Username selector"),
33        ];
34
35        // Print matching results for all selectors
36        for (selector_str, desc) in selectors {
37            if let Ok(selector) = Selector::parse(selector_str) {
38                let count = document.select(&selector).count();
39                debug!("{}: Found {} matches", desc, count);
40            }
41        }
42
43        // Try to extract basic metadata
44        let og_title = self.extract_title(document);
45        let og_description = self.extract_description(document);
46        let og_image = self.extract_image(document);
47
48        debug!("Basic metadata extraction results:");
49        debug!("Title: {:?}", og_title);
50        debug!("Description: {:?}", og_description);
51        debug!("Image: {:?}", og_image);
52
53        // Return basic info even if specific tweet elements not found
54        Some(Preview {
55            url: url.to_string(),
56            title: og_title,
57            description: og_description,
58            image_url: og_image,
59            site_name: Some("X (formerly Twitter)".to_string()),
60            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
61        })
62    }
63
64    fn extract_generic_metadata(
65        &self,
66        document: &Html,
67        url: &str,
68    ) -> Result<Preview, PreviewError> {
69        let title = self.extract_title(document);
70        let description = self.extract_description(document);
71        let image_url = self.extract_image(document);
72        let favicon = self.extract_favicon(document);
73        let site_name = self.extract_site_name(document);
74
75        Ok(Preview {
76            url: url.to_string(),
77            title,
78            description,
79            image_url,
80            favicon,
81            site_name,
82        })
83    }
84
85    fn extract_title(&self, document: &Html) -> Option<String> {
86        let og_title_selector = Selector::parse("meta[property='og:title']").ok()?;
87        let title_selector = Selector::parse("title").ok()?;
88
89        let og_title = document
90            .select(&og_title_selector)
91            .next()
92            .and_then(|el| el.value().attr("content"))
93            .map(|s| s.to_string());
94
95        // If there is no Open Graph title, try to get the regular title
96        og_title
97            .or_else(|| {
98                document
99                    .select(&title_selector)
100                    .next()
101                    .map(|el| el.inner_html())
102            })
103            .map(|s| s.trim().to_string())
104    }
105
106    fn extract_description(&self, document: &Html) -> Option<String> {
107        let og_desc_selector = Selector::parse("meta[property='og:description']").ok()?;
108        let meta_desc_selector = Selector::parse("meta[name='description']").ok()?;
109
110        document
111            .select(&og_desc_selector)
112            .next()
113            .and_then(|el| el.value().attr("content"))
114            .or_else(|| {
115                document
116                    .select(&meta_desc_selector)
117                    .next()
118                    .and_then(|el| el.value().attr("content"))
119            })
120            .map(|s| s.trim().to_string())
121    }
122
123    fn extract_image(&self, document: &Html) -> Option<String> {
124        let og_image_selector = Selector::parse("meta[property='og:image']").ok()?;
125
126        document
127            .select(&og_image_selector)
128            .next()
129            .and_then(|el| el.value().attr("content"))
130            .map(|s| s.trim().to_string())
131    }
132
133    fn extract_favicon(&self, document: &Html) -> Option<String> {
134        let favicon_selector =
135            Selector::parse("link[rel='icon'], link[rel='shortcut icon']").ok()?;
136
137        document
138            .select(&favicon_selector)
139            .next()
140            .and_then(|el| el.value().attr("href"))
141            .map(|s| s.trim().to_string())
142    }
143
144    fn extract_site_name(&self, document: &Html) -> Option<String> {
145        let og_site_selector = Selector::parse("meta[property='og:site_name']").ok()?;
146
147        document
148            .select(&og_site_selector)
149            .next()
150            .and_then(|el| el.value().attr("content"))
151            .map(|s| s.trim().to_string())
152    }
153
154    /// Create a preview from oEmbed data.
155    ///
156    /// Takes oEmbed HTML content as a string and extracts relevant metadata to create a preview.
157    pub fn extract_from_oembed(&self, oembed: &str) -> Option<Preview> {
158        let document = Html::parse_fragment(oembed);
159
160        let text_selector = Selector::parse("p").ok()?;
161        let link_selector = Selector::parse("a").ok()?;
162
163        let tweet_text = document
164            .select(&text_selector)
165            .next()
166            .map(|el| el.text().collect::<String>())
167            .map(|s| s.trim().to_string());
168
169        let image_link = document
170            .select(&link_selector)
171            .find(|a| {
172                a.value()
173                    .attr("href")
174                    .map(|href| href.contains("t.co"))
175                    .unwrap_or(false)
176            })
177            .and_then(|a| a.value().attr("href"))
178            .map(String::from);
179
180        let time = document
181            .select(&link_selector)
182            .last()
183            .map(|el| el.text().collect::<String>());
184
185        Some(Preview {
186            url: String::new(),
187            title: tweet_text.clone(),
188            description: Some(format!(
189                "{}{}",
190                tweet_text.unwrap_or_default(),
191                time.map(|t| format!(" (Posted: {})", t))
192                    .unwrap_or_default()
193            )),
194            image_url: image_link,
195            site_name: Some("X (formerly Twitter)".to_string()),
196            favicon: Some("https://abs.twimg.com/favicons/twitter.ico".to_string()),
197        })
198    }
199}