link_preview/
preview.rs

1use std::str::FromStr;
2use std::string::FromUtf8Error;
3
4use scraper::Html;
5use thiserror::Error;
6use url::Url;
7
8#[cfg(feature = "serde")]
9use serde::{Deserialize, Serialize};
10
11use crate::html::{find_link, find_meta_tag, first_inner_html};
12use crate::providers::og::{find_og_tag, OpenGraphTag};
13use crate::providers::schema::{find_schema_tag, SchemaMetaTag};
14use crate::providers::twitter::{find_twitter_tag, TwitterMetaTag};
15
16#[derive(Error, Debug)]
17pub enum Error {
18    #[error("The provided byte slice contains invalid UTF-8 characters")]
19    InvalidUtf8(FromUtf8Error),
20}
21
22/// Represents a link preview, which contains metadata about a web page
23#[derive(Clone, Debug)]
24#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
25pub struct LinkPreview {
26    pub title: Option<String>,
27    pub description: Option<String>,
28    pub domain: Option<String>,
29    pub image_url: Option<Url>,
30}
31
32impl LinkPreview {
33    /// Retrieves the `String` representation of `image_url` `Url` instance
34    pub fn image_url_str(&self) -> Option<String> {
35        if let Some(image_url) = self.image_url.clone() {
36            return Some(image_url.to_string());
37        }
38
39        None
40    }
41
42    /// Attempts to find the description of the page in the following order:
43    ///
44    /// - Document's `<link rel="canonical" /> element's `href` attribute
45    /// - OpenGraphTag's image meta tag (`og:image`)
46    pub fn find_first_domain(html: &Html) -> Option<String> {
47        if let Some(domain) = find_link(html, "canonical") {
48            return LinkPreview::domain_from_string(domain);
49        }
50
51        if let Some(domain) = find_og_tag(html, OpenGraphTag::Url) {
52            return LinkPreview::domain_from_string(domain);
53        }
54
55        None
56    }
57
58    /// Attempts to parse a `Url` from a `String` and then attempts to retrieve
59    /// the `domain` fragment from such `Url` instance.
60    ///
61    /// If either the `Url` is invalid or theres no a domain fragment available
62    /// (the provided `Url` points to an IP instead of a domain), `None` is
63    /// returned.
64    fn domain_from_string(value: String) -> Option<String> {
65        let url = Url::parse(&value).ok()?;
66
67        url.domain().map(|domain| domain.to_string())
68    }
69
70    /// Attempts to find the description of the page in the following order:
71    ///
72    /// - OpenGraphTag's image meta tag (`og:image`)
73    /// - Document's `<link rel="image_url" /> element's `href` attribute
74    /// - Twitter Card's image meta tag (`twitter:image`)
75    /// - Schema.org image meta tag (`image`)
76    pub fn find_first_image_url(html: &Html) -> Option<Url> {
77        if let Some(image_url) = find_og_tag(html, OpenGraphTag::Image) {
78            return Url::parse(&image_url).ok();
79        }
80
81        if let Some(image_url) = find_link(html, "image_src") {
82            return Url::parse(&image_url).ok();
83        }
84
85        if let Some(image_url) = find_schema_tag(html, SchemaMetaTag::Image) {
86            return Url::parse(&image_url).ok();
87        }
88
89        if let Some(image_url) = find_twitter_tag(html, TwitterMetaTag::Image) {
90            return Url::parse(&image_url).ok();
91        }
92
93        None
94    }
95
96    /// Attempts to find the description of the page in the following order:
97    ///
98    /// - OpenGraphTag's description meta tag (`og:description`)
99    /// - Twitter Card's description meta tag (`twitter:description`)
100    /// - Schema.org description meta tag (`description`)
101    /// - Description meta tag (`description`)
102    /// - The first `p` element from the document
103    pub fn find_first_description(html: &Html) -> Option<String> {
104        if let Some(description) = find_og_tag(html, OpenGraphTag::Description) {
105            return Some(description);
106        }
107
108        if let Some(description) = find_twitter_tag(html, TwitterMetaTag::Description) {
109            return Some(description);
110        }
111
112        if let Some(description) = find_schema_tag(html, SchemaMetaTag::Description) {
113            return Some(description);
114        }
115
116        if let Some(description) = find_meta_tag(html, "description") {
117            return Some(description);
118        }
119
120        if let Some(description) = first_inner_html(html, "p") {
121            return Some(description);
122        }
123
124        None
125    }
126
127    /// Attempts to find the title of the page in the following order:
128    ///
129    /// - OpenGraphTag's title meta tag (`og:title`)
130    /// - Twitter Card's title meta tag (`twitter:title`)
131    /// - Schema.org title meta tag (`title`)
132    /// - The HTML's document title
133    /// - The first `<h1>` tag in the document
134    /// - The first `<h2>` tag in the document
135    pub fn find_first_title(html: &Html) -> Option<String> {
136        if let Some(title) = find_og_tag(html, OpenGraphTag::Title) {
137            return Some(title);
138        }
139
140        if let Some(title) = find_twitter_tag(html, TwitterMetaTag::Title) {
141            return Some(title);
142        }
143
144        if let Some(title) = find_schema_tag(html, SchemaMetaTag::Name) {
145            return Some(title);
146        }
147
148        if let Some(title) = first_inner_html(html, "title") {
149            return Some(title);
150        }
151
152        if let Some(title) = first_inner_html(html, "h1") {
153            return Some(title);
154        }
155
156        if let Some(title) = first_inner_html(html, "h2") {
157            return Some(title);
158        }
159
160        None
161    }
162}
163
164impl From<Html> for LinkPreview {
165    fn from(html: Html) -> Self {
166        let image_url: Option<Url> = LinkPreview::find_first_image_url(&html);
167        let domain = LinkPreview::find_first_domain(&html);
168
169        LinkPreview {
170            title: LinkPreview::find_first_title(&html),
171            description: LinkPreview::find_first_description(&html),
172            domain,
173            image_url,
174        }
175    }
176}
177
178impl From<&Html> for LinkPreview {
179    fn from(html: &Html) -> Self {
180        let image_url: Option<Url> = LinkPreview::find_first_image_url(html);
181        let domain: Option<String> = LinkPreview::find_first_domain(html);
182
183        LinkPreview {
184            title: LinkPreview::find_first_title(html),
185            description: LinkPreview::find_first_description(html),
186            domain,
187            image_url,
188        }
189    }
190}
191
192impl FromStr for LinkPreview {
193    type Err = Error;
194
195    fn from_str(html: &str) -> Result<Self, Self::Err> {
196        let html = Html::parse_document(html);
197        let image_url: Option<Url> = LinkPreview::find_first_image_url(&html);
198        let domain: Option<String> = LinkPreview::find_first_domain(&html);
199
200        Ok(LinkPreview {
201            title: LinkPreview::find_first_title(&html),
202            description: LinkPreview::find_first_description(&html),
203            domain,
204            image_url,
205        })
206    }
207}
208
209/// Attempts to convert a HTML document byte slice into a HTML string instance
210/// and then parses the document into a `Html` instance
211pub fn html_from_bytes(value: &[u8]) -> Result<Html, Error> {
212    let utf8 = String::from_utf8(value.to_vec()).map_err(Error::InvalidUtf8)?;
213
214    Ok(Html::parse_document(utf8.as_str()))
215}
216
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use crate::html_from_bytes;
    use crate::tests::FULL_FEATURED_HTML;

    use super::LinkPreview;

    // Values expected to be extracted from the `FULL_FEATURED_HTML` fixture.
    const EXPECTED_TITLE: &str = "SEO Strategies for a better web";
    const EXPECTED_DESCRIPTION: &str = "John Appleseed tells you his secrets on SEO for a better web experience by taking advantage of OpenGraph\'s Tags!";
    const EXPECTED_IMAGE_URL: &str =
        "https://www.apple.com/ac/structured-data/images/open_graph_logo.png?201809210816";
    const EXPECTED_DOMAIN: &str = "en.wikipedia.com";

    /// Asserts that every field of `preview` matches the fixture's values.
    fn assert_full_featured_preview(preview: &LinkPreview) {
        assert_eq!(preview.title.as_deref(), Some(EXPECTED_TITLE));
        assert_eq!(preview.description.as_deref(), Some(EXPECTED_DESCRIPTION));
        assert_eq!(
            preview
                .image_url
                .as_ref()
                .map(|url| url.to_string())
                .as_deref(),
            Some(EXPECTED_IMAGE_URL)
        );
        assert_eq!(preview.domain.as_deref(), Some(EXPECTED_DOMAIN));
    }

    #[test]
    fn creates_instance_of_link_preview_from_html_instance() {
        let document = html_from_bytes(FULL_FEATURED_HTML).unwrap();

        assert_full_featured_preview(&LinkPreview::from(&document));
    }

    #[test]
    fn creates_instance_of_link_preview_from_str_instance() {
        let raw = String::from_utf8(FULL_FEATURED_HTML.to_vec()).unwrap();
        let preview = LinkPreview::from_str(&raw).unwrap();

        assert_full_featured_preview(&preview);
    }

    #[test]
    fn finds_first_title() {
        let document = html_from_bytes(FULL_FEATURED_HTML).unwrap();
        let title = LinkPreview::find_first_title(&document);

        assert_eq!(title.as_deref(), Some(EXPECTED_TITLE));
    }

    #[test]
    fn finds_first_description() {
        let document = html_from_bytes(FULL_FEATURED_HTML).unwrap();
        let description = LinkPreview::find_first_description(&document);

        assert_eq!(description.as_deref(), Some(EXPECTED_DESCRIPTION));
    }

    #[test]
    fn finds_first_image_url() {
        let document = html_from_bytes(FULL_FEATURED_HTML).unwrap();
        let image_url = LinkPreview::find_first_image_url(&document).map(|url| url.to_string());

        assert_eq!(image_url.as_deref(), Some(EXPECTED_IMAGE_URL));
    }

    #[test]
    fn finds_first_domain() {
        let document = html_from_bytes(FULL_FEATURED_HTML).unwrap();
        let domain = LinkPreview::find_first_domain(&document);

        assert_eq!(domain.as_deref(), Some(EXPECTED_DOMAIN));
    }
}