metascraper/
lib.rs

1//! MetaScraper
2//!
3//! `metascraper` is on [Crates.io][crate] and [GitHub][github].
4//!
5//! [crate]: https://crates.io/crates/metascraper
6//! [github]: https://github.com/mehmetcansahin/metascraper
7//!
8//! # Examples
9//!
10//! ## Parsing a document
11//!
12//! ```
13//! use metascraper::MetaScraper;
14//!
15//! let input = include_str!("test.html");
16//! let metascraper = MetaScraper::parse(input).unwrap();
17//! let metadata = metascraper.metadata();
18//! ```
19
20use tl::{ParseError, ParserOptions, VDom};
21
/// A single `<meta>` tag reduced to a name/content pair.
///
/// Derives `Clone`, `PartialEq` and `Eq` so collected tags can be copied and
/// compared directly (for example in assertions).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Metatag {
    /// Identifier of the tag, taken from whichever naming attribute it carries.
    pub name: String,
    /// Value associated with `name`.
    pub content: String,
}
27
28#[derive(Debug)]
29pub struct MetaData {
30    pub title: Option<String>,
31    pub description: Option<String>,
32    pub canonical: Option<String>,
33    pub language: Option<String>,
34    pub rss: Option<String>,
35    pub image: Option<String>,
36    pub amp: Option<String>,
37    pub author: Option<String>,
38    pub date: Option<String>,
39    pub metatags: Option<Vec<Metatag>>,
40}
41
42pub struct MetaScraper<'a> {
43    dom: VDom<'a>,
44}
45
46impl MetaScraper<'_> {
47    /// Parse input
48    pub fn parse(input: &str) -> Result<MetaScraper, ParseError> {
49        match tl::parse(input, ParserOptions::default()) {
50            Ok(dom) => Ok(MetaScraper { dom }),
51            Err(err) => Err(err),
52        }
53    }
54
55    /// Returns the inner text of the given selector.
56    pub fn inner_text(&self, selector: &str) -> Option<String> {
57        self.dom
58            .query_selector(selector)
59            .and_then(|mut iter| iter.next())
60            .and_then(|node_handle| node_handle.get(self.dom.parser()))
61            .map(|node| node.inner_text(self.dom.parser()).to_string())
62    }
63
64    /// Returns the value of the given attribute of the given selector.
65    pub fn attribute(&self, selector: &str, attr: &str) -> Option<String> {
66        self.dom
67            .query_selector(selector)
68            .and_then(|mut iter| iter.next())
69            .and_then(|node_handle| node_handle.get(self.dom.parser()))
70            .and_then(|node| node.as_tag())
71            .and_then(|html_tag| html_tag.attributes().get(attr).flatten())
72            .map(|bytes| bytes.as_utf8_str().to_string())
73    }
74
75    /// Metatags return in vector.
76    pub fn metatags(&self) -> Option<Vec<Metatag>> {
77        let mut metatags: Vec<Metatag> = Vec::new();
78        let query_sellector_iter = self.dom.query_selector(r#"meta"#)?;
79        for node_handle in query_sellector_iter {
80            let node = node_handle.get(self.dom.parser())?;
81            if let Some(tag) = node.as_tag() {
82                let name = tag
83                    .attributes()
84                    .get("name")
85                    .or_else(|| tag.attributes().get("property"))
86                    .or_else(|| tag.attributes().get("itemprop"))
87                    .or_else(|| tag.attributes().get("http-equiv"))
88                    .flatten()
89                    .map(|x| x.as_utf8_str().to_string());
90
91                let content = tag
92                    .attributes()
93                    .get("content")
94                    .or_else(|| tag.attributes().get("description"))
95                    .flatten()
96                    .map(|x| x.as_utf8_str().to_string());
97
98                if name.is_some() && content.is_some() {
99                    let nt = Metatag {
100                        name: name?,
101                        content: content?,
102                    };
103                    metatags.push(nt);
104                }
105            }
106        }
107        Some(metatags)
108    }
109
110    /// Returns the rss
111    pub fn rss(&self) -> Option<String> {
112        self.attribute(r#"link[type*=rss]"#, "href")
113            .or_else(|| self.attribute("link[type*=atom]", "href"))
114            .or_else(|| self.attribute("meta[property*=feed]", "href"))
115            .or_else(|| self.attribute("meta[property*=atom]", "href"))
116    }
117
118    /// Returns the title
119    pub fn title(&self) -> Option<String> {
120        self.inner_text("title")
121            .or_else(|| self.attribute("meta[property*=title]", "content"))
122            .or_else(|| self.inner_text(".post-title"))
123            .or_else(|| self.inner_text(".entry-title"))
124            .or_else(|| self.inner_text("h1[class*=title] a"))
125            .or_else(|| self.inner_text("h1[class*=title]"))
126    }
127
128    /// Returns the description
129    pub fn description(&self) -> Option<String> {
130        self.attribute(r#"meta[name*=description]"#, "content")
131            .or_else(|| self.attribute("meta[property*=description]", "content"))
132            .or_else(|| self.attribute("meta[itemprop*=description]", "content"))
133            .or_else(|| self.attribute("meta[description]", "description"))
134            .or_else(|| self.inner_text("p[id=description]"))
135    }
136
137    /// Returns the canonical
138    pub fn canonical(&self) -> Option<String> {
139        self.attribute("link[rel=canonical]", "href")
140            .or_else(|| self.attribute("meta[property*=url]", "content"))
141            .or_else(|| self.attribute("meta[name*=url]", "content"))
142            .or_else(|| self.attribute("link[rel=alternate][hreflang*=default]", "href"))
143    }
144
145    /// Returns the language
146    pub fn language(&self) -> Option<String> {
147        self.attribute("html", "lang")
148            .or_else(|| self.attribute("meta[itemprop=inLanguage]", "content"))
149            .or_else(|| self.attribute("meta[property*=locale]", "content"))
150    }
151
152    /// Returns the image
153    pub fn image(&self) -> Option<String> {
154        self.attribute("meta[property=og:image]", "content")
155            .or_else(|| self.attribute("meta[name*=image]", "content"))
156            .or_else(|| self.attribute("meta[itemprop*=image]", "content"))
157            .or_else(|| self.attribute("article img[src]", "src"))
158            .or_else(|| self.attribute("#content img[src]", "src"))
159            .or_else(|| self.attribute("img[alt*=author]", "src"))
160            .or_else(|| self.attribute("img[src]:not([aria-hidden=true])", "src"))
161    }
162
163    /// Returns the amp
164    pub fn amp(&self) -> Option<String> {
165        self.attribute("link[rel=amphtml]", "href")
166    }
167
168    /// Returns the author
169    pub fn author(&self) -> Option<String> {
170        self.attribute("meta[name*=author]", "content")
171            .or_else(|| self.attribute("meta[property*=author]", "content"))
172            .or_else(|| self.attribute("meta[itemprop*=author]", "content"))
173    }
174
175    /// Returns the date
176    pub fn date(&self) -> Option<String> {
177        self.attribute("meta[property*=updated_time]", "content")
178            .or_else(|| self.attribute("meta[property*=modified_time]", "content"))
179            .or_else(|| self.attribute("meta[property*=published_time]", "content"))
180            .or_else(|| self.attribute("meta[property*=release_date]", "content"))
181            .or_else(|| self.attribute("meta[itemprop*=datemodified]", "content"))
182            .or_else(|| self.attribute("meta[itemprop*=date]", "datetime"))
183            .or_else(|| self.attribute("meta[name*=date]", "content"))
184            .or_else(|| self.inner_text(".byline"))
185            .or_else(|| self.inner_text(".dateline"))
186            .or_else(|| self.inner_text(".date"))
187            .or_else(|| self.inner_text("#date"))
188            .or_else(|| self.inner_text(".publish"))
189            .or_else(|| self.inner_text("#publish"))
190            .or_else(|| self.inner_text(".post-timestamp"))
191            .or_else(|| self.inner_text("#post-timestamp"))
192            .or_else(|| self.inner_text(".time"))
193            .or_else(|| self.inner_text("#time"))
194    }
195
196    /// Returns the metadata
197    pub fn metadata(&self) -> MetaData {
198        MetaData {
199            title: self.title(),
200            description: self.description(),
201            canonical: self.canonical(),
202            language: self.language(),
203            rss: self.rss(),
204            metatags: self.metatags(),
205            image: self.image(),
206            amp: self.amp(),
207            author: self.author(),
208            date: self.date(),
209        }
210    }
211}
212
#[cfg(test)]
mod tests {
    use crate::MetaScraper;

    /// Extracts metadata from the bundled `test.html` fixture.
    #[test]
    fn test_page() {
        let input = include_str!("test.html");
        let metascraper = MetaScraper::parse(input).expect("fixture should parse");
        let metadata = metascraper.metadata();
        assert_eq!(metadata.title.as_deref(), Some("Title"));
        assert_eq!(metadata.language.as_deref(), Some("en"));
        assert_eq!(metadata.description.as_deref(), Some("Description"));
        assert_eq!(
            metadata.canonical.as_deref(),
            Some("https://mehmetcan.sahin.dev")
        );
        assert_eq!(metadata.rss.as_deref(), Some("rss.xml"));
    }

    /// Extracts metadata from a live page.
    ///
    /// Ignored by default because it needs network access and its assertions
    /// depend on third-party content that can change at any time; run it
    /// explicitly with `cargo test -- --ignored`.
    #[test]
    #[ignore = "requires network access and depends on the live content of www.w3.org"]
    fn test_web_page() {
        let input = reqwest::blocking::get("https://www.w3.org/")
            .expect("request to www.w3.org should succeed")
            .text()
            .expect("response body should be readable as text");

        let metascraper = MetaScraper::parse(&input).expect("downloaded page should parse");
        let metadata = metascraper.metadata();
        assert_eq!(
            metadata.title.as_deref(),
            Some("World Wide Web Consortium (W3C)")
        );
        assert_eq!(metadata.language.as_deref(), Some("en"));
        assert_eq!(
            metadata.description.as_deref(),
            Some("The World Wide Web Consortium (W3C) is an international community where Member organizations, a full-time staff, and the public work together to develop Web standards.")
        );
        assert_eq!(metadata.rss.as_deref(), Some("/blog/news/feed/atom"));
    }
}