1use tl::{ParseError, ParserOptions, VDom};
21
/// A single `<meta>` entry reduced to a name/content pair.
///
/// `name` holds the value of whichever identifying attribute the scraper
/// found on the tag and `content` holds the associated value; both are owned
/// copies, so a `Metatag` does not borrow from the parsed document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Metatag {
    /// Identifier of the meta entry.
    pub name: String,
    /// Value associated with `name`.
    pub content: String,
}
27
28#[derive(Debug)]
29pub struct MetaData {
30 pub title: Option<String>,
31 pub description: Option<String>,
32 pub canonical: Option<String>,
33 pub language: Option<String>,
34 pub rss: Option<String>,
35 pub image: Option<String>,
36 pub amp: Option<String>,
37 pub author: Option<String>,
38 pub date: Option<String>,
39 pub metatags: Option<Vec<Metatag>>,
40}
41
42pub struct MetaScraper<'a> {
43 dom: VDom<'a>,
44}
45
46impl MetaScraper<'_> {
47 pub fn parse(input: &str) -> Result<MetaScraper, ParseError> {
49 match tl::parse(input, ParserOptions::default()) {
50 Ok(dom) => Ok(MetaScraper { dom }),
51 Err(err) => Err(err),
52 }
53 }
54
55 pub fn inner_text(&self, selector: &str) -> Option<String> {
57 self.dom
58 .query_selector(selector)
59 .and_then(|mut iter| iter.next())
60 .and_then(|node_handle| node_handle.get(self.dom.parser()))
61 .map(|node| node.inner_text(self.dom.parser()).to_string())
62 }
63
64 pub fn attribute(&self, selector: &str, attr: &str) -> Option<String> {
66 self.dom
67 .query_selector(selector)
68 .and_then(|mut iter| iter.next())
69 .and_then(|node_handle| node_handle.get(self.dom.parser()))
70 .and_then(|node| node.as_tag())
71 .and_then(|html_tag| html_tag.attributes().get(attr).flatten())
72 .map(|bytes| bytes.as_utf8_str().to_string())
73 }
74
75 pub fn metatags(&self) -> Option<Vec<Metatag>> {
77 let mut metatags: Vec<Metatag> = Vec::new();
78 let query_sellector_iter = self.dom.query_selector(r#"meta"#)?;
79 for node_handle in query_sellector_iter {
80 let node = node_handle.get(self.dom.parser())?;
81 if let Some(tag) = node.as_tag() {
82 let name = tag
83 .attributes()
84 .get("name")
85 .or_else(|| tag.attributes().get("property"))
86 .or_else(|| tag.attributes().get("itemprop"))
87 .or_else(|| tag.attributes().get("http-equiv"))
88 .flatten()
89 .map(|x| x.as_utf8_str().to_string());
90
91 let content = tag
92 .attributes()
93 .get("content")
94 .or_else(|| tag.attributes().get("description"))
95 .flatten()
96 .map(|x| x.as_utf8_str().to_string());
97
98 if name.is_some() && content.is_some() {
99 let nt = Metatag {
100 name: name?,
101 content: content?,
102 };
103 metatags.push(nt);
104 }
105 }
106 }
107 Some(metatags)
108 }
109
110 pub fn rss(&self) -> Option<String> {
112 self.attribute(r#"link[type*=rss]"#, "href")
113 .or_else(|| self.attribute("link[type*=atom]", "href"))
114 .or_else(|| self.attribute("meta[property*=feed]", "href"))
115 .or_else(|| self.attribute("meta[property*=atom]", "href"))
116 }
117
118 pub fn title(&self) -> Option<String> {
120 self.inner_text("title")
121 .or_else(|| self.attribute("meta[property*=title]", "content"))
122 .or_else(|| self.inner_text(".post-title"))
123 .or_else(|| self.inner_text(".entry-title"))
124 .or_else(|| self.inner_text("h1[class*=title] a"))
125 .or_else(|| self.inner_text("h1[class*=title]"))
126 }
127
128 pub fn description(&self) -> Option<String> {
130 self.attribute(r#"meta[name*=description]"#, "content")
131 .or_else(|| self.attribute("meta[property*=description]", "content"))
132 .or_else(|| self.attribute("meta[itemprop*=description]", "content"))
133 .or_else(|| self.attribute("meta[description]", "description"))
134 .or_else(|| self.inner_text("p[id=description]"))
135 }
136
137 pub fn canonical(&self) -> Option<String> {
139 self.attribute("link[rel=canonical]", "href")
140 .or_else(|| self.attribute("meta[property*=url]", "content"))
141 .or_else(|| self.attribute("meta[name*=url]", "content"))
142 .or_else(|| self.attribute("link[rel=alternate][hreflang*=default]", "href"))
143 }
144
145 pub fn language(&self) -> Option<String> {
147 self.attribute("html", "lang")
148 .or_else(|| self.attribute("meta[itemprop=inLanguage]", "content"))
149 .or_else(|| self.attribute("meta[property*=locale]", "content"))
150 }
151
152 pub fn image(&self) -> Option<String> {
154 self.attribute("meta[property=og:image]", "content")
155 .or_else(|| self.attribute("meta[name*=image]", "content"))
156 .or_else(|| self.attribute("meta[itemprop*=image]", "content"))
157 .or_else(|| self.attribute("article img[src]", "src"))
158 .or_else(|| self.attribute("#content img[src]", "src"))
159 .or_else(|| self.attribute("img[alt*=author]", "src"))
160 .or_else(|| self.attribute("img[src]:not([aria-hidden=true])", "src"))
161 }
162
163 pub fn amp(&self) -> Option<String> {
165 self.attribute("link[rel=amphtml]", "href")
166 }
167
168 pub fn author(&self) -> Option<String> {
170 self.attribute("meta[name*=author]", "content")
171 .or_else(|| self.attribute("meta[property*=author]", "content"))
172 .or_else(|| self.attribute("meta[itemprop*=author]", "content"))
173 }
174
175 pub fn date(&self) -> Option<String> {
177 self.attribute("meta[property*=updated_time]", "content")
178 .or_else(|| self.attribute("meta[property*=modified_time]", "content"))
179 .or_else(|| self.attribute("meta[property*=published_time]", "content"))
180 .or_else(|| self.attribute("meta[property*=release_date]", "content"))
181 .or_else(|| self.attribute("meta[itemprop*=datemodified]", "content"))
182 .or_else(|| self.attribute("meta[itemprop*=date]", "datetime"))
183 .or_else(|| self.attribute("meta[name*=date]", "content"))
184 .or_else(|| self.inner_text(".byline"))
185 .or_else(|| self.inner_text(".dateline"))
186 .or_else(|| self.inner_text(".date"))
187 .or_else(|| self.inner_text("#date"))
188 .or_else(|| self.inner_text(".publish"))
189 .or_else(|| self.inner_text("#publish"))
190 .or_else(|| self.inner_text(".post-timestamp"))
191 .or_else(|| self.inner_text("#post-timestamp"))
192 .or_else(|| self.inner_text(".time"))
193 .or_else(|| self.inner_text("#time"))
194 }
195
196 pub fn metadata(&self) -> MetaData {
198 MetaData {
199 title: self.title(),
200 description: self.description(),
201 canonical: self.canonical(),
202 language: self.language(),
203 rss: self.rss(),
204 metatags: self.metatags(),
205 image: self.image(),
206 amp: self.amp(),
207 author: self.author(),
208 date: self.date(),
209 }
210 }
211}
212
#[cfg(test)]
mod tests {
    use crate::MetaScraper;

    /// Scrapes the bundled `test.html` fixture and checks the core fields.
    #[test]
    fn test_page() {
        let html = include_str!("test.html");
        let scraper = MetaScraper::parse(html).unwrap();
        let meta = scraper.metadata();

        assert_eq!(meta.title.as_deref(), Some("Title"));
        assert_eq!(meta.language.as_deref(), Some("en"));
        assert_eq!(meta.description.as_deref(), Some("Description"));
        assert_eq!(
            meta.canonical.as_deref(),
            Some("https://mehmetcan.sahin.dev")
        );
        assert_eq!(meta.rss.as_deref(), Some("rss.xml"));
    }

    /// Scrapes a live page; needs network access and depends on the current
    /// content served at that address.
    #[test]
    fn test_web_page() {
        let response = reqwest::blocking::get("https://www.w3.org/").unwrap();
        let body = response.text().unwrap();

        let scraper = MetaScraper::parse(&body).unwrap();
        let meta = scraper.metadata();

        assert_eq!(
            meta.title.as_deref(),
            Some("World Wide Web Consortium (W3C)")
        );
        assert_eq!(meta.language.as_deref(), Some("en"));
        assert_eq!(
            meta.description.as_deref(),
            Some("The World Wide Web Consortium (W3C) is an international community where Member organizations, a full-time staff, and the public work together to develop Web standards.")
        );
        assert_eq!(meta.rss.as_deref(), Some("/blog/news/feed/atom"));
    }
}