1pub struct MetaScraper {
2 document: scraper::Html,
3}
4
5impl MetaScraper {
6 pub fn new(html: &str) -> Self {
7 let document = scraper::Html::parse_document(html);
8
9 MetaScraper { document }
10 }
11
12 pub fn extract_title(&self) -> Option<String> {
17 self.document
18 .select(&scraper::Selector::parse("title").unwrap())
19 .next()
20 .map(|element| element.text().collect::<String>())
21 }
22
23 pub fn extract_og_title(&self) -> Option<String> {
28 let og_title_selector = scraper::Selector::parse("meta[property='og:title']").unwrap();
29
30 let og_title = self
31 .document
32 .select(&og_title_selector)
33 .next()
34 .and_then(|element| {
35 element
36 .value()
37 .attr("content")
38 .map(|content| content.to_string())
39 });
40
41 og_title
42 }
43
44 pub fn extract_twitter_title(&self) -> Option<String> {
49 let twitter_title_selector =
50 scraper::Selector::parse("meta[name='twitter:title']").unwrap();
51
52 let twitter_title = self
53 .document
54 .select(&twitter_title_selector)
55 .next()
56 .and_then(|element| {
57 element
58 .value()
59 .attr("content")
60 .map(|content| content.to_string())
61 });
62
63 twitter_title
64 }
65
66 pub fn title(&self) -> Option<String> {
75 self.extract_og_title()
76 .or_else(|| self.extract_twitter_title())
77 .or_else(|| self.extract_title())
78 }
79
80 pub fn extract_description(&self) -> Option<String> {
85 let description_selector = scraper::Selector::parse("meta[name='description']").unwrap();
86
87 let description = self
88 .document
89 .select(&description_selector)
90 .next()
91 .and_then(|element| {
92 element
93 .value()
94 .attr("content")
95 .map(|content| content.to_string())
96 });
97
98 description
99 }
100
101 pub fn extract_og_description(&self) -> Option<String> {
106 let og_description_selector =
107 scraper::Selector::parse("meta[property='og:description']").unwrap();
108
109 let og_description = self
110 .document
111 .select(&og_description_selector)
112 .next()
113 .and_then(|element| {
114 element
115 .value()
116 .attr("content")
117 .map(|content| content.to_string())
118 });
119
120 og_description
121 }
122
123 pub fn extract_twitter_description(&self) -> Option<String> {
128 let twitter_description_selector =
129 scraper::Selector::parse("meta[name='twitter:description']").unwrap();
130
131 let twitter_description = self
132 .document
133 .select(&twitter_description_selector)
134 .next()
135 .and_then(|element| {
136 element
137 .value()
138 .attr("content")
139 .map(|content| content.to_string())
140 });
141
142 twitter_description
143 }
144
145 pub fn description(&self) -> Option<String> {
154 self.extract_og_description()
155 .or_else(|| self.extract_twitter_description())
156 .or_else(|| self.extract_description())
157 }
158
159 pub fn favicon(&self) -> Option<String> {
164 let favicon_selector = scraper::Selector::parse("link[rel='icon']").unwrap();
165
166 let favicon = self
167 .document
168 .select(&favicon_selector)
169 .next()
170 .and_then(|element| element.value().attr("href").map(|href| href.to_string()));
171
172 favicon
173 }
174
175 pub fn extract_og_image(&self) -> Option<String> {
180 let og_image_selector = scraper::Selector::parse("meta[property='og:image']").unwrap();
181
182 let og_image = self
183 .document
184 .select(&og_image_selector)
185 .next()
186 .and_then(|element| {
187 element
188 .value()
189 .attr("content")
190 .map(|content| content.to_string())
191 });
192
193 og_image
194 }
195
196 pub fn extract_og_images(&self) -> Vec<String> {
202 let og_image_selector = scraper::Selector::parse("meta[property='og:image']").unwrap();
203
204 let og_images = self
205 .document
206 .select(&og_image_selector)
207 .into_iter()
208 .filter_map(|element| {
209 element
210 .value()
211 .attr("content")
212 .map(|content| content.to_string())
213 })
214 .collect::<Vec<String>>();
215
216 og_images
217 }
218
219 pub fn extract_twitter_image(&self) -> Option<String> {
225 let twitter_image_selector =
226 scraper::Selector::parse("meta[name='twitter:image']").unwrap();
227
228 let twitter_image = self
229 .document
230 .select(&twitter_image_selector)
231 .next()
232 .and_then(|element| {
233 element
234 .value()
235 .attr("content")
236 .map(|content| content.to_string())
237 });
238
239 twitter_image
240 }
241
242 pub fn image(&self) -> Option<String> {
250 self.extract_og_image()
251 .or_else(|| self.extract_twitter_image())
252 }
253
254 pub fn lang(&self) -> Option<String> {
261 let html_selector = scraper::Selector::parse("html").unwrap();
262
263 let lang = self
264 .document
265 .select(&html_selector)
266 .next()
267 .and_then(|element| {
268 element
269 .value()
270 .attr("lang")
271 .map(|content| content.to_string())
272 });
273
274 lang
275 }
276}
277
#[cfg(test)]
mod test {
    use super::*;

    // --- individual extractors -------------------------------------------

    #[test]
    fn extract_title() {
        let scraper = MetaScraper::new(r#"<title>Page Title</title>"#);

        let title = scraper.extract_title();

        assert_eq!(title, Some("Page Title".to_string()));
    }

    #[test]
    fn extract_og_title() {
        let scraper = MetaScraper::new(r#"<meta property="og:title" content="Page Title" />"#);

        let og_title = scraper.extract_og_title();

        assert_eq!(og_title, Some("Page Title".to_string()));
    }

    #[test]
    fn extract_twitter_title() {
        let scraper = MetaScraper::new(r#"<meta name="twitter:title" content="Page Title" />"#);

        let twitter_title = scraper.extract_twitter_title();

        assert_eq!(twitter_title, Some("Page Title".to_string()));
    }

    #[test]
    fn extract_description() {
        let scraper = MetaScraper::new(r#"<meta name="description" content="My Description" />"#);

        let description = scraper.extract_description();

        assert_eq!(description, Some("My Description".to_string()));
    }

    #[test]
    fn extract_og_description() {
        let scraper =
            MetaScraper::new(r#"<meta property="og:description" content="My Description" />"#);

        let og_description = scraper.extract_og_description();

        assert_eq!(og_description, Some("My Description".to_string()));
    }

    #[test]
    fn extract_twitter_description() {
        let scraper =
            MetaScraper::new(r#"<meta name="twitter:description" content="My Description" />"#);

        let twitter_description = scraper.extract_twitter_description();

        assert_eq!(twitter_description, Some("My Description".to_string()));
    }

    #[test]
    fn favicon() {
        let scraper = MetaScraper::new(r#"<link rel="icon" href="/favicon.ico" />"#);

        let favicon = scraper.favicon();

        assert_eq!(favicon, Some("/favicon.ico".to_string()));
    }

    #[test]
    fn extract_og_image() {
        let scraper = MetaScraper::new(
            r#"<meta property="og:image" content="https://example.com/image.jpg" />"#,
        );

        let og_image = scraper.extract_og_image();

        assert_eq!(og_image, Some("https://example.com/image.jpg".to_string()));
    }

    #[test]
    fn extract_og_images() {
        let scraper = MetaScraper::new(
            r#"
            <meta property="og:image" content="https://example.com/image.jpg" />
            <meta property="og:image" content="https://example.com/image.png" />"#,
        );

        let og_images = scraper.extract_og_images();

        assert_eq!(
            og_images,
            vec![
                "https://example.com/image.jpg".to_string(),
                "https://example.com/image.png".to_string()
            ]
        );
    }

    #[test]
    fn extract_twitter_image() {
        let scraper = MetaScraper::new(
            r#"<meta name="twitter:image" content="https://example.com/image.jpg" />"#,
        );

        let twitter_image = scraper.extract_twitter_image();

        assert_eq!(
            twitter_image,
            Some("https://example.com/image.jpg".to_string())
        );
    }

    #[test]
    fn lang() {
        let scraper = MetaScraper::new(
            r#"
            <html lang="en">
                ...
            </html>
        "#,
        );

        let lang = scraper.lang();

        assert_eq!(lang, Some("en".to_owned()));
    }

    // --- fallback accessors ----------------------------------------------

    #[test]
    fn title_prefers_og_then_twitter_then_title_tag() {
        let all_sources = MetaScraper::new(
            r#"
            <title>Plain</title>
            <meta name="twitter:title" content="Twitter" />
            <meta property="og:title" content="Open Graph" />"#,
        );
        assert_eq!(all_sources.title(), Some("Open Graph".to_string()));

        let without_og = MetaScraper::new(
            r#"
            <title>Plain</title>
            <meta name="twitter:title" content="Twitter" />"#,
        );
        assert_eq!(without_og.title(), Some("Twitter".to_string()));

        let title_tag_only = MetaScraper::new(r#"<title>Plain</title>"#);
        assert_eq!(title_tag_only.title(), Some("Plain".to_string()));
    }

    #[test]
    fn description_prefers_og_then_twitter_then_plain() {
        let all_sources = MetaScraper::new(
            r#"
            <meta name="description" content="Plain" />
            <meta name="twitter:description" content="Twitter" />
            <meta property="og:description" content="Open Graph" />"#,
        );
        assert_eq!(all_sources.description(), Some("Open Graph".to_string()));

        let without_og = MetaScraper::new(
            r#"
            <meta name="description" content="Plain" />
            <meta name="twitter:description" content="Twitter" />"#,
        );
        assert_eq!(without_og.description(), Some("Twitter".to_string()));

        let plain_only = MetaScraper::new(r#"<meta name="description" content="Plain" />"#);
        assert_eq!(plain_only.description(), Some("Plain".to_string()));
    }

    #[test]
    fn image_prefers_og_then_twitter() {
        let both = MetaScraper::new(
            r#"
            <meta name="twitter:image" content="https://example.com/twitter.jpg" />
            <meta property="og:image" content="https://example.com/og.jpg" />"#,
        );
        assert_eq!(both.image(), Some("https://example.com/og.jpg".to_string()));

        let twitter_only = MetaScraper::new(
            r#"<meta name="twitter:image" content="https://example.com/twitter.jpg" />"#,
        );
        assert_eq!(
            twitter_only.image(),
            Some("https://example.com/twitter.jpg".to_string())
        );
    }

    // --- absent metadata -------------------------------------------------

    #[test]
    fn missing_metadata_yields_none() {
        let scraper = MetaScraper::new(r#"<p>hello</p>"#);

        assert_eq!(scraper.title(), None);
        assert_eq!(scraper.description(), None);
        assert_eq!(scraper.image(), None);
        assert_eq!(scraper.favicon(), None);
        assert_eq!(scraper.lang(), None);
        assert!(scraper.extract_og_images().is_empty());
    }
}