1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
use html5ever::parse_document;
use html5ever::tendril::TendrilSink;
use std::collections::HashMap;
use std::default::Default;
use std::io;
use std::path::Path;
use html5ever::driver::ParseOpts;
use html5ever::rcdom::RcDom;
use crate::opengraph::Opengraph;
use crate::parser::Parser;
use crate::schema_org::SchemaOrg;
/// Structured data extracted from a single HTML document.
///
/// Values are created through the constructors on this type, which start from
/// an all-empty value and let the crate's `Parser` fill the fields in while it
/// walks the DOM. With the `serde` feature enabled the type also derives
/// `Serialize` and `Deserialize`.
#[derive(Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct HTML {
/// Text of the document's `<title>` element, or `None` if none was found.
pub title: Option<String>,
/// Document description, or `None` if the document does not provide one.
/// NOTE(review): presumably taken from a description `<meta>` tag; confirm in `Parser`.
pub description: Option<String>,
/// URL passed in by the caller when the value was constructed.
/// NOTE(review): the parser receives `&mut HTML`, so it may overwrite this; confirm.
pub url: Option<String>,
/// Feed reference found in the document, if any.
/// NOTE(review): presumably an RSS/Atom `<link>` target; confirm in `Parser`.
pub feed: Option<String>,
/// Language of the document, if any.
/// NOTE(review): presumably the `lang` attribute of `<html>`; confirm in `Parser`.
pub language: Option<String>,
/// Text content collected from the document; empty until the parser fills it.
pub text_content: String,
/// String key/value metadata; empty until the parser fills it.
/// NOTE(review): presumably `<meta>` name -> content pairs; confirm in `Parser`.
pub meta: HashMap<String, String>,
/// Open Graph data for the document; starts out as `Opengraph::empty()`.
pub opengraph: Opengraph,
/// Schema.org items found in the document; starts out empty.
pub schema_org: Vec<SchemaOrg>,
}
impl HTML {
    /// Creates a value with every extracted field unset or empty.
    ///
    /// Only `url` is carried over from the caller; all other fields are left
    /// for the parser to fill in.
    fn empty(url: Option<String>) -> Self {
        Self {
            title: None,
            description: None,
            url,
            feed: None,
            language: None,
            text_content: String::new(),
            meta: HashMap::new(),
            opengraph: Opengraph::empty(),
            schema_org: Vec::new(),
        }
    }

    /// Builds an `HTML` value from an already parsed DOM.
    ///
    /// Starts from an empty value holding `url`, then runs the crate's
    /// `Parser` from the document root so it can populate the fields.
    pub fn from_dom(dom: RcDom, url: Option<String>) -> Self {
        let mut html = Self::empty(url);
        let parser = Parser::start(dom.document);
        parser.traverse(&mut html);
        html
    }

    /// Parses the file at `path` as a UTF-8 HTML document and extracts its data.
    ///
    /// `url` is stored on the result as the document's URL.
    ///
    /// # Errors
    ///
    /// Returns the underlying [`io::Error`] if the file cannot be read.
    pub fn from_file(path: &str, url: Option<String>) -> Result<Self, io::Error> {
        parse_document(RcDom::default(), ParseOpts::default())
            .from_utf8()
            .from_file(Path::new(path))
            // The conversion itself cannot fail, so `map` is enough here.
            .map(|dom| Self::from_dom(dom, url))
    }

    /// Parses `html` as a UTF-8 HTML document and extracts its data.
    ///
    /// `url` is stored on the result as the document's URL.
    ///
    /// # Errors
    ///
    /// Returns the [`io::Error`] reported while reading the input bytes.
    /// NOTE(review): the input is an in-memory byte slice, so a read failure
    /// is not expected in practice; the `Result` is kept for API stability.
    pub fn from_string(html: String, url: Option<String>) -> Result<Self, io::Error> {
        parse_document(RcDom::default(), ParseOpts::default())
            .from_utf8()
            .read_from(&mut html.as_bytes())
            // The conversion itself cannot fail, so `map` is enough here.
            .map(|dom| Self::from_dom(dom, url))
    }
}
#[cfg(test)]
mod tests {
    use super::HTML;

    /// Parsing an in-memory document yields its title and body text, and no
    /// description when the document does not declare one.
    #[test]
    fn from_string() {
        let markup = String::from("<html><head><title>Hello</title></head><body>Contents");

        let parsed = HTML::from_string(markup, None);
        assert!(parsed.is_ok());

        let page = parsed.unwrap();
        assert_eq!(page.title.as_deref(), Some("Hello"));
        assert!(page.description.is_none());
        assert_eq!(page.text_content, "Contents");
    }
}