1use error::Error;
2use std::io::Read;
3
4use html5ever::rcdom::NodeData::{
5 Comment, Doctype, Document, Element, ProcessingInstruction, Text,
6};
7use html5ever::rcdom::{Handle, RcDom};
8use html5ever::tendril::TendrilSink;
9use html5ever::{parse_document, Attribute};
10
11#[cfg(feature = "reqwest")]
12use reqwest;
13#[cfg(feature = "reqwest")]
14use std::time::Duration;
15
16use Audio;
17use Image;
18use Object;
19use Video;
20
/// Switches controlling which media elements are collected from the HTML
/// body in addition to the Open Graph `<meta>` properties.
///
/// Every switch is `false` by default, so `Opts::default()` collects Open
/// Graph metadata only.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Opts {
    /// Collect the `src` of `<img>` elements.
    pub include_images: bool,
    /// Collect the `src` of `<audio>` elements.
    pub include_audios: bool,
    /// Collect the `src` of `<video>` elements.
    pub include_videos: bool,
}
36
/// Fetches `url` over HTTP and extracts an Open Graph [`Object`] from the
/// response body.
///
/// The request is sent through a client configured with a 30 second timeout.
/// After extraction every image of the object is passed to
/// `Image::normalize` together with the response URL.
/// NOTE(review): presumably this resolves relative image URLs against the
/// page URL; confirm against `Image::normalize`.
///
/// # Errors
///
/// * Errors from building the client or sending the request are propagated
///   through `?`.
/// * `Error::Unexpected` is returned when the response status is not a
///   success status.
/// * Any error returned by [`extract`] is propagated unchanged.
#[cfg(feature = "reqwest")]
pub fn scrape(url: &str, option: Opts) -> Result<Object, Error> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .build()?;
    let mut res = client.get(url).send()?;
    if !res.status().is_success() {
        return Err(Error::Unexpected);
    }

    let mut obj = extract(&mut res, option)?;
    // Normalize in place: no need to clone every image and rebuild the vector.
    for image in obj.images.iter_mut() {
        image.normalize(&res.url());
    }
    Ok(obj)
}
60
61pub fn extract<R>(input: &mut R, option: Opts) -> Result<Object, Error>
62where
63 R: Read,
64{
65 let dom = parse_document(RcDom::default(), Default::default())
66 .from_utf8()
67 .read_from(input)
68 .unwrap();
69 let mut og_props = Vec::new();
70 let mut images = Vec::new();
71 let mut audios = Vec::new();
72 let mut videos = Vec::new();
73 walk(
74 dom.document,
75 &mut og_props,
76 &mut images,
77 &mut audios,
78 &mut videos,
79 &option,
80 );
81 let mut obj = Object::new(&og_props);
82 obj.images.append(&mut images);
83 obj.audios.append(&mut audios);
84 obj.videos.append(&mut videos);
85 Ok(obj)
86}
87
88fn walk(
89 handle: Handle,
90 og_props: &mut Vec<(String, String)>,
91 images: &mut Vec<Image>,
92 audios: &mut Vec<Audio>,
93 videos: &mut Vec<Video>,
94 option: &Opts,
95) {
96 match handle.data {
97 Document => (),
98 Doctype { .. } => (),
99 Text { .. } => (),
100 Comment { .. } => (),
101 Element {
102 ref name,
103 ref attrs,
104 ..
105 } => {
106 let tag_name = name.local.as_ref();
107 match tag_name {
108 "meta" => {
109 let mut ps = extract_open_graph_from_meta_tag(&attrs.borrow());
110 og_props.append(&mut ps);
111 }
112 "img" => {
113 if option.include_images {
114 if let Some(image) = extract_image(&attrs.borrow()) {
115 images.push(image);
116 }
117 }
118 }
119 "audio" => {
120 if option.include_audios {
121 if let Some(audio) = extract_audio(&attrs.borrow()) {
122 audios.push(audio);
123 }
124 }
125 }
126 "video" => {
127 if option.include_videos {
128 if let Some(video) = extract_video(&attrs.borrow()) {
129 videos.push(video);
130 }
131 }
132 }
133 _ => (),
134 }
135 }
136 ProcessingInstruction { .. } => unreachable!(),
137 }
138 for child in handle.children.borrow().iter() {
139 walk(child.clone(), og_props, images, audios, videos, option)
140 }
141}
142
143fn attr(attr_name: &str, attrs: &Vec<Attribute>) -> Option<String> {
144 for attr in attrs.iter() {
145 if attr.name.local.as_ref() == attr_name {
146 return Some(attr.value.to_string());
147 }
148 }
149 None
150}
151
152pub fn extract_open_graph_from_meta_tag(attrs: &Vec<Attribute>) -> Vec<(String, String)> {
153 let mut og_props = vec![];
154 match extract_open_graph_prop("property", attrs) {
155 Some((key, content)) => og_props.push((key, content)),
156 None => (),
157 }
158 match extract_open_graph_prop("name", attrs) {
159 Some((key, content)) => og_props.push((key, content)),
160 None => (),
161 }
162 og_props
163}
164
165fn extract_open_graph_prop<'a>(
166 attr_name: &str,
167 attrs: &Vec<Attribute>,
168) -> Option<(String, String)> {
169 attr(attr_name, attrs).and_then(|property| {
170 if property.starts_with("og:") {
171 let end = property.chars().count();
172 let key = unsafe { property.get_unchecked(3..end) }.to_string();
173 attr("content", attrs).map(|content| (key, content))
174 } else {
175 None
176 }
177 })
178}
179
180pub fn extract_image(attrs: &Vec<Attribute>) -> Option<Image> {
181 attr("src", attrs).map(|src| Image::new(src.to_string()))
182}
183
184pub fn extract_audio(attrs: &Vec<Attribute>) -> Option<Audio> {
185 attr("src", attrs).map(|src| Audio::new(src.to_string()))
186}
187
188pub fn extract_video(attrs: &Vec<Attribute>) -> Option<Video> {
189 attr("src", attrs).map(|src| Video::new(src.to_string()))
190}
191
#[cfg(test)]
mod test {
    use super::*;
    use object::ObjectType;

    /// Parses the reference example page and checks that the title, type,
    /// URL and image declared through `og:` meta tags end up on the object.
    #[test]
    fn extract_open_graph_object() {
        let html = r#"
<html prefix="og: http://ogp.me/ns#">
<head>
<title>The Rock (1996)</title>
<meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" />
<meta property="og:url" content="http://www.imdb.com/title/tt0117500/" />
<meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" />
</head>
</html>
        "#;
        // A byte slice is itself a reader, so it can be fed to `extract`
        // directly.
        let mut input = html.as_bytes();
        let result = extract(&mut input, Opts::default());
        assert!(result.is_ok());

        let object = result.unwrap();
        assert_eq!(&object.title, "The Rock");
        assert_eq!(object.obj_type, ObjectType::Movie);
        assert_eq!(&object.url, "http://www.imdb.com/title/tt0117500/");
        assert_eq!(object.images.len(), 1);
        assert_eq!(
            &object.images[0].url,
            "http://ia.media-imdb.com/images/rock.jpg"
        );
    }
}