opengraph_rs/
scraper.rs

1use error::Error;
2use std::io::Read;
3
4use html5ever::rcdom::NodeData::{
5    Comment, Doctype, Document, Element, ProcessingInstruction, Text,
6};
7use html5ever::rcdom::{Handle, RcDom};
8use html5ever::tendril::TendrilSink;
9use html5ever::{parse_document, Attribute};
10
11#[cfg(feature = "reqwest")]
12use reqwest;
13#[cfg(feature = "reqwest")]
14use std::time::Duration;
15
16use Audio;
17use Image;
18use Object;
19use Video;
20
/// Options selecting which media elements are collected in addition to the
/// Open Graph `<meta>` properties.
///
/// Every flag defaults to `false`.
#[derive(Debug, Clone, Default)]
pub struct Opts {
    /// Collect `<img>` elements that carry a `src` attribute.
    pub include_images: bool,
    /// Collect `<audio>` elements that carry a `src` attribute.
    pub include_audios: bool,
    /// Collect `<video>` elements that carry a `src` attribute.
    pub include_videos: bool,
}
36
/// Fetches `url` over HTTP and extracts the Open Graph object from the
/// response body.
///
/// The request is sent with a 30-second timeout. After extraction, each image
/// of the resulting object has `normalize` called with the URL reported by
/// the response.
///
/// # Errors
///
/// Propagates failures from building the client, sending the request and
/// from `extract`. Returns `Error::Unexpected` when the response status is
/// not a success status.
#[cfg(feature = "reqwest")]
pub fn scrape(url: &str, option: Opts) -> Result<Object, Error> {
    let client = reqwest::Client::builder()
        .timeout(Duration::new(30, 0))
        .build()?;
    let mut res = client.get(url).send()?;
    if !res.status().is_success() {
        return Err(Error::Unexpected);
    }

    let mut obj = extract(&mut res, option)?;
    // Normalize in place; there is no need to clone every image into a new
    // vector just to mutate it.
    for image in obj.images.iter_mut() {
        image.normalize(&res.url());
    }
    Ok(obj)
}
60
61pub fn extract<R>(input: &mut R, option: Opts) -> Result<Object, Error>
62where
63    R: Read,
64{
65    let dom = parse_document(RcDom::default(), Default::default())
66        .from_utf8()
67        .read_from(input)
68        .unwrap();
69    let mut og_props = Vec::new();
70    let mut images = Vec::new();
71    let mut audios = Vec::new();
72    let mut videos = Vec::new();
73    walk(
74        dom.document,
75        &mut og_props,
76        &mut images,
77        &mut audios,
78        &mut videos,
79        &option,
80    );
81    let mut obj = Object::new(&og_props);
82    obj.images.append(&mut images);
83    obj.audios.append(&mut audios);
84    obj.videos.append(&mut videos);
85    Ok(obj)
86}
87
88fn walk(
89    handle: Handle,
90    og_props: &mut Vec<(String, String)>,
91    images: &mut Vec<Image>,
92    audios: &mut Vec<Audio>,
93    videos: &mut Vec<Video>,
94    option: &Opts,
95) {
96    match handle.data {
97        Document => (),
98        Doctype { .. } => (),
99        Text { .. } => (),
100        Comment { .. } => (),
101        Element {
102            ref name,
103            ref attrs,
104            ..
105        } => {
106            let tag_name = name.local.as_ref();
107            match tag_name {
108                "meta" => {
109                    let mut ps = extract_open_graph_from_meta_tag(&attrs.borrow());
110                    og_props.append(&mut ps);
111                }
112                "img" => {
113                    if option.include_images {
114                        if let Some(image) = extract_image(&attrs.borrow()) {
115                            images.push(image);
116                        }
117                    }
118                }
119                "audio" => {
120                    if option.include_audios {
121                        if let Some(audio) = extract_audio(&attrs.borrow()) {
122                            audios.push(audio);
123                        }
124                    }
125                }
126                "video" => {
127                    if option.include_videos {
128                        if let Some(video) = extract_video(&attrs.borrow()) {
129                            videos.push(video);
130                        }
131                    }
132                }
133                _ => (),
134            }
135        }
136        ProcessingInstruction { .. } => unreachable!(),
137    }
138    for child in handle.children.borrow().iter() {
139        walk(child.clone(), og_props, images, audios, videos, option)
140    }
141}
142
143fn attr(attr_name: &str, attrs: &Vec<Attribute>) -> Option<String> {
144    for attr in attrs.iter() {
145        if attr.name.local.as_ref() == attr_name {
146            return Some(attr.value.to_string());
147        }
148    }
149    None
150}
151
152pub fn extract_open_graph_from_meta_tag(attrs: &Vec<Attribute>) -> Vec<(String, String)> {
153    let mut og_props = vec![];
154    match extract_open_graph_prop("property", attrs) {
155        Some((key, content)) => og_props.push((key, content)),
156        None => (),
157    }
158    match extract_open_graph_prop("name", attrs) {
159        Some((key, content)) => og_props.push((key, content)),
160        None => (),
161    }
162    og_props
163}
164
165fn extract_open_graph_prop<'a>(
166    attr_name: &str,
167    attrs: &Vec<Attribute>,
168) -> Option<(String, String)> {
169    attr(attr_name, attrs).and_then(|property| {
170        if property.starts_with("og:") {
171            let end = property.chars().count();
172            let key = unsafe { property.get_unchecked(3..end) }.to_string();
173            attr("content", attrs).map(|content| (key, content))
174        } else {
175            None
176        }
177    })
178}
179
180pub fn extract_image(attrs: &Vec<Attribute>) -> Option<Image> {
181    attr("src", attrs).map(|src| Image::new(src.to_string()))
182}
183
184pub fn extract_audio(attrs: &Vec<Attribute>) -> Option<Audio> {
185    attr("src", attrs).map(|src| Audio::new(src.to_string()))
186}
187
188pub fn extract_video(attrs: &Vec<Attribute>) -> Option<Video> {
189    attr("src", attrs).map(|src| Video::new(src.to_string()))
190}
191
#[cfg(test)]
mod test {
    use super::*;
    use object::ObjectType;

    /// Extracts the Open Graph object from a small document and checks the
    /// title, type, URL and the single image it declares.
    #[test]
    fn extract_open_graph_object() {
        let html = r#"
<html prefix="og: http://ogp.me/ns#">
<head>
<title>The Rock (1996)</title>
<meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" />
<meta property="og:url" content="http://www.imdb.com/title/tt0117500/" />
<meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" />
</head>
</html>
                "#;
        // A byte slice implements `Read`, so the literal can be parsed directly.
        let mut reader = html.as_bytes();
        let parsed = extract(&mut reader, Opts::default());
        assert!(parsed.is_ok());

        let object = parsed.unwrap();
        assert_eq!(&object.title, "The Rock");
        assert_eq!(object.obj_type, ObjectType::Movie);
        assert_eq!(&object.url, "http://www.imdb.com/title/tt0117500/");
        assert_eq!(object.images.len(), 1);
        assert_eq!(
            &object.images[0].url,
            "http://ia.media-imdb.com/images/rock.jpg"
        );
    }
}