opengraph 0.2.4

Parses HTML and extracts Open Graph protocol markup.
Documentation
use std::io::Read;
use error::Error;

use html5ever::rcdom::NodeData::{
    Document,
    Doctype,
    Text,
    Comment,
    Element,
    ProcessingInstruction
};
use html5ever::rcdom::{RcDom, Handle};
use html5ever::{parse_document, Attribute};
use html5ever::tendril::TendrilSink;

#[cfg(feature = "reqwest")]
use std::time::Duration;
#[cfg(feature = "reqwest")]
use reqwest;

use Object;
use Image;
use Audio;
use Video;

/// Options selecting which media elements are collected from the document
/// in addition to the Open Graph `<meta>` properties.
///
/// Every flag defaults to `false`, so `Opts::default()` collects only the
/// Open Graph properties.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Opts {
    /// Collect `<img>` elements that carry a `src` attribute.
    pub include_images: bool,
    /// Collect `<audio>` elements that carry a `src` attribute.
    pub include_audios: bool,
    /// Collect `<video>` elements that carry a `src` attribute.
    pub include_videos: bool,
}

/// Downloads `url` and extracts an `Object` from the response body.
///
/// The request is sent through a client configured with a 30 second timeout.
/// A response whose status is not a success yields `Err(Error::Unexpected)`;
/// client construction and request errors are propagated through `?`.
///
/// After extraction every collected image is passed through
/// `Image::normalize` together with the response URL.
/// NOTE(review): presumably this resolves relative image URLs against the
/// final response URL — confirm in `Image::normalize`.
#[cfg(feature = "reqwest")]
pub fn scrape(url: &str, option: Opts) -> Result<Object, Error> {
    let client = reqwest::Client::builder()
        .timeout(Duration::from_secs(30))
        .build()?;
    let mut response = client.get(url).send()?;

    if !response.status().is_success() {
        return Err(Error::Unexpected);
    }

    let mut object = extract(&mut response, option)?;
    for image in object.images.iter_mut() {
        image.normalize(&response.url());
    }
    Ok(object)
}

pub fn extract<R>(input: &mut R, option: Opts) -> Result<Object, Error> where R: Read {
    let dom = parse_document(RcDom::default(), Default::default())
        .from_utf8()
        .read_from(input)
        .unwrap();
    let mut og_props = Vec::new();
    let mut images   = Vec::new();
    let mut audios   = Vec::new();
    let mut videos   = Vec::new();
    walk(dom.document,
         &mut og_props,
         &mut images,
         &mut audios,
         &mut videos,
         &option);
    let mut obj = Object::new(&og_props);
    obj.images.append(&mut images);
    obj.audios.append(&mut audios);
    obj.videos.append(&mut videos);
    Ok(obj)
}

/// Visits every node of the tree rooted at `handle` in document order
/// (a node first, then its children from first to last) and collects:
///
/// * Open Graph `(key, content)` pairs from `<meta>` elements into `og_props`;
/// * `<img>`, `<audio>` and `<video>` elements into `images`, `audios` and
///   `videos`, each only when the matching flag in `option` is set.
///
/// Nodes that are not elements contribute nothing.
fn walk(handle:    Handle,
        og_props:  &mut Vec<(String, String)>,
        images:    &mut Vec<Image>,
        audios:    &mut Vec<Audio>,
        videos:    &mut Vec<Video>,
        option:    &Opts) {
    // An explicit stack replaces recursion so that a pathologically deep
    // document cannot exhaust the call stack.
    let mut pending = vec![handle];
    while let Some(node) = pending.pop() {
        // Only elements are interesting; every other node kind (including
        // processing instructions) is skipped rather than treated as a bug.
        if let Element { ref name, ref attrs, .. } = node.data {
            let tag_name: &str = name.local.as_ref();
            match tag_name {
                "meta" => {
                    og_props.extend(extract_open_graph_from_meta_tag(&attrs.borrow()));
                },
                "img" if option.include_images => {
                    images.extend(extract_image(&attrs.borrow()));
                },
                "audio" if option.include_audios => {
                    audios.extend(extract_audio(&attrs.borrow()));
                },
                "video" if option.include_videos => {
                    videos.extend(extract_video(&attrs.borrow()));
                },
                _ => (),
            }
        }
        // Children are pushed in reverse so the first child is popped next,
        // which keeps the traversal in document order.
        pending.extend(node.children.borrow().iter().rev().cloned());
    }
}

/// Returns an owned copy of the value of the first attribute in `attrs`
/// whose local name equals `attr_name`, or `None` when there is no such
/// attribute.
fn attr(attr_name: &str, attrs: &[Attribute]) -> Option<String> {
    attrs.iter()
        .find(|candidate| candidate.name.local.as_ref() == attr_name)
        .map(|found| found.value.to_string())
}

/// Collects the Open Graph `(key, content)` pairs declared by one `<meta>`
/// element's attributes.
///
/// The `property` attribute is inspected first and the `name` attribute
/// second, so the result holds zero, one or two pairs in that order.
pub fn extract_open_graph_from_meta_tag(attrs: &Vec<Attribute>) -> Vec<(String, String)> {
    ["property", "name"]
        .iter()
        .filter_map(|&source| extract_open_graph_prop(source, attrs))
        .collect()
}

/// Reads the attribute called `attr_name` and, when its value starts with
/// `og:`, pairs the rest of that value with the element's `content`
/// attribute.
///
/// Returns `None` when the attribute is missing, when its value does not
/// start with `og:`, or when there is no `content` attribute.
fn extract_open_graph_prop(attr_name: &str, attrs: &Vec<Attribute>) -> Option<(String, String)> {
    let prefix = "og:";
    attr(attr_name, attrs).and_then(|property| {
        if !property.starts_with(prefix) {
            return None;
        }
        // Slice by the prefix's *byte* length. The prefix has just been
        // matched, so this offset is a valid char boundary and the safe
        // slice cannot panic, even when the rest of the name is not ASCII.
        let key = property[prefix.len()..].to_string();
        attr("content", attrs).map(|content| (key, content))
    })
}

/// Builds an `Image` from the element's `src` attribute, or returns `None`
/// when the element has no `src` attribute.
pub fn extract_image(attrs: &Vec<Attribute>) -> Option<Image> {
    // `attr` already yields an owned `String`, so it is handed over as is.
    attr("src", attrs).map(Image::new)
}

/// Builds an `Audio` from the element's `src` attribute, or returns `None`
/// when the element has no `src` attribute.
pub fn extract_audio(attrs: &Vec<Attribute>) -> Option<Audio> {
    // `attr` already yields an owned `String`, so it is handed over as is.
    attr("src", attrs).map(Audio::new)
}

/// Builds a `Video` from the element's `src` attribute, or returns `None`
/// when the element has no `src` attribute.
pub fn extract_video(attrs: &Vec<Attribute>) -> Option<Video> {
    // `attr` already yields an owned `String`, so it is handed over as is.
    attr("src", attrs).map(Video::new)
}

#[cfg(test)]
mod test {
    use super::*;
    use object::ObjectType;

    /// Runs `extract` with default options over a small page that declares
    /// four Open Graph properties and checks the resulting object's title,
    /// type, URL and single image.
    #[test]
    fn extract_open_graph_object() {
        let html = r#"
<html prefix="og: http://ogp.me/ns#">
<head>
<title>The Rock (1996)</title>
<meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" />
<meta property="og:url" content="http://www.imdb.com/title/tt0117500/" />
<meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" />
</head>
</html>
                "#;
        // A byte slice implements `Read`, so it can be handed to `extract`.
        let mut reader: &[u8] = html.as_bytes();
        let outcome = extract(&mut reader, Opts::default());
        assert!(outcome.is_ok());

        let object = outcome.unwrap();
        assert_eq!(&object.title, "The Rock");
        assert_eq!(object.obj_type, ObjectType::Movie);
        assert_eq!(&object.url, "http://www.imdb.com/title/tt0117500/");
        assert_eq!(object.images.len(), 1);
        assert_eq!(&object.images[0].url, "http://ia.media-imdb.com/images/rock.jpg");
    }
}