hansard/
retrieve.rs

1use atom_syndication::Feed;
2use hyper::Client;
3use std::io::Read;
4use std::thread;
5
6use std::fs;
7use std::fs::{ File, create_dir };
8use std::io::prelude::*;
9use std::path::Path;
10use zip::ZipArchive;
11
/// Atom feed listing the bound-volume archives; `retrieve` downloads and parses it.
const BOUND_VOL_URL: &'static str = "http://api.data.parliament.uk/resources/files/feed?dataset=14";
/// Root directory under which all downloaded and extracted files are stored.
const BASE: &'static str = "./data";
/// Sub-directory of `BASE` where `get_save_zip` caches the downloaded volume zips.
const VOL_ZIP_DIR: &'static str = "vol_zip";
/// Sub-directory of `BASE` where `process_zip` puts entries whose name ends in `xml`.
const XML_DIR: &'static str = "xml";
/// Sub-directory of `BASE` where `process_zip` puts every other extracted entry,
/// including nested zips.
const INNER_ZIP_DIR: &'static str = "inner_zip";
17
18fn get_save_zip(url: String) -> thread::JoinHandle<()> {
19    thread::spawn(move || {
20        let split_path = url.split("/").collect::<Vec<&str>>();
21        let file_name = split_path.last().unwrap();
22        let full_path = format!("{}/{}/{}", BASE, VOL_ZIP_DIR, file_name);
23
24        if Path::new(full_path.as_str()).exists() {
25            info!("Skipping: {}", full_path);
26        } else {
27            info!("Getting: {}", url);
28
29            let mut zip_buf = Vec::new();
30            if let Err(e) =  Client::new()
31                .get(url.as_str())
32                .send().unwrap()
33                .read_to_end(&mut zip_buf) {
34
35                info!("Error: {:?}", e);
36                return;
37            }
38
39            info!("Saving: {}", file_name);
40
41            let mut file = File::create(full_path.clone()).unwrap();
42            file.write_all(zip_buf.as_slice()).unwrap();
43        }
44
45        let zip_file = File::open(full_path).unwrap();
46        process_zip(zip_file);
47    })
48}
49
50fn process_zip<T: Read + Seek>(zip_file: T) {
51    let mut zip = ZipArchive::new(zip_file).unwrap();
52
53    for i in 0..zip.len() {
54        let mut file = zip.by_index(i).unwrap();
55        let og_file_name = format!("{}", file.name());
56        let inner_split_path = og_file_name.split("/").collect::<Vec<&str>>();
57        let inner_file_name = inner_split_path.last().unwrap();
58        let folder = if inner_file_name.ends_with("xml") { XML_DIR } else { INNER_ZIP_DIR };
59        let inner_file_path = format!("{}/{}/{}", BASE, folder, inner_file_name);
60
61        if !inner_file_name.contains("html") &&
62            !inner_file_name.ends_with("pdf") &&
63            !inner_file_name.ends_with("htm") &&
64            !Path::new(inner_file_path.as_str()).exists() {
65
66            info!("Extracting: {}", file.name());
67
68            let mut zip_buf = Vec::new();
69            if let Err(e) = file.read_to_end(&mut zip_buf) {
70                info!("Error: {}", e);
71            }
72
73            info!("Saving: {}", inner_file_path);
74
75            let mut inner_file = File::create(inner_file_path.clone()).unwrap();
76            inner_file.write_all(zip_buf.as_slice()).unwrap();
77        } else {
78            info!("Skipping: {}", inner_file_name);
79        }
80
81        if inner_file_name.ends_with("zip") &&
82            !inner_file_name.contains("html") {
83            let inner_zip = File::open(inner_file_path).unwrap();
84            process_zip(inner_zip);
85        }
86    }
87}
88
89/// Returns a vec of the Bound volumes xml
90pub fn xml() -> Vec<String> {
91    retrieve();
92
93    fs::read_dir(format!("{}/{}", BASE, XML_DIR))
94        .unwrap()
95        .map(|ent| {
96            let mut xml_buf = String::new();
97            File::open(ent.unwrap().path()).unwrap().read_to_string(&mut xml_buf).unwrap_or(0usize);
98            xml_buf
99        }).collect::<Vec<String>>()
100}
101
102/// Retrieves the bound volumes
103pub fn retrieve() {
104    let mut atom_str = String::new();
105
106    Client::new()
107        .get(BOUND_VOL_URL)
108        .send().unwrap()
109        .read_to_string(&mut atom_str).unwrap();
110
111    let feed = atom_str.parse::<Feed>().unwrap();
112
113    if let Err(e) = create_dir(BASE) {
114        info!("Create dir: {}", e);
115    }
116    if let Err(e) = create_dir(format!("{}/{}", BASE, VOL_ZIP_DIR)) {
117        info!("Create dir: {}", e);
118    }
119    if let Err(e) = create_dir(format!("{}/{}", BASE, XML_DIR)) {
120        info!("Create dir: {}", e);
121    }
122    if let Err(e) = create_dir(format!("{}/{}", BASE, INNER_ZIP_DIR)) {
123        info!("Create dir: {}", e);
124    }
125
126    let vol_urls = feed.entries.iter()
127        .map(|e| e.links.first().unwrap().href.clone())
128        .collect::<Vec<String>>();
129
130    let handles = vol_urls.iter()
131        .map(|url| get_save_zip(url.clone()))
132        .collect::<Vec<thread::JoinHandle<()>>>();
133
134    for h in handles {
135        if let Err(e) = h.join() {
136            info!("Error: {:?}", e);
137        }
138    }
139}