//! # urls2disk
//!
//! [![Build Status](https://travis-ci.org/bcmyers/urls2disk.svg?branch=master)](https://travis-ci.org/bcmyers/urls2disk)
//!
//! [crates.io](https://crates.io/crates/urls2disk) |
//! [docs.rs](https://docs.rs/urls2disk) |
//! [github.com](https://github.com/bcmyers/urls2disk)
//!
//! `urls2disk` is a Rust crate that helps you download a series of webpages
//! in parallel and save them to disk. Depending on the options you choose, it
//! will either write the raw bytes of each webpage to disk or first convert
//! the pages to PDF and write those instead. It's useful for general web
//! scraping as well as for converting batches of webpages to PDF.
//!
//! A key feature of `urls2disk` is that you can set a maximum number of
//! requests per second while downloading webpages, effectively throttling
//! yourself so that you don't run afoul of servers that block clients who
//! hit them with too many requests at once.
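//!
//! For instance, a minimal rate-limited client might be built like this (the
//! cap of two requests per second is an arbitrary value for illustration; the
//! full example below shows the rest of the builder options):
//!
//! ```rust,no_run
//! use urls2disk::{ClientBuilder, Result};
//!
//! fn build_client() -> Result<()> {
//!     // The client will issue at most two requests per second.
//!     let _client = ClientBuilder::default()
//!         .set_max_requests_per_second(2)
//!         .build()?;
//!     Ok(())
//! }
//! # fn main() { build_client().unwrap(); }
//! ```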
//!
//! Under the hood, `urls2disk` uses [wkhtmltopdf](https://wkhtmltopdf.org/) to
//! convert webpages to PDF if you choose that option; so to use it you'll need
//! wkhtmltopdf installed on your machine. Installing wkhtmltopdf on macOS with
//! [Homebrew](https://brew.sh/) is simple: just run
//! `brew install Caskroom/cask/wkhtmltopdf` in your terminal. On other systems,
//! or if you don't have Homebrew, you're on your own for installing
//! wkhtmltopdf, but perhaps at some point I'll look up instructions for
//! different setups and include them here. As far as versions go, I've only
//! tested with wkhtmltopdf 0.12.4.
//!
//! Here's an example that uses `urls2disk` to download Apple, Inc.'s annual
//! reports for 2010 through 2017 from the SEC website:
//!
//! ```rust
//! extern crate reqwest;
//! extern crate urls2disk;
//!
//! use std::fs;
//! use std::path::Path;
//!
//! use urls2disk::{wkhtmltopdf, ClientBuilder, Result, SimpleDocument, Url};
//!
//! // This function downloads Apple, Inc.'s annual reports for the years 2010 to 2017
//! // from the SEC's website to your disk. It downloads two copies of each annual
//! // report: one containing the raw HTML and one converted to PDF.
//! fn run() -> Result<()> {
//!     // Create an output directory.
//!     let output_directory = Path::new("./data");
//!     if !output_directory.exists() {
//!         fs::create_dir_all(output_directory)?;
//!     }
//!
//!     // Create a vector of urls we would like to download.
//!     // These urls represent the annual reports for Apple, Inc. from 2010 to 2017.
//!     let base = "https://www.sec.gov/Archives/edgar/data/";
//!     let urls = vec![
//!         "320193/000119312510238044/d10k.htm",
//!         "320193/000119312511282113/d220209d10k.htm",
//!         "320193/000119312512444068/d411355d10k.htm",
//!         "320193/000119312513416534/d590790d10k.htm",
//!         "320193/000119312514383437/d783162d10k.htm",
//!         "320193/000119312515356351/d17062d10k.htm",
//!         "320193/000162828016020309/a201610-k9242016.htm",
//!         "320193/000032019317000070/a10-k20179302017.htm",
//!     ].iter()
//!         .map(|stem| format!("{}{}", &base, stem))
//!         .collect::<Vec<String>>();
//!
//!     // Turn the vector of urls into a vector of boxed documents (here we use the
//!     // SimpleDocument struct, one possible implementer of the Document trait).
//!     // For this batch we set the wkhtmltopdf option to false, so when we feed the
//!     // list to the Client it will download the raw webpages as HTML instead of
//!     // first converting them to PDF.
//!     let html_documents = urls.iter()
//!         .enumerate()
//!         .map(|(i, url_string)| {
//!             let filename = format!("Apple 10-K {}.html", i + 2010);
//!             let path = output_directory.join(&filename);
//!             let url = url_string.parse::<Url>()?;
//!             let wkhtmltopdf = false;
//!             let document = SimpleDocument::new(path, url, wkhtmltopdf);
//!             Ok(Box::new(document))
//!         })
//!         .collect::<Result<Vec<Box<SimpleDocument>>>>()?;
//!
//!     // Turn the vector of urls into another vector of boxed documents (to show
//!     // off additional functionality). This time we set the wkhtmltopdf option to
//!     // true, so when we feed this list to the Client it will convert the webpages
//!     // to PDF before writing them to disk.
//!     let pdf_documents = urls.iter()
//!         .enumerate()
//!         .map(|(i, url_string)| {
//!             let filename = format!("Apple 10-K {}.pdf", i + 2010);
//!             let path = output_directory.join(&filename);
//!             let url = url_string.parse::<Url>()?;
//!             let wkhtmltopdf = true;
//!             let document = SimpleDocument::new(path, url, wkhtmltopdf);
//!             Ok(Box::new(document))
//!         })
//!         .collect::<Result<Vec<Box<SimpleDocument>>>>()?;
//!
//!     // Combine our two vectors into one vector of Box<SimpleDocument>.
//!     let mut documents = [&html_documents[..], &pdf_documents[..]].concat();
//!
//!     // Create the client.
//!     // Here, we're showing several customization options, but if you want to use
//!     // just the default settings, you could simply build the client with
//!     // `let client = ClientBuilder::default().build()?;`
//!     let client = ClientBuilder::default()
//!         .set_max_requests_per_second(9)
//!         .set_max_threads_cpu(4)
//!         .set_max_threads_io(50)
//!         .set_reqwest_client(reqwest::Client::new())
//!         .set_wkhtmltopdf_setting(wkhtmltopdf::Setting::Zoom(3.5))
//!         .set_wkhtmltopdf_settings(vec![
//!             wkhtmltopdf::Setting::DisableExternalLinks(true),
//!             wkhtmltopdf::Setting::DisableJavascript(true),
//!         ])
//!         .build()?;
//!
//!     // Let the client go. It will download and write to disk all the
//!     // documents while simultaneously respecting the 'requests per second' and
//!     // other limits we provided. If you already have the documents on disk,
//!     // the client will not redownload them.
//!     client.get_documents(&mut documents)?;
//!
//!     // Note: At this point you can also access the raw bytes of every url you
//!     // downloaded, since they are stored on each SimpleDocument in addition
//!     // to being saved on your disk.
//!     Ok(())
//! }
//!
//! fn main() {
//!     run().unwrap();
//! }
//! ```
#![cfg_attr(feature = "clippy", feature(plugin))]
#![cfg_attr(feature = "clippy", plugin(clippy))]
#![cfg_attr(not(feature = "clippy"), allow(unknown_lints))]
#![deny(missing_debug_implementations)]
#![deny(missing_docs)]
#![doc(html_root_url = "https://docs.rs/urls2disk/0.1.1")]

#[macro_use]
extern crate cfg_if;
extern crate crossbeam;
#[macro_use]
extern crate failure;
#[macro_use]
extern crate log;
extern crate num_cpus;
extern crate reqwest;
extern crate url;

mod client;
mod client_builder;
mod document;
mod error;
mod semaphore;
mod simple_document;
mod utils;

pub mod wkhtmltopdf;

pub use client::Client;
pub use client_builder::ClientBuilder;
pub use document::Document;
pub use error::{Error, Result};
pub use simple_document::SimpleDocument;

/// `Url` is an alias for `url::Url`.
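///
/// Since it's just an alias, it works exactly like `url::Url`; for example,
/// you can parse one from a string:
///
/// ```rust
/// use urls2disk::Url;
///
/// let url = "https://www.sec.gov/".parse::<Url>().unwrap();
/// assert_eq!(url.scheme(), "https");
/// ```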
pub type Url = url::Url;