extern crate clap;
extern crate futures;
extern crate nipper;
extern crate reqwest;
extern crate tokio;
use clap::{App, Arg};
use futures::future;
use nipper::Document;
use std::convert::*;
use std::error::Error;
use std::fmt;
use tokio::fs::File;
use tokio::io::AsyncWriteExt;
/// Result alias used throughout this file; the error side is always [`CrateError`].
type Result<T> = std::result::Result<T, CrateError>;

/// Coarse failure categories reported by the tool.
///
/// The variants carry no payload, so the underlying cause is not preserved.
#[derive(Debug, Clone, PartialEq)]
enum CrateError {
    /// Reading a response body or touching the local file system failed.
    IoError,
    /// An HTTP request could not be completed.
    HttpReqError,
    /// The supplied URL does not use a supported scheme.
    URLFormatError,
}

impl fmt::Display for CrateError {
    /// Writes the fixed, user-facing message for the variant.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let message = match self {
            CrateError::IoError => "Io Error!",
            CrateError::HttpReqError => "Http Request Error!",
            CrateError::URLFormatError => "URL Format Error!",
        };
        f.write_str(message)
    }
}

impl Error for CrateError {}
/// Command-line options after parsing, borrowing their text for the
/// `'input` lifetime instead of owning copies.
#[derive(Debug, Default)]
struct CliOpts<'input> {
/// URL of the page whose hyperlinks are scraped.
page: &'input str,
/// Directory prefix under which downloaded files are written.
out_dir: &'input str,
/// File type used to filter the hyperlinks found on the page.
ftype: FileType,
}
/// Document type used to filter the hyperlinks found on a page.
#[derive(Debug, Clone, Copy, PartialEq)]
enum FileType {
    PDF,
    DOC,
    DOCX,
    XLSX,
    CSV,
    PPT,
    PPTX,
    /// No specific type requested; this is also the default.
    ALL,
}

impl From<&str> for FileType {
    /// Parses a file suffix such as `pdf`, `PDF` or `.pdf`.
    ///
    /// Matching ignores ASCII case and leading dots, so the common ways of
    /// typing a suffix are all recognised. Anything unrecognised (including
    /// the empty string) maps to [`FileType::ALL`].
    fn from(input: &str) -> Self {
        let normalized = input.trim_start_matches('.').to_ascii_lowercase();
        match normalized.as_str() {
            "pdf" => FileType::PDF,
            "doc" => FileType::DOC,
            "docx" => FileType::DOCX,
            "xlsx" => FileType::XLSX,
            "ppt" => FileType::PPT,
            "pptx" => FileType::PPTX,
            "csv" => FileType::CSV,
            _ => FileType::ALL,
        }
    }
}

// Implemented as `From` rather than a hand-written `Into`: the standard
// blanket impl still provides `FileType: Into<&str>`, so `ftype.into()`
// keeps working, and `<&str>::from(ftype)` becomes available as well.
impl<'a> From<FileType> for &'a str {
    /// Returns the dotted suffix for the type (e.g. `".pdf"`), or the empty
    /// string for [`FileType::ALL`].
    fn from(ftype: FileType) -> Self {
        match ftype {
            FileType::PDF => ".pdf",
            FileType::DOC => ".doc",
            FileType::DOCX => ".docx",
            FileType::XLSX => ".xlsx",
            FileType::PPT => ".ppt",
            FileType::PPTX => ".pptx",
            FileType::CSV => ".csv",
            FileType::ALL => "",
        }
    }
}

impl Default for FileType {
    /// With no type requested, every file type is accepted.
    fn default() -> Self {
        FileType::ALL
    }
}
/// Downloads `location` and stores it inside `out_dir`.
///
/// The file is named after the last `/`-separated segment of `location`.
///
/// # Errors
///
/// * [`CrateError::HttpReqError`] if the request fails or the server answers
///   with a 4xx/5xx status.
/// * [`CrateError::IoError`] if the body cannot be read, or the file cannot
///   be created, written or flushed.
async fn download_one(location: &str, out_dir: &str) -> Result<()> {
    // `rsplit` always yields at least one item, so the fallback is never hit;
    // it only avoids an `unwrap` on this path.
    let file_name = location.rsplit('/').next().unwrap_or("");
    // Join as a path so `-o out` and `-o out/` both resolve to `out/<file>`
    // instead of gluing the file name onto the directory name.
    let target = std::path::Path::new(out_dir).join(file_name);

    let response = reqwest::get(location)
        .await
        // Reject error statuses so an HTML error page is never saved as if it
        // were the requested document.
        .and_then(|resp| resp.error_for_status())
        .map_err(|_| CrateError::HttpReqError)?;
    let body = response.bytes().await.map_err(|_| CrateError::IoError)?;

    let mut file = File::create(&target)
        .await
        .map_err(|_| CrateError::IoError)?;
    file.write_all(&body)
        .await
        .map_err(|_| CrateError::IoError)?;
    // Tokio's `File` may complete writes in the background; flushing before
    // the handle is dropped makes sure a late write failure is reported.
    file.flush().await.map_err(|_| CrateError::IoError)
}
/// Downloads every URL in `vector_path` into `out_dir` concurrently and
/// prints each failure to stderr.
///
/// `#[tokio::main]` wraps the async body in a runtime, so callers invoke this
/// as an ordinary blocking function.
#[tokio::main]
async fn download_all(vector_path: Vec<String>, out_dir: &str) {
    let downloads = vector_path
        .iter()
        .map(|path| download_one(path, out_dir));
    let outcomes = future::join_all(downloads).await;

    // Report failures only. Calling `unwrap_err` on every outcome would
    // panic as soon as a single download succeeded.
    for err in outcomes.into_iter().filter_map(|outcome| outcome.err()) {
        eprintln!("{:?}", err);
    }
}
/// Fetches `base_uri` and collects the targets of its `<a href>` links that
/// match `ftype`, resolved to absolute URLs.
///
/// With [`FileType::ALL`] every href containing a `.` is kept; otherwise only
/// hrefs ending in the type's suffix are kept. Hrefs are resolved against
/// `base_uri`, so relative, root-relative and absolute links all produce a
/// valid URL. Links that cannot be resolved, or that do not resolve to
/// `http`/`https`, are skipped.
///
/// Returns `Ok(None)` when no link matches.
///
/// # Errors
///
/// * [`CrateError::URLFormatError`] if `base_uri` fails `check_url` or cannot
///   be parsed as a URL.
/// * [`CrateError::HttpReqError`] if the request fails, the server answers
///   with a 4xx/5xx status, or the body cannot be read as text.
fn parse_page(base_uri: &str, ftype: FileType) -> Result<Option<Vec<String>>> {
    check_url(base_uri)?;
    let base = reqwest::Url::parse(base_uri).map_err(|_| CrateError::URLFormatError)?;

    let body = reqwest::blocking::get(base_uri)
        // Do not scrape the HTML of an error page.
        .and_then(|resp| resp.error_for_status())
        .and_then(|resp| resp.text())
        .map_err(|_| CrateError::HttpReqError)?;

    let document = Document::from(&body);
    let wanted_suffix: &str = ftype.into();
    let links: Vec<String> = document
        .select("a")
        .iter()
        .map(|elem| elem.attr("href").unwrap_or_default().to_string())
        .filter(|href| {
            if ftype == FileType::ALL {
                href.contains('.')
            } else {
                href.ends_with(wanted_suffix)
            }
        })
        // Plain concatenation of base and href is wrong whenever the page URL
        // names a file or the href is root-relative or absolute.
        .filter_map(|href| base.join(&href).ok())
        .filter(|url| url.scheme() == "http" || url.scheme() == "https")
        .map(|url| url.to_string())
        .collect();

    if links.is_empty() {
        Ok(None)
    } else {
        Ok(Some(links))
    }
}
/// Checks that `url_str` starts with an `http:` or `https:` scheme.
///
/// The scheme is everything before the first `:` and is compared without
/// regard to ASCII case, because URL schemes are case-insensitive. A string
/// with no `:` has no scheme and is rejected.
///
/// # Errors
///
/// Returns [`CrateError::URLFormatError`] when the scheme is missing or is
/// neither `http` nor `https`.
fn check_url(url_str: &str) -> Result<()> {
    let scheme = match url_str.find(':') {
        Some(idx) => &url_str[..idx],
        None => return Err(CrateError::URLFormatError),
    };
    if scheme.eq_ignore_ascii_case("http") || scheme.eq_ignore_ascii_case("https") {
        Ok(())
    } else {
        Err(CrateError::URLFormatError)
    }
}
/// Entry point: parses the command line, scrapes the requested page for
/// matching links and downloads them.
///
/// Scraping errors are printed to stderr; a page with no matching links
/// produces no output.
fn main() {
    let matches = App::new("dhref")
        .version("0.1.2")
        .author("Kostas L. <konlampro94@gmail.com>")
        .about("Download files embed in a page through\n relative and root-relative hyperlinks,\nfrom your terminal.")
        .arg(
            Arg::with_name("uri")
                .required(true)
                .takes_value(true)
                .help("Http page url to be scraped. (Required)"),
        )
        .arg(
            Arg::with_name("out_dir")
                .takes_value(true)
                .short("o")
                .help("Relative path for the folder to place the output. (Optional)"),
        )
        .arg(
            Arg::with_name("ftype").short("f").takes_value(true).help(
                "File suffix for the type of files to be searched( e.g pdf,doc,csv). (Optional)",
            ),
        )
        .get_matches();

    let raw_uri = matches
        .value_of("uri")
        .expect("clap rejects invocations without the required `uri` argument");
    // Lowercase only the scheme. Paths and query strings are case-sensitive,
    // so lowercasing the whole URL can point at a different resource.
    let page = match raw_uri.find(':') {
        Some(idx) => format!("{}{}", raw_uri[..idx].to_ascii_lowercase(), &raw_uri[idx..]),
        None => raw_uri.to_string(),
    };

    let cli_opts = CliOpts {
        page: &page,
        out_dir: matches.value_of("out_dir").unwrap_or("./"),
        ftype: matches.value_of("ftype").unwrap_or("").into(),
    };

    match parse_page(cli_opts.page, cli_opts.ftype) {
        Ok(Some(paths)) => {
            download_all(paths, cli_opts.out_dir);
        }
        Err(err) => eprintln!("Error: {}", err),
        Ok(None) => {}
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Live page fetched by `test_parse_page`; that test needs network access.
    const PAGE: &str = "https://15445.courses.cs.cmu.edu/fall2019/schedule.html";

    /// Only `http` and `https` URLs pass the scheme check.
    #[test]
    fn test_check_url() {
        assert_eq!(check_url("file://hello"), Err(CrateError::URLFormatError));
        assert_eq!(check_url("https://google.com"), Ok(()));
        assert_eq!(check_url("http://google.com"), Ok(()));
    }

    /// Scraping the live page succeeds for both filters, while a URL with no
    /// host is reported as an error.
    #[test]
    fn test_parse_page() {
        assert!(parse_page(PAGE, FileType::PDF).is_ok());
        assert!(parse_page(PAGE, FileType::ALL).is_ok());
        assert!(parse_page("http://", FileType::ALL).is_err());
    }
}