#![warn(missing_docs)]
use lazy_static::lazy_static;
use regex::Regex;
use reqwest::blocking::Client;
use std::io::Result;
use std::sync::mpsc::{channel, Receiver, Sender};
use std::time::Duration;
use threadpool::ThreadPool;
use url::Url;
/// List every file url reachable under `url` by recursively crawling
/// "Index of" style directory listings with a fresh [`Crawler`].
pub fn list(url: Url) -> Result<Vec<Url>> {
    Crawler::new().list(url)
}
/// Tracks the path length of the directory currently being visited so the
/// crawler only descends into strictly deeper listings.
struct Visitor {
    min_len: usize,
}

impl Visitor {
    /// Root visitor: any url with a non-empty path is accepted.
    fn new() -> Visitor {
        Visitor { min_len: 0 }
    }

    /// Produce the visitor for `url` when its path is strictly longer than
    /// the current one; `None` otherwise (e.g. parent-directory links),
    /// which prevents the crawl from looping back upward.
    fn visit(&self, url: &Url) -> Option<Visitor> {
        match url.path().len() {
            len if len > self.min_len => Some(Visitor { min_len: len }),
            _ => None,
        }
    }
}
// Channel payload: `Some(result)` carries one discovered url or an error;
// `None` marks the end of a single directory listing.
type Message = Option<Result<Url>>;

/// A parallel crawler for http "Index of" directory listings.
pub struct Crawler {
    // Blocking http client shared (cloned) into every worker task.
    client: Client,
    // Pool running one listing fetch per queued task.
    workers: ThreadPool,
    // Receiving end, drained by `list` / `walk`.
    rx: Receiver<Message>,
    // Sending end, cloned into each worker.
    tx: Sender<Message>,
}
/// Wrap `msg` into a generic `std::io::Error` (kind `Other`), the common
/// error currency of this module.
fn mk_error(msg: &str) -> std::io::Error {
    use std::io::{Error, ErrorKind};
    Error::new(ErrorKind::Other, msg.to_string())
}
impl Crawler {
    /// Build a crawler backed by a four-worker thread pool and a shared
    /// blocking http client.
    ///
    /// Certificate verification is disabled whenever the
    /// `LOGREDUCE_SSL_NO_VERIFY` environment variable is set (any value).
    pub fn new() -> Crawler {
        let workers = ThreadPool::new(4);
        // Workers push discovered urls (or errors) through this channel.
        let (tx, rx) = channel();
        let client = Client::builder()
            .danger_accept_invalid_certs(std::env::var("LOGREDUCE_SSL_NO_VERIFY").is_ok())
            .build()
            .expect("Client");
        Crawler {
            workers,
            client,
            tx,
            rx,
        }
    }

    /// Crawl `url` recursively, blocking until the whole tree has been
    /// visited, and return every file url found.
    ///
    /// # Errors
    /// Returns the first error produced by a worker, or a generic error
    /// when any worker panicked.
    pub fn list(&self, url: Url) -> Result<Vec<Url>> {
        self.start(url);
        // Wait for all tasks, including sub-directory tasks that workers
        // queue while running.
        self.workers.join();
        if self.workers.panic_count() > 0 {
            Err(mk_error("Crawler panicked!"))
        } else {
            // Drain the channel: `flatten` discards the `None` end-of-directory
            // markers and `collect` short-circuits on the first `Err`.
            self.rx.try_iter().flatten().collect()
        }
    }

    /// Crawl `url` recursively, yielding urls as they are discovered
    /// instead of waiting for the whole crawl to finish.
    pub fn walk(self, url: Url) -> impl Iterator<Item = Result<Url>> {
        self.start(url);
        CrawlerIter {
            workers: self.workers,
            abort: false,
            rx: self.rx,
        }
    }

    // Queue the initial task for the root url.
    fn start(&self, url: Url) {
        Crawler::process(&Visitor::new(), &self.client, &self.workers, &self.tx, url);
    }

    // Queue one task that lists `url` and recurses into its sub-directories.
    // Associated fn (no `self`) so the worker closure doesn't borrow the Crawler.
    fn process(
        visitor: &Visitor,
        client: &Client,
        pool: &ThreadPool,
        tx: &Sender<Message>,
        url: Url,
    ) {
        // Only descend when the path grows strictly longer (see `Visitor`);
        // this skips parent-directory links and avoids infinite loops.
        if let Some(visitor) = visitor.visit(&url) {
            let tx = tx.clone();
            let sub_pool = pool.clone();
            let client = client.clone();
            pool.execute(move || match http_list(&client, url) {
                Ok(urls) => {
                    for url in urls {
                        if url.path().ends_with("/etc/") {
                            // Skip `/etc/` directories — presumably bulky
                            // config dumps; TODO confirm the rationale.
                            continue;
                        } else if let Some(url) = path_dir(&url) {
                            // Sub-directory: recurse on the same pool.
                            Crawler::process(&visitor, &client, &sub_pool, &tx, url)
                        } else {
                            // Plain file: emit it.
                            tx.send(Some(Ok(url))).unwrap()
                        }
                    }
                    // End-of-directory marker, consumed by `CrawlerIter::next`.
                    tx.send(None).unwrap()
                }
                Err(e) => tx.send(Some(Err(e))).unwrap(),
            });
        }
    }
}
impl Default for Crawler {
fn default() -> Self {
Self::new()
}
}
/// Streaming iterator state returned by [`Crawler::walk`].
struct CrawlerIter {
    workers: ThreadPool,
    // Set once a worker panic has been reported, to terminate iteration.
    abort: bool,
    rx: Receiver<Message>,
}
impl CrawlerIter {
    /// True once iteration should stop: an abort was requested, or every
    /// task has finished and no worker panicked (panics are reported
    /// separately by `next`).
    fn is_done(&self) -> bool {
        if self.abort {
            return true;
        }
        let pending = self.workers.active_count() + self.workers.queued_count();
        pending == 0 && self.workers.panic_count() == 0
    }
}
impl Iterator for CrawlerIter {
    type Item = Result<Url>;

    // Poll the channel while workers are still running. Arm order matters:
    // the done-check must precede the timeout fallback, and the panic-check
    // must only fire when `is_done` is false (panic_count > 0 makes
    // `is_done` return false until `abort` is set).
    fn next(&mut self) -> Option<Self::Item> {
        match self.rx.try_recv() {
            // A worker produced a url (or an error): yield it.
            Ok(Some(r)) => Some(r),
            // `None` is just an end-of-directory marker: poll again.
            Ok(None) => self.next(),
            // Channel empty and no pending work: iteration is complete.
            Err(_) if self.is_done() => None,
            // Channel empty because a worker died: yield one error, then stop.
            Err(_) if self.workers.panic_count() > 0 => {
                self.abort = true;
                Some(Err(mk_error("Crawler panicked!")))
            }
            // Workers still busy: block briefly for the next message, then retry.
            Err(_) => match self.rx.recv_timeout(Duration::from_secs(1)) {
                Ok(Some(r)) => Some(r),
                _ => self.next(),
            },
        }
    }
}
/// Return the directory url for `url`, or `None` when it is a plain file.
///
/// A url is a directory when its path ends with `/`, or ends with
/// `/index.html` — in which case the `index.html` component is stripped so
/// the directory itself is crawled.
fn path_dir(url: &Url) -> Option<Url> {
    let path = url.path();
    if path.ends_with('/') {
        Some(url.clone())
    } else if let Some(parent) = path
        .strip_suffix("index.html")
        // Require the preceding '/', so e.g. `myindex.html` is not a match.
        .filter(|parent| parent.ends_with('/'))
    {
        // Previously done with a hard-coded `len() - 10`; `strip_suffix`
        // removes the magic number while keeping the trailing slash.
        let mut new_url = url.clone();
        new_url.set_path(parent);
        Some(new_url)
    } else {
        None
    }
}
/// Fetch `url` and parse its body as an "Index of" page, returning the
/// links it contains resolved against the final (post-redirect) url.
pub fn http_list(client: &Client, url: Url) -> Result<Vec<Url>> {
    let resp = client
        .get(url)
        .send()
        .map_err(|e| mk_error(&format!("Reqwest failed {}", e)))?;
    // Use the response url as base so redirects are taken into account.
    let base_url = resp.url().clone();
    let text = resp
        .text()
        .map_err(|e| mk_error(&format!("Response failed {}", e)))?;
    parse_index_of(base_url, &text)
}
/// Extract every `<a href="...">` link from an "Index of" page and join it
/// onto `base_url`, failing on the first invalid link.
fn parse_index_of(base_url: Url, page: &str) -> Result<Vec<Url>> {
    lazy_static! {
        // First character class excludes '?' so the sort links
        // (e.g. `?C=N;O=D`) emitted by Apache listings are ignored.
        static ref RE: Regex = Regex::new(r#"<a href="([\\/a-zA-Z0-9][^"]+)">"#).unwrap();
    }
    let mut urls = Vec::new();
    for capture in RE.captures_iter(page) {
        let link = capture.get(1).unwrap().as_str();
        let url = base_url
            .join(link)
            .map_err(|e| mk_error(&format!("Invalid link {}", e)))?;
        urls.push(url);
    }
    Ok(urls)
}
#[test]
fn test_httpdir() {
    let mut server = mockito::Server::new();
    let root = "/logs/98/24398/5/check/dhall-diff/23b9eed/";
    // Unwrap the join once up front instead of carrying a Result around.
    let url = Url::parse(&server.url()).unwrap().join(root).unwrap();
    // Each mock is hit twice: once by `list`, once by `walk`.
    let base_mock = server.mock("GET", root)
        .with_body(
            r#"
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
<head>
<title>Index of /logs/98/24398/5/check/dhall-diff/23b9eed</title>
</head>
<body>
<h1>Index of /logs/98/24398/5/check/dhall-diff/23b9eed</h1>
<table>
<tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th></tr>
<tr><th colspan="4"><hr></th></tr>
<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/logs/98/24398/5/check/dhall-diff/">Parent Directory</a></td><td>&nbsp;</td><td align="right">  - </td></tr>
<tr><td valign="top"><img src="/icons/compressed.gif" alt="[   ]"></td><td><a href="job-output.json.gz">job-output.json.gz</a></td><td align="right">2022-03-23 17:33  </td><td align="right">7.0K</td></tr>
<tr><td valign="top"><img src="/icons/compressed.gif" alt="[   ]"></td><td><a href="job-output.txt.gz">job-output.txt.gz</a></td><td align="right">2022-03-23 17:33  </td><td align="right">4.7K</td></tr>
<tr><td valign="top"><img src="/icons/folder.gif" alt="[DIR]"></td><td><a href="zuul-info/">zuul-info/</a></td><td align="right">2022-03-23 17:31  </td><td align="right">  - </td></tr>
<tr><td valign="top"><img src="/icons/unknown.gif" alt="[   ]"></td><td><a href="zuul-manifest.json">zuul-manifest.json</a></td><td align="right">2022-03-23 17:33  </td><td align="right">478 </td></tr>
<tr><th colspan="4"><hr></th></tr>
</table>
</body></html>
"#).expect(2).create();
    let info_mock = server.mock("GET", &*format!("{}zuul-info/", root)).with_body(
        r#"
<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/logs/98/24398/5/check/dhall-diff/23b9eed/">Parent Directory</a></td><td>&nbsp;</td><td align="right">  - </td></tr>
<tr><td valign="top"><img src="/icons/text.gif" alt="[TXT]"></td><td><a href="inventory.yaml">inventory.yaml</a></td><td align="right">2022-03-23 17:31  </td><td align="right">817 </td></tr>
"#).expect(2).create();
    // Any other request (sort links, parent dirs, ...) is a bug.
    let catch_all = server
        .mock("GET", mockito::Matcher::Any)
        .with_body("oops")
        .expect(0)
        .create();
    // Expected: job-output.json.gz, job-output.txt.gz, zuul-manifest.json
    // and zuul-info/inventory.yaml.
    let res = list(url.clone()).unwrap();
    assert_eq!(res.len(), 4);
    // NOTE(review): comparing the two crawls element-wise assumes a stable
    // discovery order across runs — confirm this isn't flaky under load.
    let iter_res = Crawler::new()
        .walk(url)
        .collect::<Result<Vec<_>>>()
        .unwrap();
    assert_eq!(res, iter_res);
    catch_all.assert();
    info_mock.assert();
    base_mock.assert();
}