//! Crawls HTTP directory listings and queues every discovered file as a task.

use crate::errors::*;
use crate::html;
use crate::queue:: {Task, QueueClient};
use reqwest::blocking::Client;
use std::collections::VecDeque;
use url::Url;

/// Crawls the HTTP directory listing rooted at `base` and pushes one task per
/// discovered file to `client`.
///
/// The crawl is breadth-first: each listing page is fetched with `http`, its
/// `<a>` hrefs are resolved against the page url, and only links that extend
/// the current page's url are followed. Links ending in `/` are treated as
/// sub-directories and fetched later; every other child link becomes a
/// [`Task`] whose first argument is the link's path relative to the root url.
///
/// The relative path is computed against the *parsed* form of `base`, not the
/// raw string: parsing may normalize the url (lower-cased host, dropped
/// default port, added trailing `/`), so the raw string's length is not a
/// reliable offset into the resolved links.
///
/// # Errors
///
/// Returns an error if `base` is not a valid url, if a request cannot be sent
/// or yields an HTTP error status, if a response body cannot be read or
/// parsed for links, if an href cannot be resolved against its page url, or
/// if `client.push_work` fails.
pub fn queue(client: &mut dyn QueueClient, http: &Client, base: &str) -> Result<()> {
    let root_url = base.parse::<Url>()
        .context("Failed to parse target as url")?;
    // Keep the normalized root around; every url we ever enqueue starts with it.
    let root = root_url.as_str().to_owned();

    let mut queue = VecDeque::new();
    queue.push_back(root_url);

    while let Some(target) = queue.pop_front() {
        debug!("Fetching directory listing from url {:?}", target);
        let resp = http.get(target.clone())
            .send()
            .context("Failed to send request")?
            .error_for_status()
            .context("Got http error")?;

        let body = resp.text()?;
        debug!("Downloaded {} bytes", body.len());
        let links = html::parse_links(body.as_bytes())?;
        debug!("Discovered {} <a> tags", links.len());

        let parent = target.as_str();

        for href in &links {
            debug!("Discovered href: {:?}", href);
            let link = target.join(href)?;
            let link_str = link.as_str();
            debug!("Discovered link: {:?}", link_str);

            // Only descend: skip the page itself, parent links and foreign urls.
            if !link_str.starts_with(parent) || link_str == parent {
                debug!("Not a child link, skipping");
                continue;
            }

            if link_str.ends_with('/') {
                info!("traversing into directory: {:?}", link_str);
                queue.push_back(link);
            } else {
                // `link_str` starts with `parent`, which in turn starts with
                // `root`, so the prefix always matches; the fallback only
                // guards against a panic should that invariant ever change.
                let relative = link_str
                    .strip_prefix(root.as_str())
                    .unwrap_or(link_str)
                    .to_string();
                let task = Task::url(relative, link);
                client.push_work(task)?;
            }
        }
    }

    Ok(())
}