secret_scraper 0.1.3

A URL crawler tool and library for crawling web targets, discovering links, and detecting secrets using configurable regex rules.
Documentation
use std::{
    cell::RefCell,
    collections::{HashMap, HashSet},
    io::{self, Write},
    rc::Rc,
};

use secret_scraper::{
    handler::Secret,
    output::{Formatter, URLType, output_csv},
    urlparser::{ResponseStatus, URLNode, URLNodeBuilder},
};

/// Builds a depth-0 `URLNode` for `url` that carries only a response status.
///
/// Panics with "valid URL node" if the builder reports an error.
fn node(url: &str, status: ResponseStatus) -> URLNode {
    // Kept as one chain: the builder's ownership pattern is not visible here.
    let built = URLNodeBuilder::default()
        .url(String::from(url))
        .response_status(status)
        .depth(0)
        .build();
    built.expect("valid URL node")
}

/// Builds a depth-0 `URLNode` for `url` with response metadata filled in.
///
/// `content_length`, `content_type` and `title` are forwarded to the builder
/// as given; the two string options are converted to owned `String`s first.
///
/// Panics with "valid URL node" if the builder reports an error.
fn detailed_node(
    url: &str,
    status: ResponseStatus,
    content_length: Option<u64>,
    content_type: Option<&'static str>,
    title: Option<&'static str>,
) -> URLNode {
    let owned_content_type = content_type.map(String::from);
    let owned_title = title.map(String::from);

    // Kept as one chain: the builder's ownership pattern is not visible here.
    let built = URLNodeBuilder::default()
        .url(String::from(url))
        .response_status(status)
        .depth(0)
        .content_length(content_length)
        .content_type(owned_content_type)
        .title(owned_title)
        .build();
    built.expect("valid URL node")
}

/// Returns `input` with ANSI CSI escape sequences (`ESC [ ... final`) removed.
///
/// A sequence starts at `ESC` immediately followed by `[` and ends at its
/// first *final byte*, i.e. the first character in `0x40..=0x7E`. That covers
/// SGR colour codes (`ESC[1;31m`) as well as other CSI commands such as
/// erase-line (`ESC[2K`), so text following a non-`m` sequence is preserved.
///
/// An `ESC` that is not followed by `[` is copied through unchanged. An
/// unterminated sequence consumes the remainder of the input.
fn strip_ansi(input: &str) -> String {
    // The result can never be longer than the input.
    let mut out = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\u{1b}' && chars.next_if_eq(&'[').is_some() {
            // Drop parameter/intermediate characters up to and including the
            // final byte; stopping only at 'm' would eat visible text after
            // any non-SGR sequence.
            let _ = chars.find(|&code_ch| matches!(code_ch, '\u{40}'..='\u{7e}'));
        } else {
            out.push(ch);
        }
    }
    out
}

/// Collects the given string slices into a set of owned `String`s.
///
/// Duplicate entries collapse into one, as with any `HashSet`.
fn domains(items: &[&str]) -> HashSet<String> {
    let mut set = HashSet::with_capacity(items.len());
    for &item in items {
        set.insert(String::from(item));
    }
    set
}

/// Moves the given nodes into a `HashSet`, consuming the vector.
fn children(items: Vec<URLNode>) -> HashSet<URLNode> {
    let mut set = HashSet::with_capacity(items.len());
    set.extend(items);
    set
}

/// In-memory byte sink whose clones all append to the same underlying buffer.
///
/// The bytes live behind `Rc<RefCell<..>>`, so a clone can be handed to an API
/// that takes ownership of a writer while the original handle is kept to read
/// back what was written.
#[derive(Clone, Default)]
struct SharedBuffer(Rc<RefCell<Vec<u8>>>);

impl SharedBuffer {
    /// Returns a copy of everything written so far, decoded as UTF-8.
    ///
    /// Panics with "utf8 csv" if the buffered bytes are not valid UTF-8.
    fn into_string(&self) -> String {
        let bytes = self.0.borrow().to_vec();
        String::from_utf8(bytes).expect("utf8 csv")
    }
}

impl Write for SharedBuffer {
    /// Appends all of `buf` to the shared buffer and reports it fully written.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let mut shared = self.0.borrow_mut();
        shared.extend(buf);
        Ok(buf.len())
    }

    /// Always succeeds; this method performs no work.
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}

/// A base URL and a child on different subdomains of `example.com` are both
/// expected under a single `example.com` heading, with no `Other` section.
#[test]
fn format_url_per_domain_groups_by_root_domain_and_includes_base_url() {
    let root_domains = domains(&["example.com"]);
    let crawl = HashMap::from([(
        node("https://www.example.com", ResponseStatus::Valid(200)),
        children(vec![node(
            "https://api.example.com/users",
            ResponseStatus::Valid(200),
        )]),
    )]);

    let formatter = Formatter::new(None);
    let rendered = formatter.format_url_per_domain(&root_domains, &crawl, URLType::Url);
    let text = strip_ansi(&rendered);

    // Heading counts the base URL plus its child.
    assert!(text.contains("2 URL from example.com:\n"));
    assert!(text.contains("https://www.example.com [200]"));
    assert!(text.contains("https://api.example.com/users [200]"));
    assert!(!text.contains("Other"));
}

/// URLs whose root domain is not in the configured set are expected under an
/// `Other` heading that appears after the first-party section.
#[test]
fn format_url_per_domain_places_external_root_domains_in_other_last() {
    let root_domains = domains(&["example.com"]);
    let linked = children(vec![
        node("https://cdn.example.com/app.js", ResponseStatus::Valid(200)),
        node("https://cdn.other.net/lib.js", ResponseStatus::Valid(200)),
    ]);
    let crawl = HashMap::from([(
        node("https://www.example.com", ResponseStatus::Valid(200)),
        linked,
    )]);

    let formatter = Formatter::new(None);
    let rendered = formatter.format_url_per_domain(&root_domains, &crawl, URLType::JS);
    let text = strip_ansi(&rendered);

    // Byte offsets of the two headings; each must be present.
    let own_at = text
        .find("2 JS from example.com:")
        .expect("first-party section");
    let other_at = text.find("1 JS from Other:").expect("other section");

    assert!(own_at < other_at);
    assert!(text.contains("https://www.example.com [200]"));
    assert!(text.contains("https://cdn.example.com/app.js [200]"));
    assert!(text.contains("https://cdn.other.net/lib.js [200]"));
}

/// With a 200 base, a 200 child and a 404 child, the heading is expected to
/// count two URLs and the 404 URL must not appear in the output.
#[test]
fn format_url_per_domain_counts_only_filtered_urls() {
    let root_domains = domains(&["example.com"]);
    let linked = children(vec![
        node("https://example.com/ok", ResponseStatus::Valid(200)),
        node("https://example.com/missing", ResponseStatus::Valid(404)),
    ]);
    let crawl = HashMap::from([(
        node("https://example.com", ResponseStatus::Valid(200)),
        linked,
    )]);

    let formatter = Formatter::new(None);
    let rendered = formatter.format_url_per_domain(&root_domains, &crawl, URLType::Url);
    let text = strip_ansi(&rendered);

    assert!(text.contains("2 URL from example.com:"));
    assert!(text.contains("https://example.com [200]"));
    assert!(text.contains("https://example.com/ok [200]"));
    assert!(!text.contains("https://example.com/missing"));
}

/// A child marked `ResponseStatus::Ignore` is expected to be left out of both
/// the listing and the heading count, leaving only the base URL.
#[test]
fn format_url_per_domain_filters_ignored_urls() {
    let root_domains = domains(&["example.com"]);
    let crawl = HashMap::from([(
        node("https://example.com", ResponseStatus::Valid(200)),
        children(vec![node(
            "https://example.com/image.png",
            ResponseStatus::Ignore,
        )]),
    )]);

    let formatter = Formatter::new(None);
    let rendered = formatter.format_url_per_domain(&root_domains, &crawl, URLType::Url);
    let text = strip_ansi(&rendered);

    assert!(text.contains("1 URL from example.com:"));
    assert!(text.contains("https://example.com [200]"));
    assert!(!text.contains("https://example.com/image.png"));
}

/// `output_csv` is expected to emit a header plus one row for every URL in
/// the hierarchy map and every URL in the secrets map, returning 3 here.
#[test]
fn output_csv_writes_secret_and_hierarchy_urls() {
    // Hierarchy: a titled HTML page linking to one untitled script.
    let hierarchy = HashMap::from([(
        detailed_node(
            "https://example.com",
            ResponseStatus::Valid(200),
            Some(100),
            Some("text/html"),
            Some("Home"),
        ),
        children(vec![detailed_node(
            "https://example.com/app.js",
            ResponseStatus::Valid(200),
            Some(42),
            Some("application/javascript"),
            None,
        )]),
    )]);

    // Secrets: a URL that is absent from the hierarchy and has no content length.
    let findings = HashMap::from([(
        detailed_node(
            "https://api.example.com/config",
            ResponseStatus::Valid(403),
            None,
            Some("application/json"),
            Some("Config"),
        ),
        HashSet::from([Secret {
            secret_type: String::from("API Key"),
            data: String::from("secret-value"),
        }]),
    )]);

    // Keep one handle to the buffer; the writer passed in is a clone of it.
    let sink = SharedBuffer::default();
    let rows = output_csv(Box::new(sink.clone()), &hierarchy, &findings).expect("csv output");
    let csv = sink.into_string();

    assert_eq!(rows, 3);
    assert!(csv.contains("URL,Title,Response Code,Content Length,Content Type,Secrets"));
    // The missing content length is expected to be rendered as 0.
    assert!(csv.contains(
        "https://api.example.com/config,Config,403,0,application/json,API Key: secret-value"
    ));
    assert!(csv.contains("https://example.com,Home,200,100,text/html,"));
    // The missing title is expected to be rendered as an empty field.
    assert!(csv.contains("https://example.com/app.js,,200,42,application/javascript,"));
}