// bytehaul 0.1.8
//
// Async HTTP download library with resume, multi-connection, rate limiting, and checksum verification.
// Documentation
use std::path::{Component, Path, PathBuf};

use url::Url;

/// Names treated as reserved when sanitizing a filename component.
///
/// `sanitize_filename_component` compares against this list ASCII
/// case-insensitively and appends `_` to a component that matches.
// NOTE(review): current Microsoft documentation also lists COM0 and LPT0 as
// reserved — confirm whether they should be added here.
const WINDOWS_RESERVED_NAMES: &[&str] = &[
    "CON", "PRN", "AUX", "NUL", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6",
    "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7",
    "LPT8", "LPT9",
];

/// Rebuilds `path` from sanitized components, refusing anything that is not
/// a plain relative path.
///
/// Every normal component is passed through `sanitize_filename_component`;
/// components that sanitize to an empty string are dropped, and `.`
/// components are ignored.
///
/// Returns `None` when the path contains a `..`, a root, or a prefix
/// component, or when no component survives sanitization.
pub(crate) fn sanitize_relative_path(path: &Path) -> Option<PathBuf> {
    let mut cleaned = PathBuf::new();

    for component in path.components() {
        let raw = match component {
            Component::Normal(raw) => raw,
            // `.` contributes nothing to the resulting path.
            Component::CurDir => continue,
            // Anything that could leave the target directory rejects the whole path.
            Component::ParentDir | Component::RootDir | Component::Prefix(_) => return None,
        };

        let name = sanitize_filename_component(&raw.to_string_lossy());
        if !name.is_empty() {
            cleaned.push(name);
        }
    }

    Some(cleaned).filter(|candidate| !candidate.as_os_str().is_empty())
}

/// Extracts a sanitized filename from a `Content-Disposition` header value.
///
/// The extended `filename*` parameter (charset-tagged, percent-encoded) is
/// preferred over the plain `filename` parameter regardless of the order in
/// which they appear. A `filename*` value that cannot be decoded, or that
/// sanitizes to an empty string, is skipped so the plain `filename` can still
/// be used.
///
/// Parameter names are matched ASCII case-insensitively and whitespace around
/// `=` is ignored. Semicolons inside a double-quoted value do not end the
/// parameter.
///
/// Returns `None` when no usable filename parameter is present.
pub(crate) fn parse_content_disposition(header_value: &str) -> Option<String> {
    let params: Vec<(&str, &str)> = split_disposition_params(header_value)
        .into_iter()
        .filter_map(|part| part.split_once('='))
        .map(|(name, value)| (name.trim(), value.trim()))
        .collect();

    // First pass: the extended parameter wins whenever it decodes cleanly.
    for (name, value) in &params {
        if !name.eq_ignore_ascii_case("filename*") {
            continue;
        }
        if let Some(decoded) = decode_rfc5987_value(value.trim_matches('"')) {
            let sanitized = sanitize_filename_component(&decoded);
            if !sanitized.is_empty() {
                return Some(sanitized);
            }
        }
    }

    // Second pass: fall back to the plain parameter.
    for (name, value) in &params {
        if !name.eq_ignore_ascii_case("filename") {
            continue;
        }
        let sanitized = sanitize_filename_component(value.trim_matches('"'));
        if !sanitized.is_empty() {
            return Some(sanitized);
        }
    }

    None
}

/// Splits a header value into trimmed `;`-separated parts, keeping semicolons
/// that appear inside double-quoted strings.
///
/// A backslash inside quotes escapes the following character, so `\"` does not
/// close the quoted string. If the quotes turn out to be unbalanced, the input
/// is malformed and the function falls back to splitting on every `;`.
fn split_disposition_params(header_value: &str) -> Vec<&str> {
    let mut parts = Vec::new();
    let mut start = 0usize;
    let mut in_quotes = false;
    let mut escaped = false;

    for (index, ch) in header_value.char_indices() {
        if escaped {
            escaped = false;
            continue;
        }
        match ch {
            '\\' if in_quotes => escaped = true,
            '"' => in_quotes = !in_quotes,
            ';' if !in_quotes => {
                parts.push(header_value[start..index].trim());
                // `;` is a single byte, so the next part starts right after it.
                start = index + 1;
            }
            _ => {}
        }
    }

    if in_quotes {
        // Unterminated quote: do not let it swallow the remaining parameters.
        return header_value.split(';').map(str::trim).collect();
    }

    parts.push(header_value[start..].trim());
    parts
}

/// Derives a sanitized filename from the last non-empty path segment of `url`.
///
/// Returns `None` when the URL does not parse, has no non-empty path segment,
/// the segment is not valid percent-encoded UTF-8, or the decoded name
/// sanitizes to an empty string.
pub(crate) fn filename_from_url(url: &str) -> Option<String> {
    let parsed = Url::parse(url).ok()?;

    // Walk the segments from the end so trailing slashes are skipped.
    let last_segment = parsed
        .path_segments()?
        .rev()
        .find(|candidate| !candidate.is_empty())?;

    let name = sanitize_filename_component(&percent_decode(last_segment)?);
    Some(name).filter(|candidate| !candidate.is_empty())
}

pub(crate) fn detect_filename(content_disposition: Option<&str>, url: &str) -> String {
    content_disposition
        .and_then(parse_content_disposition)
        .or_else(|| filename_from_url(url))
        .filter(|value| !value.is_empty())
        .unwrap_or_else(|| "download".to_string())
}

/// Decodes an extended parameter value of the form
/// `charset'language'percent-encoded-text`.
///
/// Supported charsets (matched ASCII case-insensitively) are `utf-8`,
/// `iso-8859-1`, and `latin1`; the language tag is ignored.
///
/// Returns `None` when the value lacks the two `'` separators, contains a
/// malformed percent escape, names an unsupported charset, or is tagged
/// UTF-8 but does not decode to valid UTF-8.
fn decode_rfc5987_value(value: &str) -> Option<String> {
    let (charset, _language, encoded) = split_rfc5987_parts(value)?;
    let raw = percent_decode_bytes(encoded)?;
    let charset_is = |name: &str| charset.eq_ignore_ascii_case(name);

    if charset_is("utf-8") {
        return String::from_utf8(raw).ok();
    }

    if charset_is("iso-8859-1") || charset_is("latin1") {
        // Each Latin-1 byte maps to the Unicode code point of the same value.
        let mut text = String::with_capacity(raw.len());
        for byte in raw {
            text.push(char::from(byte));
        }
        return Some(text);
    }

    None
}

/// Splits `value` at its first two `'` characters into
/// `(charset, language, encoded)`.
///
/// Any further `'` characters stay in the third part. Returns `None` when
/// fewer than two `'` separators are present.
fn split_rfc5987_parts(value: &str) -> Option<(&str, &str, &str)> {
    // At most three fields: everything after the second quote is one field.
    let mut fields = value.splitn(3, '\'');
    let charset = fields.next()?;
    let language = fields.next()?;
    let encoded = fields.next()?;
    Some((charset, language, encoded))
}

/// Turns an arbitrary string into a single safe filename component.
///
/// Steps, in order:
/// 1. Path separators, the characters `: * ? " < > |`, and control characters
///    are replaced with `_`.
/// 2. Leading whitespace is removed, and every trailing dot or whitespace
///    character is removed, however they are interleaved.
/// 3. If the part before the first `.` is a name in
///    [`WINDOWS_RESERVED_NAMES`] (ASCII case-insensitive), `_` is inserted
///    right after it, so `CON` becomes `CON_` and `nul.txt` becomes
///    `nul_.txt`.
/// 4. Names longer than 255 bytes are shortened with `trim_to_boundary`,
///    keeping the extension when it fits in the budget.
///
/// Returns an empty string when nothing usable remains; callers treat that as
/// "no filename".
fn sanitize_filename_component(value: &str) -> String {
    const MAX_COMPONENT_BYTES: usize = 255;

    let filtered: String = value
        .chars()
        .map(|ch| match ch {
            '/' | '\\' | ':' | '*' | '?' | '"' | '<' | '>' | '|' => '_',
            ch if ch.is_control() => '_',
            ch => ch,
        })
        .collect();

    // Strip trailing dots and whitespace in one pass so "a. ." does not keep
    // a trailing dot.
    let trimmed = filtered
        .trim_start()
        .trim_end_matches(|ch: char| ch == '.' || ch.is_whitespace());
    if trimmed.is_empty() {
        return String::new();
    }

    let mut sanitized = trimmed.to_string();

    // A reserved device name stays reserved when an extension follows it, so
    // test only the part before the first dot.
    let stem_len = sanitized.find('.').unwrap_or(sanitized.len());
    let stem_is_reserved = WINDOWS_RESERVED_NAMES
        .iter()
        .any(|reserved| sanitized[..stem_len].eq_ignore_ascii_case(reserved));
    if stem_is_reserved {
        sanitized.insert(stem_len, '_');
    }

    if sanitized.len() > MAX_COMPONENT_BYTES {
        sanitized = match sanitized.rsplit_once('.') {
            // Keep the extension only when it leaves room for at least one
            // byte of stem; otherwise fall through to plain truncation.
            Some((stem, extension))
                if !stem.is_empty()
                    && !extension.is_empty()
                    && extension.len() + 1 < MAX_COMPONENT_BYTES =>
            {
                let keep = MAX_COMPONENT_BYTES - extension.len() - 1;
                format!("{}.{}", trim_to_boundary(stem, keep), extension)
            }
            _ => trim_to_boundary(&sanitized, MAX_COMPONENT_BYTES),
        };
    }

    sanitized
}

/// Returns the longest prefix of `value` that is at most `max_len` bytes long
/// and ends on a UTF-8 character boundary.
///
/// The limit is measured in bytes, matching callers that compare against
/// `str::len()`; a multi-byte character that would straddle the limit is
/// dropped entirely.
fn trim_to_boundary(value: &str, max_len: usize) -> String {
    if value.len() <= max_len {
        return value.to_string();
    }

    // Step back to the nearest boundary; index 0 is always a boundary, so the
    // loop terminates.
    let mut end = max_len;
    while !value.is_char_boundary(end) {
        end -= 1;
    }
    value[..end].to_string()
}

/// Percent-decodes `value` and interprets the result as UTF-8.
///
/// Returns `None` when an escape is malformed or the decoded bytes are not
/// valid UTF-8.
fn percent_decode(value: &str) -> Option<String> {
    percent_decode_bytes(value).and_then(|raw| String::from_utf8(raw).ok())
}

/// Decodes `%XX` escapes in `value` into raw bytes; all other bytes are
/// copied through unchanged.
///
/// Returns `None` when a `%` is not followed by exactly two hexadecimal
/// digits (either case). Sign characters are not digits, so `%+1` is
/// rejected.
fn percent_decode_bytes(value: &str) -> Option<Vec<u8>> {
    /// Numeric value of a single ASCII hex digit, or `None` for anything else.
    fn hex_value(digit: u8) -> Option<u8> {
        match digit {
            b'0'..=b'9' => Some(digit - b'0'),
            b'a'..=b'f' => Some(digit - b'a' + 10),
            b'A'..=b'F' => Some(digit - b'A' + 10),
            _ => None,
        }
    }

    let mut decoded = Vec::with_capacity(value.len());
    let mut remaining = value.as_bytes();

    while let Some((&byte, tail)) = remaining.split_first() {
        if byte == b'%' {
            // An escape needs two more bytes; a truncated one is an error.
            let (high, low, after) = match tail {
                [high, low, after @ ..] => (*high, *low, after),
                _ => return None,
            };
            decoded.push((hex_value(high)? << 4) | hex_value(low)?);
            remaining = after;
        } else {
            decoded.push(byte);
            remaining = tail;
        }
    }

    Some(decoded)
}

// Unit tests for the filename detection and path sanitization helpers.
#[cfg(test)]
mod tests {
    use std::path::{Path, PathBuf};

    use super::{detect_filename, filename_from_url, parse_content_disposition, sanitize_relative_path};

    // `filename*` wins even though the plain `filename` appears first.
    #[test]
    fn test_parse_content_disposition_prefers_filename_star() {
        let header = "attachment; filename=plain.txt; filename*=UTF-8''hello%20world.txt";
        assert_eq!(
            parse_content_disposition(header).as_deref(),
            Some("hello world.txt")
        );
    }

    // Backslash, colon, and question mark are each replaced with `_`.
    #[test]
    fn test_parse_content_disposition_sanitizes_filename() {
        let header = "attachment; filename=..\\bad:name?.txt";
        assert_eq!(parse_content_disposition(header).as_deref(), Some(".._bad_name_.txt"));
    }

    // The byte 0xE9 is decoded as Latin-1 into U+00E9.
    #[test]
    fn test_parse_content_disposition_supports_latin1_filename_star() {
        let header = "attachment; filename*=ISO-8859-1''caf%E9.txt";
        assert_eq!(parse_content_disposition(header).as_deref(), Some("café.txt"));
    }

    // A malformed escape in `filename*` must not block the plain `filename`.
    #[test]
    fn test_parse_content_disposition_falls_back_when_filename_star_is_invalid() {
        let header = "attachment; filename*=UTF-8''bad%ZZname.txt; filename=fallback.txt";
        assert_eq!(parse_content_disposition(header).as_deref(), Some("fallback.txt"));
    }

    // With no plain `filename` to fall back on, the result is `None`.
    #[test]
    fn test_parse_content_disposition_rejects_invalid_only_filename_star() {
        let header = "attachment; filename*=UTF-8''bad%ZZname.txt";
        assert_eq!(parse_content_disposition(header), None);
    }

    // Over-long names are cut to 255 bytes with the extension kept, and a
    // reserved name gets a `_` suffix.
    #[test]
    fn test_parse_content_disposition_trims_reserved_and_long_names() {
        let long_name = format!("{}.txt", "a".repeat(400));
        let header = format!("attachment; filename=\"{long_name}\"");
        let parsed = parse_content_disposition(&header).unwrap();
        assert_eq!(parsed.len(), 255);
        assert!(parsed.ends_with(".txt"));

        let reserved = parse_content_disposition("attachment; filename=CON ").unwrap();
        assert_eq!(reserved, "CON_");
    }

    // The last path segment is percent-decoded before being returned.
    #[test]
    fn test_filename_from_url_decodes_last_segment() {
        assert_eq!(
            filename_from_url("https://example.com/files/report%20final.zip").as_deref(),
            Some("report final.zip")
        );
    }

    // Both an unparseable URL and a malformed escape yield `None`.
    #[test]
    fn test_filename_from_url_rejects_invalid_inputs() {
        assert_eq!(filename_from_url("not a url"), None);
        assert_eq!(filename_from_url("https://example.com/files/bad%ZZname.zip"), None);
    }

    // A segment that decodes to a single space sanitizes to nothing.
    #[test]
    fn test_filename_from_url_rejects_empty_sanitized_segment() {
        assert_eq!(filename_from_url("https://example.com/files/%20"), None);
    }

    // An unusable header falls through to the URL-derived name.
    #[test]
    fn test_detect_filename_falls_back_to_url_when_header_is_invalid() {
        assert_eq!(
            detect_filename(
                Some("attachment; filename*=UTF-8''bad%ZZname.txt"),
                "https://example.com/files/url-name.bin"
            ),
            "url-name.bin"
        );
    }

    // No header and no path segment: the literal default is used.
    #[test]
    fn test_detect_filename_falls_back_to_default() {
        assert_eq!(detect_filename(None, "https://example.com/"), "download");
    }

    // A `..` component rejects the whole path.
    #[test]
    fn test_sanitize_relative_path_rejects_parent_dir() {
        assert_eq!(sanitize_relative_path(Path::new("../bad.txt")), None);
    }

    // Absolute paths are rejected, and a lone `.` leaves nothing to return.
    #[test]
    fn test_sanitize_relative_path_rejects_absolute_and_empty_paths() {
        assert_eq!(sanitize_relative_path(Path::new("/bad.txt")), None);
        assert_eq!(sanitize_relative_path(Path::new(".")), None);
    }

    // Already-clean nested paths pass through unchanged.
    #[test]
    fn test_sanitize_relative_path_allows_nested_relative_paths() {
        assert_eq!(
            sanitize_relative_path(Path::new("nested/output.txt")),
            Some(PathBuf::from("nested/output.txt"))
        );
    }

    // Each component is sanitized on its own: the leading `.` is dropped,
    // `CON .` becomes `CON_`, and `?` becomes `_`.
    #[test]
    fn test_sanitize_relative_path_normalizes_components() {
        assert_eq!(
            sanitize_relative_path(Path::new("./CON ./child?.txt")),
            Some(PathBuf::from("CON_/child_.txt"))
        );
    }

    // A whitespace-only component sanitizes to empty and is skipped.
    #[test]
    fn test_sanitize_relative_path_drops_empty_components() {
        assert_eq!(
            sanitize_relative_path(Path::new("nested/ /file.txt")),
            Some(PathBuf::from("nested/file.txt"))
        );
    }

    #[test]
    fn test_unsupported_charset_in_filename_star() {
        // Triggers the `else { None }` branch in decode_rfc5987_value
        let header = "attachment; filename*=WINDOWS-1252''caf%E9.txt; filename=fallback.txt";
        assert_eq!(parse_content_disposition(header).as_deref(), Some("fallback.txt"));
    }

    // Same unsupported charset, but with nothing to fall back on.
    #[test]
    fn test_unsupported_charset_only_returns_none() {
        let header = "attachment; filename*=WINDOWS-1252''caf%E9.txt";
        assert_eq!(parse_content_disposition(header), None);
    }

    #[test]
    fn test_filename_from_url_with_trailing_incomplete_percent() {
        // Triggers None from percent_decode_bytes when index + 2 >= bytes.len()
        assert_eq!(filename_from_url("https://example.com/file%2"), None);
    }

    // Without an extension the whole name is cut to 255 bytes.
    #[test]
    fn test_long_name_without_extension_is_trimmed() {
        let long_name = "a".repeat(300);
        let header = format!("attachment; filename=\"{long_name}\"");
        let parsed = parse_content_disposition(&header).unwrap();
        assert_eq!(parsed.len(), 255);
    }
}