// robotparser-fork 0.11.0
//
// robots.txt parser for Rust
// Documentation
use robotparser::parser::{parse_robots_txt, WarningReason};
use std::convert::From;
use url::{Host, Origin};

/// Payload-free counterpart of `WarningReason`, used by the tests in this file to compare
/// parser warnings by kind only.
///
/// Each variant mirrors the `WarningReason` variant of the same name; the example inputs
/// quoted below are the ones exercised by the tests in this file.
#[derive(Debug, Clone, PartialEq, Eq)]
enum WarningReasonKind {
    /// Reported for a line consisting only of a backtick.
    InvalidDirectiveFormat,
    /// Reported for a bare `:` line.
    DirectiveKeyIsEmpty,
    /// Reported for an unknown key such as `X-Directive:`.
    UnsupportedDirectiveKey,
    /// Reported for `User-Agent:` with no value.
    UserAgentCannotBeEmpty,
    /// Reported for a `Crawl-Delay` line that is not preceded by a `User-Agent` line.
    DirectiveWithoutUserAgent,
    /// Reported for `Crawl-Delay` values such as an empty string, `-` or `5h9`.
    ParseCrawlDelayError,
    /// Reported for `Request-rate` values such as `1//5` or `1`.
    WrongRequestRateFormat,
    /// Reported for `Request-rate` values such as `a/b`, `a/5`, `5/b` or `1.0/5.0`.
    ParseRequestRate,
    /// Reported for a `Sitemap` value such as `http$$$://python.org/sitemap.xml`.
    ParseUrl,
    /// Reported for an empty `Clean-param` value.
    WrongCleanParamFormat,
    /// Reported for `Clean-param` values such as `?` or `abc$`.
    IgnoredCleanParams,
    /// Reported for an `Allow`/`Disallow` value consisting of a single backslash.
    WrongPathFormat,
}

fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) {
    let host = Host::Domain("python.org".into());
    let origin = Origin::Tuple("http".into(), host, 80);
    let warnings = parse_robots_txt(origin, &input).get_warnings().to_vec();
    assert_eq!(warnings.len(), expected_warnings.len());
    for (warning, expected_warning) in warnings.iter().zip(expected_warnings.iter()) {
        let warning: WarningReasonKind = warning.get_reason().into();
        assert_eq!(expected_warning.clone(), warning);
    }
}

/// A lone backtick, bare or padded with spaces and tabs, must produce exactly one
/// `InvalidDirectiveFormat` warning.
#[test]
fn test_warning_invalid_directive_format() {
    let expected = [WarningReasonKind::InvalidDirectiveFormat];
    validate_warnings("`", &expected);
    validate_warnings(" \t ` \t ", &expected);
}

/// A bare `:` line must produce exactly one `DirectiveKeyIsEmpty` warning.
#[test]
fn test_warning_directive_key_is_empty() {
    validate_warnings(":", &[WarningReasonKind::DirectiveKeyIsEmpty]);
}

/// An `X-Directive` key, bare or padded with spaces and tabs, must produce exactly one
/// `UnsupportedDirectiveKey` warning.
///
/// Note: despite the function name, this covers the *unsupported* key warning.
#[test]
fn test_warning_supported_directive_key() {
    let expected = [WarningReasonKind::UnsupportedDirectiveKey];
    validate_warnings("X-Directive:", &expected);
    validate_warnings("\t  X-Directive\t  :\t  ", &expected);
}

/// A `User-Agent` line without a value must produce `UserAgentCannotBeEmpty`; the same
/// padded line with a `*` value must produce no warning.
#[test]
fn test_warning_user_agent_cannot_be_empty() {
    let empty_agent = [WarningReasonKind::UserAgentCannotBeEmpty];
    validate_warnings("User-Agent:", &empty_agent);
    validate_warnings("\t  User-Agent\t  :\t  ", &empty_agent);
    // Same padded line, but with a value after the colon.
    validate_warnings("\t  User-Agent\t  :\t  *", &[]);
}

/// A `Crawl-Delay` line with no preceding `User-Agent` line must produce
/// `DirectiveWithoutUserAgent`; with a `User-Agent: *` line in front it must not.
#[test]
fn test_warning_directive_without_user_agent() {
    validate_warnings(
        "Crawl-Delay: 5s",
        &[WarningReasonKind::DirectiveWithoutUserAgent],
    );
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}

/// `Crawl-Delay` values that are empty, `-` or `5h9` must produce `ParseCrawlDelayError`;
/// the value `5` must produce no warning.
#[test]
fn test_warning_parse_crawl_delay_error() {
    let rejected = [
        "User-Agent: *\nCrawl-Delay: ",
        "User-Agent: *\nCrawl-Delay: -",
        "User-Agent: *\nCrawl-Delay: 5h9",
    ];
    for &input in rejected.iter() {
        validate_warnings(input, &[WarningReasonKind::ParseCrawlDelayError]);
    }
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}

/// `Request-rate: 1/5` must produce no warning, while `1//5` and `1` must each produce
/// `WrongRequestRateFormat`.
#[test]
fn test_warning_request_rate_format() {
    validate_warnings("User-Agent: *\nRequest-rate: 1/5", &[]);

    let wrong_format = [WarningReasonKind::WrongRequestRateFormat];
    let rejected = [
        "User-Agent: *\nRequest-rate: 1//5",
        "User-Agent: *\nRequest-rate: 1",
    ];
    for &input in rejected.iter() {
        validate_warnings(input, &wrong_format);
    }
}

/// `Request-rate` values `a/b`, `a/5`, `5/b` and `1.0/5.0` must each produce exactly one
/// `ParseRequestRate` warning.
#[test]
fn test_warning_request_rate() {
    let expected = [WarningReasonKind::ParseRequestRate];
    let rejected = [
        "User-Agent: *\nRequest-rate: a/b",
        "User-Agent: *\nRequest-rate: a/5",
        "User-Agent: *\nRequest-rate: 5/b",
        "User-Agent: *\nRequest-rate: 1.0/5.0",
    ];
    for &input in rejected.iter() {
        validate_warnings(input, &expected);
    }
}

/// A `Sitemap` value of `https://python.org/sitemap.xml` must produce no warning, while
/// `http$$$://python.org/sitemap.xml` must produce `ParseUrl`.
#[test]
fn test_warning_parsing_url() {
    validate_warnings(
        "User-Agent: *\nSitemap: https://python.org/sitemap.xml",
        &[],
    );
    validate_warnings(
        "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml",
        &[WarningReasonKind::ParseUrl],
    );
}

/// Checks the warnings produced for a series of `Clean-param` values: `ref ` and `&` give
/// none, an empty value gives `WrongCleanParamFormat`, and `?` and `abc$` give
/// `IgnoredCleanParams`.
#[test]
fn test_wrong_clean_param() {
    // (robots.txt body, warnings the parser is expected to report), run in this order.
    let cases: [(&str, &[WarningReasonKind]); 5] = [
        ("User-Agent: *\nClean-param: ref ", &[]),
        (
            "User-Agent: *\nClean-param: ",
            &[WarningReasonKind::WrongCleanParamFormat],
        ),
        ("User-Agent: *\nClean-param: &", &[]),
        (
            "User-Agent: *\nClean-param: ?",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
        (
            "User-Agent: *\nClean-param: abc$",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
    ];
    for &(input, expected) in cases.iter() {
        validate_warnings(input, expected);
    }
}

/// A single backslash as the `Allow` or `Disallow` value must produce exactly one
/// `WrongPathFormat` warning.
#[test]
fn test_warning_wrong_path_format() {
    let expected = [WarningReasonKind::WrongPathFormat];
    validate_warnings("User-Agent: *\nAllow: \\", &expected);
    validate_warnings("User-Agent: *\nDisallow: \\", &expected);
}

/// Maps a parser `WarningReason` to the `WarningReasonKind` variant of the same name,
/// discarding any data the reason carries.
impl From<&WarningReason> for WarningReasonKind {
    fn from(reason: &WarningReason) -> Self {
        // No catch-all arm: a variant added to `WarningReason` must be mapped here explicitly.
        match reason {
            WarningReason::InvalidDirectiveFormat => Self::InvalidDirectiveFormat,
            WarningReason::DirectiveKeyIsEmpty => Self::DirectiveKeyIsEmpty,
            WarningReason::UnsupportedDirectiveKey { .. } => Self::UnsupportedDirectiveKey,
            WarningReason::UserAgentCannotBeEmpty => Self::UserAgentCannotBeEmpty,
            WarningReason::DirectiveWithoutUserAgent => Self::DirectiveWithoutUserAgent,
            WarningReason::ParseCrawlDelayError { .. } => Self::ParseCrawlDelayError,
            WarningReason::WrongRequestRateFormat => Self::WrongRequestRateFormat,
            WarningReason::ParseRequestRate { .. } => Self::ParseRequestRate,
            WarningReason::ParseUrl { .. } => Self::ParseUrl,
            WarningReason::WrongCleanParamFormat => Self::WrongCleanParamFormat,
            WarningReason::IgnoredCleanParams { .. } => Self::IgnoredCleanParams,
            WarningReason::WrongPathFormat => Self::WrongPathFormat,
        }
    }
}