use robotparser::parser::{parse_robots_txt, WarningReason};
use std::convert::From;
use url::{Host, Origin};
/// Payload-free mirror of `robotparser::parser::WarningReason`.
///
/// Each variant corresponds to the `WarningReason` variant of the same name;
/// any data carried by the original variant is dropped during conversion, so
/// tests can compare warning categories directly with `assert_eq!`.
#[derive(Clone, Debug, PartialEq, Eq)]
enum WarningReasonKind {
    InvalidDirectiveFormat,
    DirectiveKeyIsEmpty,
    UnsupportedDirectiveKey,
    UserAgentCannotBeEmpty,
    DirectiveWithoutUserAgent,
    ParseCrawlDelayError,
    WrongRequestRateFormat,
    ParseRequestRate,
    ParseUrl,
    WrongCleanParamFormat,
    IgnoredCleanParams,
    WrongPathFormat,
}
fn validate_warnings(input: &str, expected_warnings: &[WarningReasonKind]) {
let host = Host::Domain("python.org".into());
let origin = Origin::Tuple("http".into(), host, 80);
let warnings = parse_robots_txt(origin, &input).get_warnings().to_vec();
assert_eq!(warnings.len(), expected_warnings.len());
for (warning, expected_warning) in warnings.iter().zip(expected_warnings.iter()) {
let warning: WarningReasonKind = warning.get_reason().into();
assert_eq!(expected_warning.clone(), warning);
}
}
/// A lone backtick, bare or padded with spaces and tabs, must yield a single
/// `InvalidDirectiveFormat` warning.
#[test]
fn test_warning_invalid_directive_format() {
    let expected = [WarningReasonKind::InvalidDirectiveFormat];
    for input in &["`", " \t ` \t "] {
        validate_warnings(input, &expected);
    }
}
/// A line consisting only of the `:` separator must yield a single
/// `DirectiveKeyIsEmpty` warning.
#[test]
fn test_warning_directive_key_is_empty() {
    validate_warnings(":", &[WarningReasonKind::DirectiveKeyIsEmpty]);
}
/// The key `X-Directive`, bare or padded with tabs and spaces, must yield a
/// single `UnsupportedDirectiveKey` warning.
///
/// NOTE(review): despite the "supported" in its name, this test covers the
/// *unsupported* key warning; the name is kept so existing test filters still match.
#[test]
fn test_warning_supported_directive_key() {
    let expected = [WarningReasonKind::UnsupportedDirectiveKey];
    for input in &["X-Directive:", "\t X-Directive\t :\t "] {
        validate_warnings(input, &expected);
    }
}
/// A `User-Agent` line with no value (even when padded with whitespace) must
/// yield `UserAgentCannotBeEmpty`; the same line with `*` must yield no warnings.
#[test]
fn test_warning_user_agent_cannot_be_empty() {
    // (input, expected warning kinds), checked in order.
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent:", &[WarningReasonKind::UserAgentCannotBeEmpty]),
        ("\t User-Agent\t :\t ", &[WarningReasonKind::UserAgentCannotBeEmpty]),
        ("\t User-Agent\t :\t *", &[]),
    ];
    for &(input, expected) in cases {
        validate_warnings(input, expected);
    }
}
/// A `Crawl-Delay` line with no preceding `User-Agent` line must yield
/// `DirectiveWithoutUserAgent`; placed after `User-Agent: *` it must yield nothing.
#[test]
fn test_warning_directive_without_user_agent() {
    // No `User-Agent` line precedes the directive.
    validate_warnings(
        "Crawl-Delay: 5s",
        &[WarningReasonKind::DirectiveWithoutUserAgent],
    );
    // A directive following `User-Agent: *` is accepted without warnings.
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}
/// `Crawl-Delay` values that are empty, `-`, or `5h9` must each yield a single
/// `ParseCrawlDelayError`; the value `5` must yield no warnings.
#[test]
fn test_warning_parse_crawl_delay_error() {
    let rejected = [
        "User-Agent: *\nCrawl-Delay: ",
        "User-Agent: *\nCrawl-Delay: -",
        "User-Agent: *\nCrawl-Delay: 5h9",
    ];
    for input in rejected.iter() {
        validate_warnings(input, &[WarningReasonKind::ParseCrawlDelayError]);
    }
    // Control case: a plain integer delay is accepted.
    validate_warnings("User-Agent: *\nCrawl-Delay: 5", &[]);
}
/// `Request-rate: 1/5` must yield no warnings, while `1//5` and `1` must each
/// yield a single `WrongRequestRateFormat` warning.
#[test]
fn test_warning_request_rate_format() {
    // (input, expected warning kinds), checked in order.
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent: *\nRequest-rate: 1/5", &[]),
        (
            "User-Agent: *\nRequest-rate: 1//5",
            &[WarningReasonKind::WrongRequestRateFormat],
        ),
        (
            "User-Agent: *\nRequest-rate: 1",
            &[WarningReasonKind::WrongRequestRateFormat],
        ),
    ];
    for &(input, expected) in cases {
        validate_warnings(input, expected);
    }
}
/// `Request-rate` values `a/b`, `a/5`, `5/b` and `1.0/5.0` must each yield a
/// single `ParseRequestRate` warning.
#[test]
fn test_warning_request_rate() {
    let expected = [WarningReasonKind::ParseRequestRate];
    let inputs = [
        "User-Agent: *\nRequest-rate: a/b",
        "User-Agent: *\nRequest-rate: a/5",
        "User-Agent: *\nRequest-rate: 5/b",
        "User-Agent: *\nRequest-rate: 1.0/5.0",
    ];
    for input in inputs.iter() {
        validate_warnings(input, &expected);
    }
}
/// A `Sitemap` line with an `https://` URL must yield no warnings; one whose
/// scheme is written `http$$$` must yield a single `ParseUrl` warning.
#[test]
fn test_warning_parsing_url() {
    validate_warnings(
        "User-Agent: *\nSitemap: https://python.org/sitemap.xml",
        &[],
    );
    validate_warnings(
        "User-Agent: *\nSitemap: http$$$://python.org/sitemap.xml",
        &[WarningReasonKind::ParseUrl],
    );
}
/// Checks the warnings produced for several `Clean-param` values: `ref ` and
/// `&` yield none, an empty value yields `WrongCleanParamFormat`, and `?` and
/// `abc$` each yield `IgnoredCleanParams`.
#[test]
fn test_wrong_clean_param() {
    // (input, expected warning kinds), checked in order.
    let cases: &[(&str, &[WarningReasonKind])] = &[
        ("User-Agent: *\nClean-param: ref ", &[]),
        (
            "User-Agent: *\nClean-param: ",
            &[WarningReasonKind::WrongCleanParamFormat],
        ),
        ("User-Agent: *\nClean-param: &", &[]),
        (
            "User-Agent: *\nClean-param: ?",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
        (
            "User-Agent: *\nClean-param: abc$",
            &[WarningReasonKind::IgnoredCleanParams],
        ),
    ];
    for &(input, expected) in cases {
        validate_warnings(input, expected);
    }
}
/// A single backslash as the path of either an `Allow` or a `Disallow` line
/// must yield a single `WrongPathFormat` warning.
#[test]
fn test_warning_wrong_path_format() {
    let expected = [WarningReasonKind::WrongPathFormat];
    for input in &["User-Agent: *\nAllow: \\", "User-Agent: *\nDisallow: \\"] {
        validate_warnings(input, &expected);
    }
}
/// Reduces a parser `WarningReason` to its payload-free `WarningReasonKind`.
impl From<&WarningReason> for WarningReasonKind {
    /// Maps each `WarningReason` variant to the kind of the same name,
    /// ignoring any data the variant carries.
    fn from(reason: &WarningReason) -> Self {
        // No catch-all arm: a variant added to `WarningReason` should fail to
        // compile here until a matching kind is introduced.
        match reason {
            WarningReason::InvalidDirectiveFormat => Self::InvalidDirectiveFormat,
            WarningReason::DirectiveKeyIsEmpty => Self::DirectiveKeyIsEmpty,
            WarningReason::UnsupportedDirectiveKey { .. } => Self::UnsupportedDirectiveKey,
            WarningReason::UserAgentCannotBeEmpty => Self::UserAgentCannotBeEmpty,
            WarningReason::DirectiveWithoutUserAgent => Self::DirectiveWithoutUserAgent,
            WarningReason::ParseCrawlDelayError { .. } => Self::ParseCrawlDelayError,
            WarningReason::WrongRequestRateFormat => Self::WrongRequestRateFormat,
            WarningReason::ParseRequestRate { .. } => Self::ParseRequestRate,
            WarningReason::ParseUrl { .. } => Self::ParseUrl,
            WarningReason::WrongCleanParamFormat => Self::WrongCleanParamFormat,
            WarningReason::IgnoredCleanParams { .. } => Self::IgnoredCleanParams,
            WarningReason::WrongPathFormat => Self::WrongPathFormat,
        }
    }
}