use crate::types::{Group, RequestRate, Rule, RuleKind, RobotsPolicy, FetchStatus};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::{debug, warn};
pub const MAX_ROBOTS_SIZE: usize = 512 * 1024;
const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
/// Current wall-clock time as milliseconds since the Unix epoch.
///
/// If the system clock reports a time before the epoch, this falls back
/// to 0 rather than panicking.
fn now_millis() -> u64 {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH);
    since_epoch.map(|d| d.as_millis() as u64).unwrap_or(0)
}
/// Strips a leading UTF-8 byte-order mark from `content`, if present.
///
/// Uses `str::strip_prefix` with the BOM code point instead of manual
/// byte inspection and `&content[3..]` slicing — same behavior, but no
/// hand-maintained offset and no way to slice mid-character.
fn strip_bom(content: &str) -> &str {
    content.strip_prefix('\u{FEFF}').unwrap_or(content)
}
/// Parser that turns robots.txt text into a `RobotsPolicy`.
pub struct RobotsParser {
// Maximum number of input bytes to parse; longer input is truncated.
max_size: usize,
}
impl Default for RobotsParser {
/// Delegates to [`RobotsParser::new`], i.e. the default size limit.
fn default() -> Self {
Self::new()
}
}
impl RobotsParser {
pub fn new() -> Self {
Self {
max_size: MAX_ROBOTS_SIZE,
}
}
pub fn with_max_size(max_size: usize) -> Self {
Self { max_size }
}
/// Parses robots.txt text into a `RobotsPolicy` valid for `ttl`.
///
/// The input is BOM-stripped, truncated to at most `self.max_size` bytes
/// (on a UTF-8 character boundary), then scanned line by line after
/// comment stripping. Consecutive `User-agent` lines accumulate into one
/// group until the first Allow/Disallow rule; `Crawl-delay` and
/// `Request-rate` attach to the current group; `Sitemap` URLs are
/// collected globally; unknown directives are logged and ignored.
pub fn parse(&self, content: &str, ttl: Duration) -> RobotsPolicy {
    let now = now_millis();
    let content = strip_bom(content);
    let content = if content.len() > self.max_size {
        warn!(
            "robots.txt exceeds size limit ({} > {}), truncating",
            content.len(),
            self.max_size
        );
        // BUG FIX: slicing at an arbitrary byte offset panics if the
        // offset lands inside a multi-byte UTF-8 sequence. Back off to
        // the nearest preceding character boundary before slicing.
        let mut end = self.max_size;
        while end > 0 && !content.is_char_boundary(end) {
            end -= 1;
        }
        &content[..end]
    } else {
        content
    };
    let content_size = content.len();
    let mut groups: Vec<Group> = Vec::new();
    let mut sitemaps: Vec<String> = Vec::new();
    let mut current_group: Option<Group> = None;
    for line in content.lines() {
        let line = self.clean_line(line);
        if line.is_empty() {
            continue;
        }
        if let Some((directive, value)) = self.parse_directive(&line) {
            // Directive names are matched case-insensitively.
            match directive.to_lowercase().as_str() {
                "user-agent" => {
                    if let Some(ref mut group) = current_group {
                        if group.rules.is_empty() {
                            // Consecutive User-agent lines share one group.
                            group.user_agents.push(value.to_string());
                        } else {
                            // A User-agent line after rules closes the
                            // previous group and starts a new one.
                            // NOTE(review): only rules close a group here;
                            // a preceding Crawl-delay/Request-rate alone
                            // does not — confirm this matches the intended
                            // RFC 9309 group-boundary behavior.
                            groups.push(current_group.take().unwrap());
                            current_group = Some(Self::new_group(value));
                        }
                    } else {
                        current_group = Some(Self::new_group(value));
                    }
                }
                "allow" => {
                    // Rules outside any User-agent group are ignored.
                    if let Some(ref mut group) = current_group {
                        let pattern = self.normalize_pattern(value);
                        // An empty pattern (e.g. "Allow:") means no rule.
                        if !pattern.is_empty() {
                            group.rules.push(Rule::new(RuleKind::Allow, pattern));
                        }
                    }
                }
                "disallow" => {
                    if let Some(ref mut group) = current_group {
                        let pattern = self.normalize_pattern(value);
                        // An empty pattern (e.g. "Disallow:") means allow all.
                        if !pattern.is_empty() {
                            group.rules.push(Rule::new(RuleKind::Disallow, pattern));
                        }
                    }
                }
                "crawl-delay" => {
                    if let Some(ref mut group) = current_group {
                        // Fractional delays are allowed; negatives rejected.
                        if let Ok(delay) = value.trim().parse::<f64>() {
                            if delay >= 0.0 {
                                group.crawl_delay = Some(delay);
                            }
                        }
                    }
                }
                "request-rate" => {
                    if let Some(ref mut group) = current_group {
                        if let Some(rate) = Self::parse_request_rate(value) {
                            group.request_rate = Some(rate);
                        }
                    }
                }
                "sitemap" => {
                    // Sitemaps are global, not tied to a user-agent group.
                    let sitemap_url = value.trim().to_string();
                    if !sitemap_url.is_empty() {
                        sitemaps.push(sitemap_url);
                    }
                }
                _ => {
                    debug!("Ignoring unknown robots.txt directive: {}", directive);
                }
            }
        }
    }
    // Flush the trailing group; discard it if it never named a user-agent.
    if let Some(group) = current_group {
        if !group.user_agents.is_empty() {
            groups.push(group);
        }
    }
    RobotsPolicy {
        fetched_at_ms: now,
        expires_at_ms: now + ttl.as_millis() as u64,
        fetch_status: FetchStatus::Success,
        groups,
        sitemaps,
        content_size,
        etag: None,
        last_modified: None,
    }
}
/// Builds an empty rule group for a single user-agent value.
fn new_group(agent: &str) -> Group {
    Group {
        user_agents: vec![agent.to_string()],
        rules: Vec::new(),
        crawl_delay: None,
        request_rate: None,
    }
}
/// Parses a `Request-rate` value of the form `"<requests>/<seconds>"`,
/// e.g. `"1/10"`. Returns `None` for malformed input or zero components.
///
/// Uses `str::split_once` instead of `split`+`collect` — no intermediate
/// `Vec` allocation, same accept/reject behavior (extra `/` separators
/// make the seconds component unparsable, as before).
fn parse_request_rate(value: &str) -> Option<RequestRate> {
    let (req, sec) = value.trim().split_once('/')?;
    let requests = req.trim().parse::<u32>().ok()?;
    let seconds = sec.trim().parse::<u32>().ok()?;
    (requests > 0 && seconds > 0).then(|| RequestRate::new(requests, seconds))
}
/// Drops an inline `#` comment (everything from the first `#` onward)
/// and trims surrounding whitespace from one robots.txt line.
fn clean_line(&self, line: &str) -> String {
    let code = line.split('#').next().unwrap_or(line);
    code.trim().to_string()
}
/// Splits a line at its first `:` into a `(directive, value)` pair,
/// trimming both sides. Returns `None` when there is no colon or the
/// directive name is empty; an empty value is allowed.
fn parse_directive<'a>(&self, line: &'a str) -> Option<(&'a str, &'a str)> {
    let (raw_name, raw_value) = line.split_once(':')?;
    let directive = raw_name.trim();
    if directive.is_empty() {
        return None;
    }
    Some((directive, raw_value.trim()))
}
/// Trims a rule pattern and ensures it is rooted: anything that does not
/// already begin with `/` or a `*` wildcard gets a leading `/`. An empty
/// (or all-whitespace) pattern stays empty.
fn normalize_pattern(&self, pattern: &str) -> String {
    let pattern = pattern.trim();
    match pattern.chars().next() {
        None => String::new(),
        Some('/') | Some('*') => pattern.to_string(),
        Some(_) => format!("/{}", pattern),
    }
}
}
/// Percent-encoding normalization helpers for robots.txt path matching.
pub mod encoding {
    /// RFC 3986 "unreserved" characters, which are safe to percent-decode.
    fn is_unreserved(c: char) -> bool {
        c.is_ascii_alphanumeric() || matches!(c, '-' | '.' | '_' | '~')
    }

    /// Decodes percent-escapes of unreserved characters (e.g. `%2D` -> `-`)
    /// while leaving every other escape untouched. Malformed or truncated
    /// escapes are copied through verbatim.
    pub fn normalize_path_for_matching(path: &str) -> String {
        let mut out = String::with_capacity(path.len());
        let mut it = path.chars();
        while let Some(ch) = it.next() {
            if ch != '%' {
                out.push(ch);
                continue;
            }
            // Grab up to two hex digits following the '%'.
            let hex: String = it.by_ref().take(2).collect();
            let decoded = if hex.len() == 2 {
                u8::from_str_radix(&hex, 16)
                    .ok()
                    .map(|byte| byte as char)
                    .filter(|c| is_unreserved(*c))
            } else {
                None
            };
            match decoded {
                Some(c) => out.push(c),
                None => {
                    // Keep the escape exactly as written.
                    out.push('%');
                    out.push_str(&hex);
                }
            }
        }
        out
    }

    /// Fully normalizes a path for comparison: decodes unreserved escapes,
    /// then uppercases the hex digits of the escapes that remain.
    pub fn normalize_for_comparison(s: &str) -> String {
        uppercase_percent_encoding(&normalize_path_for_matching(s))
    }

    /// Uppercases the (up to) two hex digits after each `%` so that e.g.
    /// `%2f` and `%2F` compare equal.
    fn uppercase_percent_encoding(s: &str) -> String {
        let mut out = String::with_capacity(s.len());
        let mut it = s.chars();
        while let Some(ch) = it.next() {
            out.push(ch);
            if ch == '%' {
                for hex_digit in it.by_ref().take(2) {
                    out.push(hex_digit.to_ascii_uppercase());
                }
            }
        }
        out
    }
}
// Unit tests covering parsing, grouping, comments, sitemaps, pattern
// normalization, percent-encoding helpers, BOM handling, and rate limits.
#[cfg(test)]
mod tests {
use super::*;
// A single group with one allow, one disallow, and a crawl delay.
#[test]
fn test_parse_simple() {
let parser = RobotsParser::new();
let content = r#"
User-agent: *
Disallow: /private/
Allow: /private/public/
Crawl-delay: 2
"#;
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.groups.len(), 1);
assert_eq!(policy.groups[0].user_agents, vec!["*"]);
assert_eq!(policy.groups[0].rules.len(), 2);
assert_eq!(policy.groups[0].crawl_delay, Some(2.0));
}
// Consecutive User-agent lines merge into one group; a User-agent line
// after rules starts a second group.
#[test]
fn test_parse_multiple_groups() {
let parser = RobotsParser::new();
let content = r#"
User-agent: Googlebot
User-agent: Bingbot
Disallow: /search
User-agent: *
Disallow: /admin
"#;
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.groups.len(), 2);
assert_eq!(policy.groups[0].user_agents, vec!["Googlebot", "Bingbot"]);
assert_eq!(policy.groups[1].user_agents, vec!["*"]);
}
// Sitemap directives are collected globally, independent of groups.
#[test]
fn test_parse_sitemaps() {
let parser = RobotsParser::new();
let content = r#"
User-agent: *
Disallow:
Sitemap: https://example.com/sitemap.xml
Sitemap: https://example.com/sitemap2.xml
"#;
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.sitemaps.len(), 2);
assert_eq!(policy.sitemaps[0], "https://example.com/sitemap.xml");
}
// Whole-line and inline '#' comments are stripped before parsing.
#[test]
fn test_parse_comments() {
let parser = RobotsParser::new();
let content = r#"
# This is a comment
User-agent: * # inline comment
Disallow: /private # another comment
"#;
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.groups.len(), 1);
assert_eq!(policy.groups[0].rules.len(), 1);
}
// "Disallow:" with no value produces no rule (allow everything).
#[test]
fn test_parse_empty_disallow() {
let parser = RobotsParser::new();
let content = r#"
User-agent: *
Disallow:
"#;
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.groups[0].rules.len(), 0);
}
// Bare patterns are rooted with '/'; '/'- and '*'-prefixed ones pass through.
#[test]
fn test_normalize_pattern() {
let parser = RobotsParser::new();
assert_eq!(parser.normalize_pattern("/path"), "/path");
assert_eq!(parser.normalize_pattern("path"), "/path");
assert_eq!(parser.normalize_pattern("*"), "*");
assert_eq!(parser.normalize_pattern(""), "");
}
// Unreserved escapes decode (%2D -> '-'); reserved ones (%2F) stay encoded.
#[test]
fn test_encoding_normalize() {
use encoding::normalize_path_for_matching;
assert_eq!(normalize_path_for_matching("/path%2Dtest"), "/path-test");
assert_eq!(normalize_path_for_matching("/path%2Ftest"), "/path%2Ftest");
}
// A leading UTF-8 BOM must not prevent the first directive from parsing.
#[test]
fn test_bom_stripping() {
let parser = RobotsParser::new();
let content = "\u{FEFF}User-agent: *\nDisallow: /private";
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.groups.len(), 1);
assert_eq!(policy.groups[0].user_agents, vec!["*"]);
}
// "Request-rate: 1/10" yields 1 request per 10 seconds (10s delay).
#[test]
fn test_request_rate_parsing() {
let parser = RobotsParser::new();
let content = r#"
User-agent: *
Disallow: /private
Request-rate: 1/10
"#;
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.groups.len(), 1);
let rate = policy.groups[0].request_rate.unwrap();
assert_eq!(rate.requests, 1);
assert_eq!(rate.seconds, 10);
assert!((rate.delay_seconds() - 10.0).abs() < 0.001);
}
// Fractional crawl delays parse as f64.
#[test]
fn test_crawl_delay_float() {
let parser = RobotsParser::new();
let content = r#"
User-agent: *
Crawl-delay: 0.5
"#;
let policy = parser.parse(content, Duration::from_secs(3600));
assert_eq!(policy.groups[0].crawl_delay, Some(0.5));
}
}