use crate::model::{CleanParams, PathPattern, RequestRate, RobotsTxt, Rule};
use crate::parser::line::Line;
use crate::parser::parse_result::ParseResult;
use crate::parser::warning::ParseWarning;
use std::time::Duration;
use url::{Origin, Url};
mod directive;
use self::directive::Directive;
mod group_builder;
pub use self::group_builder::GroupBuilder;
const COMMENT_BEGIN_CHAR: char = '#';
const KV_SEPARATOR: &str = ":";
pub fn parse(origin: Origin, input: &str) -> ParseResult<RobotsTxt> {
let parser = Parser::new(origin);
parser.parse(input)
}
struct Parser {
result: RobotsTxt,
group_builder: GroupBuilder,
warnings: Vec<ParseWarning>,
}
impl Parser {
pub fn new(origin: Origin) -> Parser {
Parser {
result: RobotsTxt::new(origin),
group_builder: GroupBuilder::new(),
warnings: Vec::new(),
}
}
pub fn parse(mut self, input: &str) -> ParseResult<RobotsTxt> {
let input = ignore_bom(input);
for (line_no, line) in input.lines().enumerate() {
let line = Line::new(line, line_no + 1);
match Self::parse_line(&line) {
Ok(Some(line_value)) => {
self.process_line_value(&line, &line_value);
}
Err(warning) => {
self.warnings.push(warning);
}
_ => {}
}
}
self.group_builder.fill_entries(&mut self.result);
ParseResult::new_with_warnings(self.result, self.warnings)
}
fn parse_line<'a>(line: &'a Line) -> Result<Option<Directive<'a>>, ParseWarning> {
let mut kv_part = line.get_line_text();
if let Some(comment_separator_position) = line.get_line_text().find(COMMENT_BEGIN_CHAR) {
kv_part = &kv_part[0..comment_separator_position];
}
if kv_part.is_empty() {
return Ok(None);
}
let separator_index = kv_part
.find(KV_SEPARATOR)
.ok_or_else(|| ParseWarning::invalid_directive_format(line))?;
if separator_index >= kv_part.len() {
return Err(ParseWarning::invalid_directive_format(line));
}
let key = &kv_part[0..separator_index];
let key = key.trim();
if key.is_empty() {
return Err(ParseWarning::directive_key_is_empty(line));
}
let value = &kv_part[separator_index + 1..];
let value = value.trim();
let result = Directive::new(key, value);
Ok(Some(result))
}
fn process_line_value(&mut self, line: &Line, directive: &Directive) {
let key = directive.get_key_lowercase();
match key.as_str() {
"user-agent" => {
self.process_directive_user_agent(line, directive);
}
"allow" => {
self.process_directive_allow(line, directive);
}
"disallow" => {
self.process_directive_disallow(line, directive);
}
"crawl-delay" => {
self.process_directive_crawl_delay(line, directive);
}
"request-rate" => {
self.process_directive_request_rate(line, directive);
}
"sitemap" => {
self.process_directive_sitemap(line, directive);
}
"clean-param" => {
self.process_directive_clean_param(line, directive);
}
_ => {
self.warnings.push(ParseWarning::unsupported_directive_key(line, key));
}
}
}
fn process_directive_user_agent(&mut self, line: &Line, directive: &Directive) {
let user_agent = directive.get_value();
if user_agent.is_empty() {
self.warnings.push(ParseWarning::user_agent_cannot_be_empty(line));
return;
}
self.group_builder.handle_user_agent(user_agent);
}
fn process_directive_allow(&mut self, line: &Line, directive: &Directive) {
if let Some(group) = self.group_builder.get_mut_active_group() {
if directive.get_value() == "" {
} else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
group.push_rule(Rule::new(directive.get_value(), true));
} else {
self.warnings.push(ParseWarning::wrong_path_format(line));
}
} else {
self.warnings.push(ParseWarning::directive_without_user_agent(line));
}
}
fn process_directive_disallow(&mut self, line: &Line, directive: &Directive) {
if let Some(group) = self.group_builder.get_mut_active_group() {
if directive.get_value() == "" {
group.push_rule(Rule::new(PathPattern::all(), true));
} else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
group.push_rule(Rule::new(directive.get_value(), false));
} else {
self.warnings.push(ParseWarning::wrong_path_format(line));
}
} else {
self.warnings.push(ParseWarning::directive_without_user_agent(line));
}
}
fn process_directive_crawl_delay(&mut self, line: &Line, directive: &Directive) {
if let Some(group) = self.group_builder.get_mut_active_group() {
match directive.get_value().parse::<f64>() {
Ok(delay) => {
let delay_seconds = delay.trunc();
let delay_nanoseconds = delay.fract() * 10f64.powi(9);
let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
group.set_crawl_delay(delay);
}
Err(error) => {
self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error));
}
}
} else {
self.warnings.push(ParseWarning::directive_without_user_agent(line));
}
}
fn process_directive_request_rate(&mut self, line: &Line, directive: &Directive) {
if let Some(group) = self.group_builder.get_mut_active_group() {
let numbers: Vec<&str> = directive.get_value().split('/').collect();
if numbers.len() != 2 {
self.warnings.push(ParseWarning::wrong_request_rate_format(line));
return;
}
let requests = match numbers[0].parse::<usize>() {
Ok(requests) => requests,
Err(error) => {
self.warnings.push(ParseWarning::parse_request_rate(line, error));
return;
}
};
let seconds = match numbers[1].parse::<usize>() {
Ok(seconds) => seconds,
Err(error) => {
self.warnings.push(ParseWarning::parse_request_rate(line, error));
return;
}
};
group.set_req_rate(RequestRate { requests, seconds });
} else {
self.warnings.push(ParseWarning::directive_without_user_agent(line));
}
}
fn process_directive_sitemap(&mut self, line: &Line, directive: &Directive) {
match Url::parse(directive.get_value()) {
Ok(sitemap_url) => {
self.result.add_sitemap(sitemap_url);
}
Err(error) => {
self.warnings.push(ParseWarning::parse_url(line, error));
}
}
}
fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) {
let parts: Vec<&str> = directive.get_value().split_whitespace().collect();
if parts.len() >= 3 || parts.is_empty() {
self.warnings.push(ParseWarning::wrong_clean_param_format(line));
return;
}
if parts[0].is_empty() {
self.warnings.push(ParseWarning::wrong_clean_param_format(line));
return;
}
let clean_params_path_pattern;
let clean_params;
if let Some(second_param) = parts.get(1) {
if second_param.is_empty() {
self.warnings.push(ParseWarning::wrong_clean_param_format(line));
return;
}
clean_params_path_pattern = PathPattern::new(parts[0]);
clean_params = *second_param;
} else {
clean_params_path_pattern = PathPattern::all();
clean_params = parts[0];
}
let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params);
if !invalid_clean_params.is_empty() {
self.warnings
.push(ParseWarning::ignored_clean_params(line, invalid_clean_params));
}
self.result
.add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params));
}
fn parse_clean_params(clean_params: &str) -> (Vec<String>, Vec<String>) {
let mut valid = Vec::new();
let mut invalid = Vec::new();
for clean_param in clean_params.split('&') {
if !clean_param.is_empty() {
if Self::is_valid_clean_param(clean_param) {
valid.push(clean_param.into());
} else {
invalid.push(clean_param.into());
}
}
}
(valid, invalid)
}
fn is_valid_clean_param(clean_param: &str) -> bool {
for c in clean_param.chars() {
let is_valid = ('A'..'Z').contains(&c)
|| ('a'..'z').contains(&c)
|| ('0'..'9').contains(&c)
|| c == '.'
|| c == '-'
|| c == '_';
if !is_valid {
return false;
}
}
true
}
}
fn ignore_bom(input: &str) -> &str {
const BOM: &str = "\u{feff}";
input.trim_start_matches(BOM)
}