use crate::RobotsParseHandler;
/// The kind of directive a robots.txt key was classified as.
///
/// Discriminants follow declaration order (`UserAgent` = 0 through
/// `Disallow` = 3); `Unknown` is pinned to 128.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum ParseKeyType {
    /// A `user-agent` key.
    UserAgent,
    /// A `sitemap` key.
    Sitemap,
    /// An `allow` key.
    Allow,
    /// A `disallow` key.
    Disallow,
    /// Any key that matched none of the recognized directives.
    Unknown = 128,
}
/// A robots.txt directive key together with its classification.
pub struct ParsedRobotsKey {
    /// Original key text, kept for keys that were not recognized.
    key_text: String,
    /// When `true`, common misspellings of a directive are accepted as well.
    allow_typo: bool,
    /// Classification assigned to the key.
    type_: ParseKeyType,
}
impl Default for ParsedRobotsKey {
    /// Builds an unclassified key: type `Unknown`, no stored text, and typo
    /// matching switched on.
    fn default() -> Self {
        let key_text = String::new();
        Self {
            key_text,
            allow_typo: true,
            type_: ParseKeyType::Unknown,
        }
    }
}
impl ParsedRobotsKey {
    /// Classifies `key` and stores the result in this object.
    ///
    /// Matching is case-insensitive and by prefix, so any key that merely
    /// starts with a directive name (for example `allowed`) is classified as
    /// that directive. Keys that match nothing become
    /// [`ParseKeyType::Unknown`] and their text is retained for
    /// [`get_unknown_text`](Self::get_unknown_text).
    pub fn parse(&mut self, key: &str) {
        // Drop text left over from an earlier call so that a reused object
        // never reports a stale unknown key after parsing a recognized one.
        self.key_text.clear();

        let kind = if self.validate_key(key, &["user-agent"], Some(&["useragent", "user agent"])) {
            ParseKeyType::UserAgent
        } else if self.validate_key(key, &["allow"], None) {
            ParseKeyType::Allow
        } else if self.validate_key(
            key,
            &["disallow"],
            Some(&["dissallow", "dissalow", "disalow", "diasllow", "disallaw"]),
        ) {
            ParseKeyType::Disallow
        } else if self.validate_key(key, &["sitemap", "site-map"], None) {
            ParseKeyType::Sitemap
        } else {
            self.key_text.push_str(key);
            ParseKeyType::Unknown
        };
        self.type_ = kind;
    }

    /// Returns the classification assigned by the last call to `parse`.
    pub fn get_type(&self) -> &ParseKeyType {
        &self.type_
    }

    /// Returns a copy of the key text stored for an unrecognized key; the
    /// string is empty when the last parsed key was recognized.
    pub fn get_unknown_text(&self) -> String {
        self.key_text.clone()
    }

    /// Reports whether `key`, compared case-insensitively, starts with one of
    /// `targets`, or with one of `typo_targets` when typo matching is enabled.
    ///
    /// Every entry of `targets` and `typo_targets` must already be lowercase.
    fn validate_key(&self, key: &str, targets: &[&str], typo_targets: Option<&[&str]>) -> bool {
        let key = key.to_lowercase();
        let has_prefix_in =
            |candidates: &[&str]| candidates.iter().any(|target| key.starts_with(*target));

        has_prefix_in(targets)
            || (self.allow_typo && typo_targets.map_or(false, |typos| has_prefix_in(typos)))
    }
}
/// Line-oriented robots.txt parser that reports what it finds to a
/// [`RobotsParseHandler`].
pub struct RobotsTxtParser<'a, Handler>
where
    Handler: RobotsParseHandler,
{
    /// Receiver of the parse callbacks.
    handler: &'a mut Handler,
    /// The robots.txt text to parse.
    robots_body: &'a str,
}
impl<'a, Handler: RobotsParseHandler> RobotsTxtParser<'a, Handler> {
    /// Creates a parser that reads `robots_body` and reports to `handler`.
    pub fn new(robots_body: &'a str, handler: &'a mut Handler) -> Self {
        RobotsTxtParser {
            robots_body,
            handler,
        }
    }

    /// Parses the whole body, invoking the handler once per recognized line.
    ///
    /// `handle_robots_start` is called first and `handle_robots_end` last.
    /// Lines end at `\n`, `\r` or `\r\n`; line numbers start at 1 and blank
    /// lines are counted. A leading byte-order mark is skipped, and a line
    /// longer than the length limit is truncated before it is interpreted.
    pub fn parse(&mut self) {
        // The UTF-8 BOM bytes as they appear when decoded one byte per char.
        const LATIN1_BOM: [char; 3] = ['\u{EF}', '\u{BB}', '\u{BF}'];
        // Upper bound, in bytes, on the part of a line that is interpreted.
        const MAX_LINE_LEN: usize = 2083 * 8;

        let body = self.robots_body;
        self.handler.handle_robots_start();

        // A real BOM in a `&str` is the single character U+FEFF. Otherwise
        // skip the longest leading run that matches the byte-wise form; a
        // partial match is skipped too.
        let bom_len = match body.strip_prefix('\u{FEFF}') {
            Some(rest) => body.len() - rest.len(),
            None => body
                .chars()
                .zip(LATIN1_BOM.iter())
                .take_while(|pair| pair.0 == *pair.1)
                .map(|pair| pair.0.len_utf8())
                .sum::<usize>(),
        };

        let mut line_num = 0;
        let mut last_was_carriage_return = false;
        // Byte range of the current line. `end` stops advancing once the
        // length limit is reached, so it is always a character boundary.
        let mut start = bom_len;
        let mut end = bom_len;

        for (offset, ch) in body[bom_len..].char_indices() {
            if ch != '\n' && ch != '\r' {
                if end - start < MAX_LINE_LEN - 1 {
                    end += ch.len_utf8();
                }
                continue;
            }

            // The `\n` of a `\r\n` pair does not start another line.
            let is_crlf_continuation = end == start && last_was_carriage_return && ch == '\n';
            if !is_crlf_continuation {
                line_num += 1;
                self.parse_and_emit_line(line_num, &body[start..end]);
            }

            // Resume right after the terminator. Using its real byte position
            // keeps the offsets exact even when multi-byte characters were
            // dropped from an over-long line.
            start = bom_len + offset + ch.len_utf8();
            end = start;
            last_was_carriage_return = ch == '\r';
        }

        // Whatever follows the last terminator is the final line.
        line_num += 1;
        self.parse_and_emit_line(line_num, &body[start..end]);
        self.handler.handle_robots_end();
    }

    /// Splits one line into `(key, value, found)`.
    ///
    /// Everything from the first `#` onwards is discarded and the remainder is
    /// trimmed. The key and value are separated by the first `:`. Without a
    /// colon, a run of spaces or tabs is accepted as the separator, but only
    /// when the line consists of exactly two whitespace-free words. Key and
    /// value are returned trimmed. When no key/value pair is found the result
    /// is `("", "", false)`.
    pub fn parse_key_value(line: &str) -> (&str, &str, bool) {
        const NOT_FOUND: (&str, &str, bool) = ("", "", false);
        let is_blank = |c: char| c == ' ' || c == '\t';

        // Trim whether or not a comment was present, so that leading or
        // trailing whitespace cannot be mistaken for the separator.
        let line = line.find('#').map_or(line, |comment| &line[..comment]).trim();

        let sep = match line.find(':') {
            Some(colon) => colon,
            None => {
                let gap = match line.find(is_blank) {
                    Some(gap) => gap,
                    None => return NOT_FOUND,
                };
                let value = line[gap..].trim();
                if value.is_empty() || value.contains(is_blank) {
                    return NOT_FOUND;
                }
                gap
            }
        };

        let key = &line[..sep];
        if key.is_empty() {
            return NOT_FOUND;
        }
        // Every accepted separator is a single byte, so `sep + 1` is the
        // first byte after it.
        (key.trim(), line[(sep + 1)..].trim(), true)
    }

    /// Returns `true` unless the key is a user-agent or sitemap key; callers
    /// pass the value through `escape_pattern` when this is `true`.
    pub fn need_escape_value_for_key(key: &ParsedRobotsKey) -> bool {
        !matches!(
            key.get_type(),
            ParseKeyType::UserAgent | ParseKeyType::Sitemap
        )
    }

    /// Interprets one line and, if it holds a key/value pair, forwards it to
    /// the handler under the number `current_line`.
    fn parse_and_emit_line(&mut self, current_line: u32, line: &str) {
        let (string_key, value, found) = Self::parse_key_value(line);
        if !found {
            return;
        }

        let mut key = ParsedRobotsKey::default();
        key.parse(string_key);
        if Self::need_escape_value_for_key(&key) {
            self.emit(current_line, &key, &escape_pattern(value));
        } else {
            self.emit(current_line, &key, value);
        }
    }

    /// Invokes the handler callback that corresponds to the key's type.
    fn emit(&mut self, line: u32, key: &ParsedRobotsKey, value: &str) {
        match key.get_type() {
            ParseKeyType::UserAgent => self.handler.handle_user_agent(line, value),
            ParseKeyType::Sitemap => self.handler.handle_sitemap(line, value),
            ParseKeyType::Allow => self.handler.handle_allow(line, value),
            ParseKeyType::Disallow => self.handler.handle_disallow(line, value),
            ParseKeyType::Unknown => {
                self.handler
                    .handle_unknown_action(line, &key.get_unknown_text(), value)
            }
        }
    }
}
/// Uppercase hexadecimal digits, indexed by nibble value.
const HEX_DIGITS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
];

/// Canonicalizes a path pattern.
///
/// * An existing `%XY` escape (two hex digits) is kept, with its hex digits
///   uppercased.
/// * Every byte of `path` with the high bit set (that is, each byte of a
///   non-ASCII character's UTF-8 encoding) is replaced by its `%XY` escape.
/// * Everything else is copied unchanged, including a `%` that is not followed
///   by two hex digits; the bytes after such a `%` are then processed
///   normally.
pub fn escape_pattern(path: &str) -> String {
    let bytes = path.as_bytes();
    let mut dest = String::with_capacity(bytes.len());
    let mut pos = 0;

    loop {
        match &bytes[pos..] {
            // Only a complete escape consumes the two bytes after the `%`.
            &[b'%', hi, lo, ..] if hi.is_ascii_hexdigit() && lo.is_ascii_hexdigit() => {
                dest.push('%');
                dest.push(char::from(hi.to_ascii_uppercase()));
                dest.push(char::from(lo.to_ascii_uppercase()));
                pos += 3;
            }
            &[byte, ..] if byte >= 0x80 => {
                dest.push('%');
                dest.push(HEX_DIGITS[usize::from(byte >> 4)]);
                dest.push(HEX_DIGITS[usize::from(byte & 0x0f)]);
                pos += 1;
            }
            &[byte, ..] => {
                // Plain ASCII, so the byte-to-char conversion is lossless.
                dest.push(char::from(byte));
                pos += 1;
            }
            &[] => break,
        }
    }
    dest
}
#[cfg(test)]
mod tests {
    #![allow(unused_variables)]
    use crate::parser::*;
    use crate::RobotsParseHandler;

    // Handler used only to name a concrete parser type; the tests below never
    // invoke any of its callbacks.
    struct FooHandler;

    impl RobotsParseHandler for FooHandler {
        fn handle_robots_start(&mut self) {
            unimplemented!()
        }

        fn handle_robots_end(&mut self) {
            unimplemented!()
        }

        fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
            unimplemented!()
        }

        fn handle_allow(&mut self, line_num: u32, value: &str) {
            unimplemented!()
        }

        fn handle_disallow(&mut self, line_num: u32, value: &str) {
            unimplemented!()
        }

        fn handle_sitemap(&mut self, line_num: u32, value: &str) {
            unimplemented!()
        }

        fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
            unimplemented!()
        }
    }

    #[test]
    fn test_parse_key_value() {
        type Target<'a> = RobotsTxtParser<'a, FooHandler>;

        // Lines that hold nothing but a comment yield no key/value pair.
        let comment_only = ["# ", "# User-agent: Googlebot"];
        for line in comment_only.iter() {
            assert_eq!(("", "", false), Target::parse_key_value(line));
        }

        // Colon, space and tab separators all produce the same pair, with or
        // without a trailing comment.
        let accepted = [
            "User-agent: Googlebot",
            "User-agent Googlebot",
            "User-agent \t Googlebot",
            "User-agent\tGooglebot",
            "User-agent: Googlebot # 123",
            "User-agent\tGooglebot # 123",
        ];
        for line in accepted.iter() {
            assert_eq!(
                ("User-agent", "Googlebot", true),
                Target::parse_key_value(line)
            );
        }
    }

    #[test]
    fn test_escape_pattern() {
        // Pairs of (expected output, input).
        let cases = [
            ("http://www.example.com", "http://www.example.com"),
            ("/a/b/c", "/a/b/c"),
            ("%AA", "%aa"),
            ("%AA", "%aA"),
            ("/Sanjos%C3%A9Sellers", "/SanjoséSellers"),
            ("%C3%A1", "á"),
        ];
        for case in cases.iter() {
            assert_eq!(case.0, &escape_pattern(case.1));
        }
    }
}