#![allow(unused_variables, dead_code)]
use std::borrow::Cow;
use crate::RobotsParseHandler;
/// The best rule match recorded so far: the priority a match strategy gave
/// it and the robots.txt line number it came from.
struct Match {
    priority: i32,
    line: u32,
}

impl Default for Match {
    /// A fresh `Match` represents "nothing matched": priority
    /// `NO_MATCH_PRIORITY` on line 0.
    fn default() -> Self {
        Self {
            priority: Self::NO_MATCH_PRIORITY,
            line: 0,
        }
    }
}

impl Match {
    /// Priority stored while no rule has matched.
    const NO_MATCH_PRIORITY: i32 = -1;

    /// Builds a match with the given priority and line number.
    pub fn new(priority: i32, line: u32) -> Match {
        Self { priority, line }
    }

    /// Overwrites both the priority and the line number.
    pub fn set(&mut self, priority: i32, line: u32) {
        *self = Self::new(priority, line);
    }

    /// Returns this match to the "nothing matched" state.
    pub fn clear(&mut self) {
        *self = Self::default();
    }

    /// Line number recorded for this match.
    pub fn line(&self) -> u32 {
        self.line
    }

    /// Priority recorded for this match.
    pub fn priority(&self) -> i32 {
        self.priority
    }

    /// Returns whichever match has the strictly higher priority; when the
    /// priorities are equal, `b` is returned.
    pub fn higher_priority_match<'a>(a: &'a Match, b: &'a Match) -> &'a Match {
        if b.priority() >= a.priority() {
            b
        } else {
            a
        }
    }
}
/// A pair of best matches: `global` is updated by rules seen while no
/// specific agent group is active, `specific` by rules seen inside a group
/// that named one of the caller's user agents.
#[derive(Default)]
struct MatchHierarchy {
    global: Match,
    specific: Match,
}

impl MatchHierarchy {
    /// Clears both the global and the specific match.
    pub fn clear(&mut self) {
        let Self { global, specific } = self;
        global.clear();
        specific.clear();
    }
}
/// Strategy that scores how well a robots.txt rule pattern matches a path.
///
/// The matcher in this file treats a negative score as "no match" and
/// otherwise keeps the rule with the highest score.
pub trait RobotsMatchStrategy {
    /// Scores `pattern` from an `Allow` rule against `path`.
    fn match_allow(&self, path: &str, pattern: &str) -> i32;

    /// Scores `pattern` from a `Disallow` rule against `path`.
    fn match_disallow(&self, path: &str, pattern: &str) -> i32;

    /// Returns `true` when `pattern` matches a prefix of `path`.
    ///
    /// Two bytes are special in `pattern`:
    /// * `*` matches any run of bytes, including none;
    /// * `$` as the very last byte anchors the match at the end of `path`.
    ///   Anywhere else `$` is compared literally.
    ///
    /// Comparison is done on raw bytes, so offsets and lengths are always in
    /// the same unit even when the inputs contain multi-byte UTF-8.
    fn matches(path: &str, pattern: &str) -> bool {
        let path = path.as_bytes();
        let pattern = pattern.as_bytes();
        let pathlen = path.len();

        // `pos[..numpos]` is an ascending list of offsets into `path`: every
        // prefix length of `path` that the pattern consumed so far can match.
        // Entries are overwritten in place; the write index never overtakes
        // the read index, so no candidate is lost.
        let mut pos = vec![0usize; pathlen + 1];
        let mut numpos: usize = 1;

        for (index, &pat) in pattern.iter().enumerate() {
            if pat == b'$' && index + 1 == pattern.len() {
                // The list is ascending, so the last entry is the longest
                // prefix; it must cover the whole path.
                return pos[numpos - 1] == pathlen;
            }

            if pat == b'*' {
                // Any offset from the smallest candidate up to the end of
                // the path becomes reachable.
                numpos = pathlen - pos[0] + 1;
                for i in 1..numpos {
                    pos[i] = pos[i - 1] + 1;
                }
            } else {
                // Literal byte (this includes `$` when it is not last): keep
                // only the candidates whose next byte equals it.
                let mut new_numpos = 0;
                for i in 0..numpos {
                    let candidate = pos[i];
                    if candidate < pathlen && path[candidate] == pat {
                        pos[new_numpos] = candidate + 1;
                        new_numpos += 1;
                    }
                }
                numpos = new_numpos;
                if numpos == 0 {
                    return false;
                }
            }
        }
        true
    }
}
/// Match strategy whose score is the byte length of the pattern when the
/// pattern matches the path, and `-1` when it does not. `Allow` and
/// `Disallow` patterns are scored the same way.
#[derive(Default)]
pub struct LongestMatchRobotsMatchStrategy;

impl RobotsMatchStrategy for LongestMatchRobotsMatchStrategy {
    /// Pattern length on a match, `-1` otherwise.
    fn match_allow(&self, path: &str, pattern: &str) -> i32 {
        if !Self::matches(path, pattern) {
            return -1;
        }
        pattern.len() as i32
    }

    /// Pattern length on a match, `-1` otherwise.
    fn match_disallow(&self, path: &str, pattern: &str) -> i32 {
        if !Self::matches(path, pattern) {
            return -1;
        }
        pattern.len() as i32
    }
}
/// Parse handler that decides whether a single path may be fetched by a set
/// of user agents, given a robots.txt body.
///
/// The parser callbacks record the best `Allow` and `Disallow` matches for
/// the stored path; `disallow` then turns those into a verdict.
#[derive(Default)]
pub struct RobotsMatcher<'a, S: RobotsMatchStrategy> {
    /// Best `Allow` matches recorded so far.
    allow: MatchHierarchy,
    /// Best `Disallow` matches recorded so far.
    disallow: MatchHierarchy,
    /// The current group contains a `*` user-agent line.
    seen_global_agent: bool,
    /// The current group names one of `user_agents`.
    seen_specific_agent: bool,
    /// Some group in this document named one of `user_agents`.
    ever_seen_specific_agent: bool,
    /// A non-user-agent line was seen since the current group started.
    seen_separator: bool,
    /// Path that rules are matched against.
    path: Cow<'a, str>,
    /// User agents the verdict is computed for.
    user_agents: Vec<&'a str>,
    /// Scores rule patterns against `path`.
    match_strategy: S,
}

impl<'a, S: RobotsMatchStrategy> RobotsMatcher<'a, S> {
    /// Stores the user agents and the path used by the parse callbacks.
    fn init_user_agents_and_path(&mut self, user_agents: Vec<&'a str>, path: Cow<'a, str>) {
        self.user_agents = user_agents;
        self.path = path;
    }

    /// Returns `true` when `url` is allowed for `user_agents` according to
    /// `robots_body`.
    ///
    /// The path is taken from `url` with `super::get_path_params_query`, the
    /// body is parsed with `super::parse_robotstxt` using `self` as the
    /// handler, and the result is the negation of `disallow`.
    pub fn allowed_by_robots(
        &mut self,
        robots_body: &'a str,
        user_agents: Vec<&'a str>,
        url: &'a str,
    ) -> bool
    where
        Self: RobotsParseHandler,
    {
        let target_path = super::get_path_params_query(url);
        self.init_user_agents_and_path(user_agents, target_path);
        super::parse_robotstxt(robots_body, self);
        let blocked = self.disallow();
        !blocked
    }

    /// Single-agent convenience wrapper around `allowed_by_robots`.
    pub fn one_agent_allowed_by_robots(
        &mut self,
        robots_txt: &'a str,
        user_agent: &'a str,
        url: &'a str,
    ) -> bool
    where
        Self: RobotsParseHandler,
    {
        let agents = vec![user_agent];
        self.allowed_by_robots(robots_txt, agents, url)
    }

    /// Verdict after parsing: `true` when the path is disallowed.
    ///
    /// A specific-agent rule with positive priority decides first. Failing
    /// that, having seen any specific-agent group means "allowed". Only then
    /// are the global rules consulted.
    fn disallow(&self) -> bool {
        let allow_specific = self.allow.specific.priority();
        let disallow_specific = self.disallow.specific.priority();
        if allow_specific > 0 || disallow_specific > 0 {
            return disallow_specific > allow_specific;
        }

        if self.ever_seen_specific_agent {
            return false;
        }

        let allow_global = self.allow.global.priority();
        let disallow_global = self.disallow.global.priority();
        (disallow_global > 0 || allow_global > 0) && disallow_global > allow_global
    }

    /// `true` while the current group has a global or a matching specific
    /// user-agent line.
    fn seen_any_agent(&self) -> bool {
        self.seen_specific_agent || self.seen_global_agent
    }

    /// Returns the leading run of ASCII letters, `-` and `_` in `user_agent`.
    fn extract_user_agent(user_agent: &str) -> &str {
        let is_agent_char = |c: char| c.is_ascii_alphabetic() || c == '-' || c == '_';
        let end = user_agent
            .find(|c: char| !is_agent_char(c))
            .unwrap_or(user_agent.len());
        &user_agent[..end]
    }

    /// `true` when `user_agent` is non-empty and consists only of the
    /// characters `extract_user_agent` keeps.
    pub fn is_valid_user_agent_to_obey(user_agent: &str) -> bool {
        if user_agent.is_empty() {
            return false;
        }
        Self::extract_user_agent(user_agent) == user_agent
    }

    /// Like `disallow`, but only specific-agent rules are considered.
    fn disallow_ignore_global(&self) -> bool {
        let allow_specific = self.allow.specific.priority();
        let disallow_specific = self.disallow.specific.priority();
        (allow_specific > 0 || disallow_specific > 0) && disallow_specific > allow_specific
    }

    /// Line number of the higher-priority match, taken from the specific
    /// matches when a specific-agent group was seen and from the global
    /// matches otherwise.
    fn matching_line(&self) -> u32 {
        let (disallow, allow) = if self.ever_seen_specific_agent {
            (&self.disallow.specific, &self.allow.specific)
        } else {
            (&self.disallow.global, &self.allow.global)
        };
        Match::higher_priority_match(disallow, allow).line()
    }
}
/// Parser callbacks that accumulate the best `Allow` / `Disallow` matches for
/// the matcher's path while a robots.txt body is being parsed.
impl<S: RobotsMatchStrategy> RobotsParseHandler for RobotsMatcher<'_, S> {
    /// Resets all per-document state before a new body is parsed.
    fn handle_robots_start(&mut self) {
        self.allow.clear();
        self.disallow.clear();
        self.seen_global_agent = false;
        self.seen_specific_agent = false;
        self.ever_seen_specific_agent = false;
        self.seen_separator = false;
    }

    /// Nothing to finalize.
    fn handle_robots_end(&mut self) {}

    /// Records whether the current group applies globally (`*`) or to one of
    /// the matcher's user agents.
    fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
        // A rule line was seen since the last user-agent line, so this line
        // opens a new group.
        if self.seen_separator {
            self.seen_specific_agent = false;
            self.seen_global_agent = false;
            self.seen_separator = false;
        }

        // A lone `*`, or `*` followed by whitespace and anything else, counts
        // as the global agent. `*` is one byte, so slicing at 1 is safe.
        let is_global = user_agent.starts_with('*')
            && (user_agent.len() == 1 || user_agent[1..].starts_with(char::is_whitespace));
        if is_global {
            self.seen_global_agent = true;
            return;
        }

        let user_agent = Self::extract_user_agent(user_agent);
        let names_one_of_ours = self
            .user_agents
            .iter()
            .any(|agent| user_agent.eq_ignore_ascii_case(agent));
        if names_one_of_ours {
            self.ever_seen_specific_agent = true;
            self.seen_specific_agent = true;
        }
    }

    /// Scores an `Allow` pattern and keeps it when it beats the best one
    /// recorded so far for the active group kind.
    ///
    /// When the pattern does not match and its last path segment starts with
    /// `index.htm`, the pattern is retried as the containing directory
    /// anchored with `$`.
    fn handle_allow(&mut self, line_num: u32, value: &str) {
        if !self.seen_any_agent() {
            return;
        }
        self.seen_separator = true;

        // Allow rules must be scored with the strategy's allow scorer.
        let priority = self.match_strategy.match_allow(&self.path, value);
        if priority >= 0 {
            let best = if self.seen_specific_agent {
                &mut self.allow.specific
            } else {
                &mut self.allow.global
            };
            if best.priority() < priority {
                best.set(priority, line_num);
            }
        } else if let Some(slash_pos) = value.rfind('/') {
            if value[slash_pos..].starts_with("/index.htm") {
                // The retried pattern ends in "/$", so it cannot re-enter
                // this branch.
                let new_pattern = format!("{}$", &value[..=slash_pos]);
                self.handle_allow(line_num, &new_pattern);
            }
        }
    }

    /// Scores a `Disallow` pattern and keeps it when it beats the best one
    /// recorded so far for the active group kind.
    fn handle_disallow(&mut self, line_num: u32, value: &str) {
        if !self.seen_any_agent() {
            return;
        }
        self.seen_separator = true;

        let priority = self.match_strategy.match_disallow(&self.path, value);
        if priority >= 0 {
            let best = if self.seen_specific_agent {
                &mut self.disallow.specific
            } else {
                &mut self.disallow.global
            };
            if best.priority() < priority {
                best.set(priority, line_num);
            }
        }
    }

    /// A sitemap line only marks the end of the user-agent lines of a group.
    fn handle_sitemap(&mut self, line_num: u32, value: &str) {
        self.seen_separator = true;
    }

    /// An unknown line only marks the end of the user-agent lines of a group.
    fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
        self.seen_separator = true;
    }
}
#[cfg(test)]
mod test {
    use crate::matcher::*;

    /// `extract_user_agent` keeps the leading run of ASCII letters, `-` and
    /// `_`, and stops at the first other character.
    #[test]
    fn test_extract_user_agent() {
        type Target<'a> = RobotsMatcher<'a, LongestMatchRobotsMatchStrategy>;

        // (input, expected product token)
        let cases = [
            ("Googlebot/2.1", "Googlebot"),
            ("Googlebot", "Googlebot"),
            ("Googlebot-", "Googlebot-"),
            ("Googlebot_", "Googlebot_"),
            ("Googlebot_2.1", "Googlebot_"),
            ("1Googlebot_2.1", ""),
            ("Goo1glebot_2.1", "Goo"),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(*expected, Target::extract_user_agent(input));
        }
    }
}