use crate::RobotsParseHandler;
/// A recorded rule match: the priority assigned to it and the line number
/// it was recorded with.
///
/// A priority of `-1` (`NO_MATCH_PRIORITY`) with line `0` is the "no match"
/// state produced by [`Default`] and [`Match::clear`].
pub struct Match {
    priority: i32,
    line: u32,
}

impl Default for Match {
    /// Builds the "no match" state.
    fn default() -> Self {
        Self {
            priority: Self::NO_MATCH_PRIORITY,
            line: 0,
        }
    }
}

impl Match {
    /// Priority value used to represent "no match".
    const NO_MATCH_PRIORITY: i32 = -1;

    /// Creates a match with the given priority and line number.
    pub fn new(priority: i32, line: u32) -> Match {
        Self { priority, line }
    }

    /// Overwrites both the priority and the line number.
    pub fn set(&mut self, priority: i32, line: u32) {
        *self = Self::new(priority, line);
    }

    /// Resets this value to the "no match" state.
    pub fn clear(&mut self) {
        *self = Self::default();
    }

    /// Line number stored with this match.
    pub fn line(&self) -> u32 {
        self.line
    }

    /// Priority stored with this match.
    pub fn priority(&self) -> i32 {
        self.priority
    }

    /// Returns whichever of the two matches has the strictly higher priority.
    ///
    /// On a tie the second argument, `b`, is returned.
    pub fn higher_priority_match<'a>(a: &'a Match, b: &'a Match) -> &'a Match {
        if b.priority >= a.priority {
            b
        } else {
            a
        }
    }
}
/// A pair of [`Match`] slots: one for rules recorded while only the wildcard
/// agent applied (`global`) and one for rules recorded while a specifically
/// named agent applied (`specific`).
#[derive(Default)]
struct MatchHierarchy {
    global: Match,
    specific: Match,
}

impl MatchHierarchy {
    /// Resets both slots to the "no match" state.
    pub fn clear(&mut self) {
        let Self { global, specific } = self;
        global.clear();
        specific.clear();
    }
}
/// Strategy for scoring how well a robots.txt rule pattern matches a path.
///
/// Implementors provide the scoring for `Allow` and `Disallow` rules; the
/// provided [`RobotsMatchStrategy::matches`] helper performs the pattern test
/// itself.
pub trait RobotsMatchStrategy: Default {
    /// Scores `pattern` from an `Allow` rule against `path`.
    ///
    /// The matcher in this file treats a negative return value as "no match".
    fn match_allow(&self, path: &str, pattern: &str) -> i32;

    /// Scores `pattern` from a `Disallow` rule against `path`.
    ///
    /// The matcher in this file treats a negative return value as "no match".
    fn match_disallow(&self, path: &str, pattern: &str) -> i32;

    /// Returns `true` if `pattern` matches a prefix of `path`.
    ///
    /// Two characters are special in `pattern`:
    /// * `*` matches any (possibly empty) sequence of bytes;
    /// * `$` as the very last byte anchors the match to the end of `path`.
    ///   A `$` anywhere else is matched literally.
    ///
    /// Matching is byte-wise, so non-ASCII input is compared by its UTF-8
    /// bytes and the `$` end check is consistent with `path.len()`.
    fn matches(path: &str, pattern: &str) -> bool {
        let path = path.as_bytes();
        let pattern = pattern.as_bytes();
        let pathlen = path.len();

        // `pos[..numpos]` is a sorted list of offsets into `path`: every
        // prefix length of `path` that the pattern consumed so far can match.
        // There are at most `pathlen + 1` distinct offsets, so the buffer is
        // sized once and updated in place.
        let mut pos = vec![0usize; pathlen + 1];
        let mut numpos: usize = 1;

        for (index, &pat) in pattern.iter().enumerate() {
            if pat == b'$' && index + 1 == pattern.len() {
                // The list is sorted, so the last entry is the longest prefix.
                return pos[numpos - 1] == pathlen;
            }

            if pat == b'*' {
                // A wildcard can extend the shortest candidate to any longer
                // offset, so the candidates become `pos[0]..=pathlen`.
                let start = pos[0];
                numpos = pathlen - start + 1;
                for (slot, offset) in pos.iter_mut().zip(start..=pathlen) {
                    *slot = offset;
                }
            } else {
                // Literal byte (including a `$` that is not at the end): keep
                // only the candidates whose next byte equals it, compacting
                // the survivors to the front. `kept <= i` always holds, so a
                // write never clobbers a candidate that is still unread.
                let mut kept = 0;
                for i in 0..numpos {
                    let at = pos[i];
                    if at < pathlen && path[at] == pat {
                        pos[kept] = at + 1;
                        kept += 1;
                    }
                }
                numpos = kept;
                if numpos == 0 {
                    return false;
                }
            }
        }
        true
    }
}
#[derive(Default)]
pub struct LongestMatchRobotsMatchStrategy;
impl RobotsMatchStrategy for LongestMatchRobotsMatchStrategy {
fn match_allow(&self, path: &str, pattern: &str) -> i32 {
if Self::matches(path, pattern) {
pattern.len() as i32
} else {
-1
}
}
fn match_disallow(&self, path: &str, pattern: &str) -> i32 {
if Self::matches(path, pattern) {
pattern.len() as i32
} else {
-1
}
}
}
/// Decides whether a path is allowed for a set of user agents by consuming
/// robots.txt directives through its `RobotsParseHandler` implementation and
/// tracking the best `Allow` / `Disallow` matches seen.
#[derive(Default)]
pub struct RobotsMatcher<S: RobotsMatchStrategy> {
/// Best `Allow` matches recorded so far (global and agent-specific).
allow: MatchHierarchy,
/// Best `Disallow` matches recorded so far (global and agent-specific).
disallow: MatchHierarchy,
/// True while the current group contains a `*` user-agent line.
seen_global_agent: bool,
/// True while the current group names one of `user_agents`.
seen_specific_agent: bool,
/// Set once any group has named one of `user_agents`; only reset when a new
/// parse starts.
ever_seen_specific_agent: bool,
/// Set when a non-user-agent directive is processed; the next user-agent
/// line then resets the per-group agent flags.
seen_separator: bool,
/// Path that rules are matched against.
path: String,
/// Agent names compared (ASCII case-insensitively) with user-agent lines.
user_agents: Vec<String>,
/// Strategy used to score rule patterns against `path`.
match_strategy: S,
}
/// One recorded parser callback with owned copies of its arguments, so it
/// can be replayed against a matcher later.
enum ParseInvoke {
/// Recorded `handle_user_agent` call.
UserAgent {
line_num: u32,
user_agent: String,
},
/// Recorded `handle_allow` call.
Allow {
line_num: u32,
value: String,
},
/// Recorded `handle_disallow` call.
Disallow {
line_num: u32,
value: String,
},
/// Recorded `handle_sitemap` call.
Sitemap {
line_num: u32,
value: String,
},
/// Recorded `handle_unknown_action` call.
UnknownAction {
line_num: u32,
action: String,
value: String,
},
}
/// Parse handler that records every callback it receives and can replay the
/// recorded sequence against the wrapped matcher.
struct CachingRobotsParseHandler<S: RobotsMatchStrategy> {
/// Recorded callbacks, in the order they were received.
invokes: Vec<ParseInvoke>,
/// Matcher that recorded callbacks are replayed against.
matcher: RobotsMatcher<S>,
}
impl<S: RobotsMatchStrategy> CachingRobotsParseHandler<S> {
    /// Wraps `matcher` with an empty list of recorded callbacks.
    pub fn new(matcher: RobotsMatcher<S>) -> Self {
        let invokes = Vec::new();
        Self { invokes, matcher }
    }

    /// Feeds every recorded callback to the matcher, in order, bracketed by
    /// `handle_robots_start` and `handle_robots_end`.
    fn replay(&mut self) {
        let Self { invokes, matcher } = self;

        matcher.handle_robots_start();
        for recorded in invokes.iter() {
            match recorded {
                ParseInvoke::UserAgent {
                    line_num,
                    user_agent,
                } => matcher.handle_user_agent(*line_num, user_agent),
                ParseInvoke::Allow { line_num, value } => matcher.handle_allow(*line_num, value),
                ParseInvoke::Disallow { line_num, value } => {
                    matcher.handle_disallow(*line_num, value)
                }
                ParseInvoke::Sitemap { line_num, value } => {
                    matcher.handle_sitemap(*line_num, value)
                }
                ParseInvoke::UnknownAction {
                    line_num,
                    action,
                    value,
                } => matcher.handle_unknown_action(*line_num, action, value),
            }
        }
        matcher.handle_robots_end();
    }

    /// Returns `true` if `url` is allowed for `user_agents` according to the
    /// recorded directives.
    ///
    /// The matcher is pointed at the new agents and path, then the recorded
    /// callbacks are replayed and the matcher's verdict is negated.
    pub fn allowed_by_robots(&mut self, user_agents: Vec<&str>, url: &str) -> bool {
        let path = super::get_path_params_query(url);
        self.matcher.init_user_agents_and_path(user_agents, &path);
        self.replay();

        let disallowed = self.matcher.disallow();
        !disallowed
    }
}
/// Records each parser callback as a [`ParseInvoke`] instead of acting on it,
/// so the sequence can be replayed later for different agents and URLs.
impl<S: RobotsMatchStrategy> RobotsParseHandler for CachingRobotsParseHandler<S> {
    /// Drops callbacks recorded by any previous parse so that a new parse
    /// replaces the cached directives instead of appending to stale ones.
    fn handle_robots_start(&mut self) {
        self.invokes.clear();
    }

    /// Nothing to do at the end of a parse; the recording is already complete.
    fn handle_robots_end(&mut self) {}

    /// Records a user-agent line.
    fn handle_user_agent(&mut self, line_num: u32, user_agent: &str) {
        self.invokes.push(ParseInvoke::UserAgent {
            line_num,
            user_agent: user_agent.to_owned(),
        });
    }

    /// Records an `Allow` rule.
    fn handle_allow(&mut self, line_num: u32, value: &str) {
        self.invokes.push(ParseInvoke::Allow {
            line_num,
            value: value.to_owned(),
        });
    }

    /// Records a `Disallow` rule.
    fn handle_disallow(&mut self, line_num: u32, value: &str) {
        self.invokes.push(ParseInvoke::Disallow {
            line_num,
            value: value.to_owned(),
        });
    }

    /// Records a sitemap line.
    fn handle_sitemap(&mut self, line_num: u32, value: &str) {
        self.invokes.push(ParseInvoke::Sitemap {
            line_num,
            value: value.to_owned(),
        });
    }

    /// Records a directive the parser did not recognise.
    fn handle_unknown_action(&mut self, line_num: u32, action: &str, value: &str) {
        self.invokes.push(ParseInvoke::UnknownAction {
            line_num,
            action: action.to_owned(),
            value: value.to_owned(),
        });
    }
}
/// Matcher that parses a robots.txt body once via [`CachingRobotsMatcher::parse`]
/// and then answers allow/deny queries by replaying the recorded directives.
pub struct CachingRobotsMatcher<S: RobotsMatchStrategy> {
    parse_handler: CachingRobotsParseHandler<S>,
}

impl<S: RobotsMatchStrategy> CachingRobotsMatcher<S> {
    /// Wraps `matcher` in a recording parse handler.
    pub fn new(matcher: RobotsMatcher<S>) -> Self {
        let parse_handler = CachingRobotsParseHandler::new(matcher);
        Self { parse_handler }
    }

    /// Runs `super::parse_robotstxt` over `robots_body`, recording its
    /// callbacks in the internal handler.
    pub fn parse(&mut self, robots_body: &str) {
        super::parse_robotstxt(robots_body, &mut self.parse_handler);
    }

    /// Returns `true` if `url` is allowed for the given `user_agents`.
    pub fn allowed_by_robots(&mut self, user_agents: Vec<&str>, url: &str) -> bool {
        let handler = &mut self.parse_handler;
        handler.allowed_by_robots(user_agents, url)
    }

    /// Single-agent convenience wrapper around
    /// [`CachingRobotsMatcher::allowed_by_robots`].
    pub fn one_agent_allowed_by_robots(&mut self, user_agent: &str, url: &str) -> bool {
        self.allowed_by_robots(vec![user_agent], url)
    }
}
impl<S: RobotsMatchStrategy> RobotsMatcher<S> {
    /// Stores owned copies of the path and agent names for the next match.
    fn init_user_agents_and_path(&mut self, user_agents: Vec<&str>, path: &str) {
        self.path = path.to_owned();
        self.user_agents = user_agents.into_iter().map(str::to_owned).collect();
    }

    /// Parses `robots_body` and returns `true` if `url` is allowed for the
    /// given `user_agents`.
    ///
    /// The path is derived from `url` with `super::get_path_params_query`.
    pub fn allowed_by_robots(
        &mut self,
        robots_body: &str,
        user_agents: Vec<&str>,
        url: &str,
    ) -> bool
    where
        Self: RobotsParseHandler,
    {
        let path = super::get_path_params_query(url);
        self.init_user_agents_and_path(user_agents, &path);
        super::parse_robotstxt(robots_body, self);

        let disallowed = self.disallow();
        !disallowed
    }

    /// Single-agent convenience wrapper around
    /// [`RobotsMatcher::allowed_by_robots`].
    pub fn one_agent_allowed_by_robots(
        &mut self,
        robots_txt: &str,
        user_agent: &str,
        url: &str,
    ) -> bool
    where
        Self: RobotsParseHandler,
    {
        let agents = vec![user_agent];
        self.allowed_by_robots(robots_txt, agents, url)
    }

    /// Returns `true` if the recorded matches say the path is disallowed.
    ///
    /// Agent-specific matches win when either has a positive priority. If a
    /// specific agent group was seen but produced no positive match, the path
    /// is allowed. Otherwise the global matches decide.
    fn disallow(&self) -> bool {
        let specific_allow = self.allow.specific.priority();
        let specific_disallow = self.disallow.specific.priority();
        if specific_allow > 0 || specific_disallow > 0 {
            return specific_disallow > specific_allow;
        }

        if self.ever_seen_specific_agent {
            return false;
        }

        let global_allow = self.allow.global.priority();
        let global_disallow = self.disallow.global.priority();
        (global_disallow > 0 || global_allow > 0) && global_disallow > global_allow
    }

    /// True if the current group applies via the wildcard or a named agent.
    fn seen_any_agent(&self) -> bool {
        self.seen_specific_agent || self.seen_global_agent
    }

    /// Returns the leading run of `user_agent` made up of ASCII letters,
    /// `-` and `_`.
    fn extract_user_agent(user_agent: &str) -> &str {
        let is_name_char = |c: char| c.is_ascii_alphabetic() || c == '-' || c == '_';
        match user_agent.find(|c: char| !is_name_char(c)) {
            Some(end) => &user_agent[..end],
            None => user_agent,
        }
    }

    /// True if `user_agent` is non-empty and consists only of characters
    /// accepted by `extract_user_agent`.
    pub fn is_valid_user_agent_to_obey(user_agent: &str) -> bool {
        if user_agent.is_empty() {
            return false;
        }
        Self::extract_user_agent(user_agent) == user_agent
    }

    /// Line number of the higher-priority match among the `Disallow` and
    /// `Allow` matches, using the agent-specific pair if a specific agent
    /// group was ever seen and the global pair otherwise.
    pub fn matching_line(&self) -> u32 {
        let (disallow, allow) = if self.ever_seen_specific_agent {
            (&self.disallow.specific, &self.allow.specific)
        } else {
            (&self.disallow.global, &self.allow.global)
        };
        Match::higher_priority_match(disallow, allow).line()
    }
}
/// Consumes parser callbacks and tracks the best `Allow` / `Disallow` matches
/// for the configured path and user agents.
impl<S: RobotsMatchStrategy> RobotsParseHandler for RobotsMatcher<S> {
    /// Resets all match and group state before a parse (or replay) begins.
    fn handle_robots_start(&mut self) {
        self.allow.clear();
        self.disallow.clear();
        self.seen_global_agent = false;
        self.seen_specific_agent = false;
        self.ever_seen_specific_agent = false;
        self.seen_separator = false;
    }

    /// Nothing to finalise; the verdict is computed on demand.
    fn handle_robots_end(&mut self) {}

    /// Processes a user-agent line.
    ///
    /// If a non-user-agent directive was seen since the last agent line, the
    /// per-group agent flags are reset first. A value of `*`, optionally
    /// followed by whitespace, marks the group as global; otherwise the
    /// extracted agent name is compared ASCII case-insensitively with the
    /// configured agents.
    fn handle_user_agent(&mut self, _line_num: u32, user_agent: &str) {
        if self.seen_separator {
            self.seen_specific_agent = false;
            self.seen_global_agent = false;
            self.seen_separator = false;
        }

        // '*' is a single byte, so slicing at 1 is always on a char boundary.
        let is_global = user_agent.starts_with('*')
            && (user_agent.len() == 1 || user_agent[1..].starts_with(char::is_whitespace));

        if is_global {
            self.seen_global_agent = true;
        } else {
            let name = Self::extract_user_agent(user_agent);
            let names_us = self
                .user_agents
                .iter()
                .any(|agent| name.eq_ignore_ascii_case(agent));
            if names_us {
                self.ever_seen_specific_agent = true;
                self.seen_specific_agent = true;
            }
        }
    }

    /// Processes an `Allow` rule for the current group.
    ///
    /// Ignored unless the group applies to us. The rule is scored with the
    /// strategy's `match_allow`; a non-negative score replaces the stored
    /// match when it is higher. If the rule does not match and its last path
    /// segment starts with `index.htm`, the rule is retried as
    /// `<directory>/$`.
    fn handle_allow(&mut self, line_num: u32, value: &str) {
        if !self.seen_any_agent() {
            return;
        }
        self.seen_separator = true;

        // `Allow` rules must be scored with the allow side of the strategy.
        let priority = self.match_strategy.match_allow(&self.path, value);
        if priority >= 0 {
            let best = if self.seen_specific_agent {
                &mut self.allow.specific
            } else {
                &mut self.allow.global
            };
            if best.priority() < priority {
                best.set(priority, line_num);
            }
        } else if let Some(slash_pos) = value.rfind('/') {
            if value[slash_pos..].starts_with("/index.htm") {
                // Treat ".../index.htm*" as the directory itself, anchored.
                let new_pattern = format!("{}{}", &value[..=slash_pos], "$");
                self.handle_allow(line_num, &new_pattern);
            }
        }
    }

    /// Processes a `Disallow` rule for the current group.
    ///
    /// Ignored unless the group applies to us. The rule is scored with the
    /// strategy's `match_disallow`; a non-negative score replaces the stored
    /// match when it is higher.
    fn handle_disallow(&mut self, line_num: u32, value: &str) {
        if !self.seen_any_agent() {
            return;
        }
        self.seen_separator = true;

        let priority = self.match_strategy.match_disallow(&self.path, value);
        if priority >= 0 {
            let best = if self.seen_specific_agent {
                &mut self.disallow.specific
            } else {
                &mut self.disallow.global
            };
            if best.priority() < priority {
                best.set(priority, line_num);
            }
        }
    }

    /// A sitemap line only marks that the group's agent lines have ended.
    fn handle_sitemap(&mut self, _line_num: u32, _value: &str) {
        self.seen_separator = true;
    }

    /// An unknown directive only marks that the group's agent lines have ended.
    fn handle_unknown_action(&mut self, _line_num: u32, _action: &str, _value: &str) {
        self.seen_separator = true;
    }
}
#[cfg(test)]
mod test {
    use crate::matcher::*;

    /// `extract_user_agent` keeps only the leading run of ASCII letters,
    /// `-` and `_`.
    #[test]
    fn test_extract_user_agent() {
        type Target = RobotsMatcher<LongestMatchRobotsMatchStrategy>;

        // (input, expected product token)
        let cases = [
            ("Googlebot/2.1", "Googlebot"),
            ("Googlebot", "Googlebot"),
            ("Googlebot-", "Googlebot-"),
            ("Googlebot_", "Googlebot_"),
            ("Googlebot_2.1", "Googlebot_"),
            ("1Googlebot_2.1", ""),
            ("Goo1glebot_2.1", "Goo"),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(*expected, Target::extract_user_agent(input));
        }
    }
}