// roboto/lib.rs

//! Parsing and applying robots.txt files.
//!
//! # Examples
//! ```
//! use roboto::Robots;
//!
//! let robots = r#"
//! User-agent: *
//! Disallow: /
//! "#.parse::<Robots>().unwrap();
//!
//! assert!(!robots.is_allowed(&"googlebot".parse().unwrap(), "/"));
//! assert!(robots.is_allowed(&"googlebot".parse().unwrap(), "/robots.txt"));
//! assert!(!robots.is_allowed(&"googlebot".parse().unwrap(), "/foo/bar"));
//! ```
//!
//! # References
//! - [The Web Robots Pages](https://www.robotstxt.org/)
//! - [RFC1945](https://datatracker.ietf.org/doc/html/rfc1945#section-3.7)
//! - [Wikipedia](https://en.wikipedia.org/wiki/Robots_exclusion_standard)
21#![deny(missing_docs)]
22#![deny(missing_debug_implementations)]
23#![deny(unsafe_code)]
24
25use std::{fmt, hash::Hash, ops::Deref, str::FromStr};
26
27use camino::Utf8Path;
28
29pub mod error;
30use crate::error::DirectiveParseError;
31use crate::error::DirectivePathParseError;
32use crate::error::RobotParseError;
33use crate::error::UserAgentParseError;
34
// Embed the README as hidden rustdoc so its code examples are compiled and
// run as doctests by `cargo test`, keeping the README in sync with the API.
#[doc(hidden)]
mod readme {
    #![doc = include_str!("../README.md")]
}
39
// Character classes used by the validators below.
//
// The `tspecials` separator characters from RFC 1945.
const TSPECIALS: &str = "()<>@,;:\\\"/[]?={} \t";
// Characters treated as "safe" in URL paths.
const PSAFE: &str = "$-_.+~";
// Characters treated as "extra" (also allowed) in URL paths.
const PEXTRA: &str = "!*'(),";
43
/// Accepts the characters permitted in a User-Agent value.
///
/// Deliberately looser than a strict RFC 1945 `token` (which would exclude
/// the `tspecials` separators): real-world agent strings such as
/// `Mozilla/5.0 (compatible; ...)` contain spaces, slashes and parentheses,
/// and the crate's own tests require them to parse. Any printable ASCII
/// character — including space — is accepted.
///
/// The former `|| TSPECIALS.contains(c)` clause was dead code for every
/// character except `'\t'` (operator precedence meant the first clause
/// already accepted all printable ASCII); it has been removed so that the
/// tab control character is rejected like every other control character.
fn is_rfc1945_token(c: char) -> bool {
    c.is_ascii() && !c.is_ascii_control()
}
47fn is_rfc1945_path(c: char) -> bool {
48    c == '/' || c == '%' || c.is_ascii_alphanumeric() || PSAFE.contains(c) || PEXTRA.contains(c)
49}
50
/// A User-Agent string.
///
/// This type represents a User-Agent string as defined in the [RFC1945](https://datatracker.ietf.org/doc/html/rfc1945#section-3.7).
///
/// The wildcard agent `*` is represented internally as `None`; any other
/// agent is stored as its (ASCII) name.
///
/// # Examples
///
/// ```
/// use roboto::UserAgent;
///
/// let agent = "googlebot".parse::<UserAgent>().unwrap();
/// assert_eq!(agent.to_string(), "googlebot");
///
/// let agent = "*".parse::<UserAgent>().unwrap();
/// assert_eq!(agent, UserAgent::ANY);
///
/// // User agents must be valid ascii
/// assert!("😀".parse::<UserAgent>().is_err());
/// ```
#[derive(Debug, Clone)]
pub struct UserAgent(Option<Box<str>>);
71
72impl UserAgent {
73    /// A User-Agent string that matches all User-Agents.
74    ///
75    /// This is normally spelled as `*` in a robots.txt file.
76    pub const ANY: UserAgent = UserAgent(None);
77
78    fn is_wildcard(&self) -> bool {
79        self.0.is_none()
80    }
81}
82
// NOTE(review): this equality implements *matching*, not structural
// equality — the wildcard compares equal to every agent, on either side.
// Consequently `==` is not transitive ("a" == ANY and ANY == "b" but
// "a" != "b"), the `Eq` impl below does not satisfy the usual equivalence
// laws, and `Hash` is not consistent with `Eq` (ANY and "a" compare equal
// but hash differently). Using `UserAgent` as a `HashMap`/`HashSet` key is
// therefore unreliable — confirm before relying on it. The crate's tests
// and `Robots::is_allowed` depend on this matching behavior.
impl PartialEq for UserAgent {
    fn eq(&self, other: &Self) -> bool {
        match (&self.0, &other.0) {
            // Two named agents must carry the same name.
            (Some(a), Some(b)) => a == b,
            // The wildcard matches anything.
            (None, _) => true,
            (_, None) => true,
        }
    }
}

impl Eq for UserAgent {}

impl Hash for UserAgent {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        match &self.0 {
            Some(agent) => agent.hash(state),
            // The wildcard hashes as its textual spelling.
            None => "*".hash(state),
        }
    }
}
103
104impl FromStr for UserAgent {
105    type Err = UserAgentParseError;
106
107    fn from_str(s: &str) -> Result<Self, Self::Err> {
108        if s == "*" {
109            return Ok(UserAgent(None));
110        }
111
112        if s.is_empty() {
113            return Err(UserAgentParseError::EmptyUserAgent);
114        }
115
116        if !s.is_ascii() {
117            return Err(UserAgentParseError::InvalidUserAgentEncoding);
118        }
119
120        if !s.chars().all(is_rfc1945_token) {
121            return Err(UserAgentParseError::InvalidCharacters);
122        }
123
124        Ok(UserAgent(Some(s.into())))
125    }
126}
127
128impl fmt::Display for UserAgent {
129    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
130        match &self.0 {
131            Some(agent) => write!(f, "{}", agent),
132            None => write!(f, "*"),
133        }
134    }
135}
136
// Internal representation of a directive path.
#[derive(Debug, Clone, Hash)]
enum PathInner {
    /// Matches no path at all (an empty path value in the file).
    None,
    /// Matches every path (spelled `/`).
    Any,
    /// Component-wise prefix pattern; stored with a trailing `/` appended
    /// by the `FromStr` impl.
    Path(Box<Utf8Path>),
    /// Matches exactly `/robots.txt`.
    Robots,
}
144
/// A path directive in a robots.txt file.
///
/// Path directives can match any url path, spelled as `/`, no path (left empty) or a specific path.
///
/// # Examples
/// ```
/// use roboto::DirectivePath;
///
/// let path = "/foo/bar".parse::<DirectivePath>().unwrap();
/// assert!(path.matches("/foo/bar/baz"));
///
/// let path = DirectivePath::ANY;
/// assert!(path.matches("/foo/bar/baz"));
///
/// let path = DirectivePath::NONE;
/// assert!(!path.matches("/foo/bar/baz"));
/// ```
#[derive(Debug, Clone)]
pub struct DirectivePath(PathInner);
164
165impl DirectivePath {
166    /// A directive path which matches all possible paths.
167    pub const ANY: DirectivePath = DirectivePath(PathInner::Any);
168
169    /// A directive path which matches no paths
170    pub const NONE: DirectivePath = DirectivePath(PathInner::None);
171
172    /// Matches just `/robots.txt`
173    pub const ROBOTS: DirectivePath = DirectivePath(PathInner::Robots);
174
175    /// Check if a path matches this directive path.
176    pub fn matches(&self, path: &str) -> bool {
177        match &self.0 {
178            PathInner::None => false,
179            PathInner::Any => true,
180            PathInner::Path(pattern) => {
181                let path = Utf8Path::new(path);
182                path.starts_with(pattern.deref())
183            }
184            PathInner::Robots => {
185                let path = Utf8Path::new(path);
186                path == Utf8Path::new("/robots.txt")
187            }
188        }
189    }
190
191    /// Check if this directive path will match no paths.
192    pub fn is_none(&self) -> bool {
193        matches!(self.0, PathInner::None)
194    }
195
196    /// Check if this directive path will match any path.
197    pub fn is_any(&self) -> bool {
198        matches!(self.0, PathInner::Any)
199    }
200
201    /// Check if this directive path will match `/robots.txt`
202    pub fn is_robots(&self) -> bool {
203        matches!(self.0, PathInner::Robots)
204    }
205}
206
207impl fmt::Display for DirectivePath {
208    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
209        match &self.0 {
210            PathInner::None => write!(f, ""),
211            PathInner::Any => write!(f, "/"),
212            PathInner::Path(path) => write!(f, "{}", path.as_str().trim_end_matches('/')),
213            PathInner::Robots => write!(f, "/robots.txt"),
214        }
215    }
216}
217
// NOTE(review): like `UserAgent`, this is matching semantics rather than
// structural equality. `NONE` compares equal to nothing — not even itself,
// so `==` is not reflexive (which is presumably why there is no `Eq` impl);
// `ANY` compares equal to everything except `NONE` (the `None` arms are
// checked first). Derived `PartialEq` on `Directive`/`RobotAgent`/`Robots`
// inherits these semantics — confirm before using them for structural
// comparison.
impl PartialEq for DirectivePath {
    fn eq(&self, other: &Self) -> bool {
        match (&self.0, &other.0) {
            (PathInner::None, _) | (_, PathInner::None) => false,
            (PathInner::Any, _) | (_, PathInner::Any) => true,
            (PathInner::Path(a), PathInner::Path(b)) => a == b,
            (PathInner::Robots, PathInner::Robots) => true,
            _ => false,
        }
    }
}
229
230impl FromStr for DirectivePath {
231    type Err = DirectivePathParseError;
232
233    fn from_str(s: &str) -> Result<Self, Self::Err> {
234        let path = s.trim();
235
236        if path == "/" {
237            return Ok(DirectivePath::ANY);
238        }
239
240        if path == "/robots.txt" || path == "robots.txt" {
241            return Ok(DirectivePath::ROBOTS);
242        }
243
244        if path.is_empty() {
245            return Ok(DirectivePath::NONE);
246        }
247
248        if !path.is_ascii() {
249            return Err(DirectivePathParseError::InvalidCharacters);
250        }
251
252        if !path.starts_with('/') {
253            return Err(DirectivePathParseError::InvalidCharacters);
254        }
255
256        if !path.chars().all(is_rfc1945_path) {
257            return Err(DirectivePathParseError::InvalidPathEncoding);
258        }
259
260        Ok(DirectivePath(PathInner::Path(
261            (path.to_string() + "/").as_str().into(),
262        )))
263    }
264}
265
/// A directive type in a robots.txt file.
///
/// robots.txt files contain a list of directives, which can be either `Allow`, `Disallow` or an extension.
///
/// The directives control how to process the associated path.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum DirectiveType {
    /// Allow the following paths
    Allow,

    /// Disallow the following paths
    Disallow,

    /// An extension directive.
    ///
    /// Carries the rule name exactly as it appeared in the file.
    Extension(Box<str>),
}
282
283impl fmt::Display for DirectiveType {
284    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
285        match self {
286            DirectiveType::Allow => write!(f, "Allow"),
287            DirectiveType::Disallow => write!(f, "Disallow"),
288            DirectiveType::Extension(extension) => write!(f, "{}", extension),
289        }
290    }
291}
292
/// A directive in a robots.txt file, which associates a path with a directive type.
#[derive(Debug, Clone, PartialEq)]
pub struct Directive {
    // The path the rule applies to.
    path: DirectivePath,
    // What to do with paths matching `path`.
    rule: DirectiveType,
}
299
300impl FromStr for Directive {
301    type Err = DirectiveParseError;
302
303    fn from_str(s: &str) -> Result<Self, Self::Err> {
304        let d = s.split('#').next().unwrap_or("").trim();
305
306        let mut parts = d.splitn(2, ':');
307        let rule = match parts.next() {
308            Some("Allow") => DirectiveType::Allow,
309            Some("Disallow") => DirectiveType::Disallow,
310            Some(extension) if extension.chars().all(is_rfc1945_token) => {
311                DirectiveType::Extension(extension.into())
312            }
313            _ => return Err(DirectiveParseError::InvalidRule),
314        };
315
316        let path: DirectivePath = match parts.next() {
317            Some(path) => path.parse()?,
318            None => DirectivePath::NONE,
319        };
320
321        Ok(Directive { path, rule })
322    }
323}
324
325impl fmt::Display for Directive {
326    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
327        if self.path.is_none() {
328            // Doesn't print trailing whitespace.
329            write!(f, "{}:", self.rule)
330        } else {
331            write!(f, "{}: {}", self.rule, self.path)
332        }
333    }
334}
335
/// A set of User-Agents and associated directives.
///
/// This type represents a set of User-Agents and their associated directives in a robots.txt file.
/// Multiple User-Agents can be associated with the same directives, by listing them all in the same block.
///
/// This type is used by [`Robots`] to represent sets of rules and apply them together.
#[derive(Debug, Clone, PartialEq)]
pub struct RobotAgent {
    // The agents this group applies to (one `User-agent:` line each).
    agents: Vec<UserAgent>,
    // The directives shared by every agent in the group.
    directives: Vec<Directive>,
}
347
348impl fmt::Display for RobotAgent {
349    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
350        for agent in &self.agents {
351            writeln!(f, "User-agent: {}", agent)?;
352        }
353        for directive in &self.directives {
354            writeln!(f, "{}", directive)?;
355        }
356        Ok(())
357    }
358}
359
/// A robots.txt file.
///
/// The full set of rules for a robots.txt file, including wildcard directives.
///
/// This type is used to parse and apply rules to a given path and User-Agent.
///
/// # Examples
/// ```
/// use roboto::Robots;
///
/// let robots = r#"
/// User-agent: *
/// Disallow: /foo/bar
/// Allow: /hello
/// "#.parse::<Robots>().unwrap();
///
/// assert!(robots.is_allowed(&"googlebot".parse().unwrap(), "/hello"));
/// assert!(!robots.is_allowed(&"googlebot".parse().unwrap(), "/foo/bar"));
///
/// let robots = r#"
/// User-agent: googlebot
/// Disallow: /foo/bar
/// "#.parse::<Robots>().unwrap();
///
/// assert!(!robots.is_allowed(&"googlebot".parse().unwrap(), "/foo/bar"));
/// assert!(robots.is_allowed(&"googlebot".parse().unwrap(), "/hello"));
/// assert!(robots.is_allowed(&"bingbot".parse().unwrap(), "/foo/bar"));
///
/// ```
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Robots {
    /// Wildcard directives
    ///
    /// Stored separately because they should only be applied when
    /// no other agent matches.
    ///
    /// Only the first wildcard group encountered while parsing populates
    /// this list (see the internal `push`).
    pub wildcard: Vec<Directive>,

    /// Agent-specific directives
    pub agents: Vec<RobotAgent>,
}
400
impl Robots {
    // Adds a parsed group to this robots.txt.
    //
    // Wildcard (`*`) directives are split out into `self.wildcard`; any
    // named agents sharing the group keep their own copy of the directives.
    //
    // NOTE(review): only the *first* wildcard group is kept — when
    // `self.wildcard` is already populated, a later wildcard group's
    // directives are silently dropped. Confirm this precedence is intended
    // rather than merging.
    fn push(&mut self, mut agent: RobotAgent) {
        if agent.agents.iter().any(|a| a.is_wildcard()) {
            if self.wildcard.is_empty() {
                self.wildcard.extend(agent.directives.iter().cloned());
            }
            agent.agents.retain(|a| !a.is_wildcard());

            // Keep the group only if named agents remain alongside `*`.
            if !agent.agents.is_empty() {
                self.agents.push(agent);
            }
        } else {
            self.agents.push(agent);
        }
    }
}
417
418impl FromStr for Robots {
419    type Err = RobotParseError;
420
421    fn from_str(s: &str) -> Result<Self, Self::Err> {
422        let mut robots = Robots::default();
423
424        let mut agents = Vec::new();
425        let mut directives = Vec::new();
426
427        for line in s.lines() {
428            let line = line.split('#').next().unwrap_or("").trim();
429
430            if line.is_empty() {
431                continue;
432            }
433
434            // Case-insensitive parse of the user-agent line
435            if line.to_ascii_lowercase().starts_with("user-agent") {
436                if !directives.is_empty() {
437                    robots.push(RobotAgent {
438                        agents: agents.clone(),
439                        directives: directives.clone(),
440                    });
441                    agents.clear();
442                    directives.clear();
443                }
444
445                let agent = line.split_once(':').map(|x| x.1).unwrap_or("").trim();
446                agents.push(
447                    agent
448                        .parse()
449                        .map_err(|err| RobotParseError::InvalidUserAgent(err, agent.to_string()))?,
450                );
451            } else {
452                directives.push(
453                    line.parse()
454                        .map_err(|err| RobotParseError::InvalidDirective(err, line.to_string()))?,
455                );
456            }
457        }
458
459        if !(agents.is_empty() && directives.is_empty()) {
460            robots.push(RobotAgent {
461                agents: agents.clone(),
462                directives: directives.clone(),
463            });
464        }
465
466        Ok(robots)
467    }
468}
469
impl fmt::Display for Robots {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if let Some((last, remainder)) = self.agents.split_last() {
            for agent in remainder {
                // `RobotAgent`'s Display already ends with a newline, so
                // `writeln!` adds the blank line that separates groups.
                writeln!(f, "{}", agent)?;
            }

            // The last group gets no trailing blank line (plain `write!`).
            write!(f, "{}", last)?;
        };

        if !self.wildcard.is_empty() {
            // Blank-line separator, but only if named groups were printed.
            if !self.agents.is_empty() {
                writeln!(f)?;
            }
            // Wildcard directives are printed last, under `User-agent: *`.
            writeln!(f, "User-agent: *")?;
            for directive in &self.wildcard {
                writeln!(f, "{}", directive)?;
            }
        }
        Ok(())
    }
}
492
493impl Robots {
494    /// Create a new robots.txt with a wildcard directive that disallows everything.
495    pub fn deny() -> Self {
496        Self {
497            wildcard: vec![Directive {
498                path: DirectivePath::ANY,
499                rule: DirectiveType::Disallow,
500            }],
501            agents: Vec::new(),
502        }
503    }
504
505    /// Create a new robots.txt with a wildcard directive that allows everything.
506    pub fn allow() -> Self {
507        Self {
508            wildcard: vec![Directive {
509                path: DirectivePath::ANY,
510                rule: DirectiveType::Allow,
511            }],
512            agents: Vec::new(),
513        }
514    }
515
516    /// Check if a path is allowed for a given User-Agent.
517    pub fn is_allowed(&self, user_agent: &UserAgent, path: &str) -> bool {
518        // robots.txt must be always allowed.
519        if DirectivePath::ROBOTS.matches(path) {
520            return true;
521        }
522
523        for agent in &self.agents {
524            // Check if the User-Agent matches.
525            if agent.agents.iter().any(|a| a == user_agent) {
526                // Check all directives for the matched User-Agent.
527                for directive in &agent.directives {
528                    if directive.path.matches(path) {
529                        match directive.rule {
530                            DirectiveType::Allow => return true,
531                            DirectiveType::Disallow => return false,
532                            DirectiveType::Extension(_) => {}
533                        }
534                    }
535                }
536
537                // Checked all the rules for the matched User-Agent, so we can stop.
538                return true;
539            }
540        }
541
542        // User-agents which don't match any specific agent are checked against the wildcard directives.
543        for directive in &self.wildcard {
544            if directive.path.matches(path) {
545                match directive.rule {
546                    DirectiveType::Allow => return true,
547                    DirectiveType::Disallow => return false,
548                    DirectiveType::Extension(_) => {}
549                }
550            }
551        }
552
553        // By default, all pages are allowed.
554        true
555    }
556}
557
#[cfg(test)]
mod test {
    use super::*;

    use indoc::indoc;

    #[test]
    fn user_agent() {
        // Real-world agent strings contain tspecials (spaces, parens, ';'),
        // which the lenient token check accepts.
        let ua = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)";
        let ua = ua.parse::<UserAgent>().unwrap();
        // `UserAgent` equality is matching equality: `ANY` equals everything.
        assert_eq!(ua, UserAgent::ANY);
        assert_ne!(ua, "Mozilla/5.0".parse().unwrap());

        let ua = "excite".parse::<UserAgent>().unwrap();
        assert_ne!(&"googlebot".parse::<UserAgent>().unwrap(), &ua);
        let ua = "*".parse::<UserAgent>().unwrap();
        assert_eq!(&"googlebot".parse::<UserAgent>().unwrap(), &ua);
    }

    #[test]
    fn directive_path() {
        // Prefix match is component-wise: `/foo/bar` does not match `/foo`.
        let path = "/foo/bar".parse::<DirectivePath>().unwrap();
        assert!(path.matches("/foo/bar/baz"));
        assert!(!path.matches("/foo"));

        let path = DirectivePath::ANY;
        assert!(path.matches("/foo/bar/baz"));
        assert!(path.matches("/foo"));

        let path = DirectivePath::NONE;
        assert!(!path.matches("/foo/bar/baz"));
        assert!(!path.matches("/foo"));
        assert!(!path.matches(""));
    }

    #[test]
    fn directive() {
        let directive = "Allow: /foo/bar".parse::<Directive>().unwrap();
        assert_eq!(directive.rule, DirectiveType::Allow);
        assert!(matches!(directive.path, DirectivePath(PathInner::Path(_))));
        assert!(directive.path.matches("/foo/bar/baz"));
        assert!(!directive.path.matches("/foo"));

        let directive = "Disallow: /foo/bar".parse::<Directive>().unwrap();
        assert_eq!(directive.rule, DirectiveType::Disallow);
        assert!(directive.path.matches("/foo/bar/baz"));
        assert!(!directive.path.matches("/foo"));

        let directive = "Allow: /foo/bar".parse::<Directive>().unwrap();
        assert_eq!(directive.rule, DirectiveType::Allow);
        assert!(directive.path.matches("/foo/bar/baz"));
        assert!(!directive.path.matches("/foo"));

        // A rule with an empty path matches nothing.
        let directive = "Allow:".parse::<Directive>().unwrap();
        assert_eq!(directive.rule, DirectiveType::Allow);
        assert!(!directive.path.matches("/foo/bar/baz"));
        assert!(!directive.path.matches("/foo"));

        let directive = "Allow: /".parse::<Directive>().unwrap();
        assert_eq!(directive.rule, DirectiveType::Allow);
        assert!(directive.path.matches("/foo/bar/baz"));
        assert!(directive.path.matches("/foo"));
    }

    #[test]
    fn robot_txt() {
        // Worked example in the style of the classic robots.txt drafts.
        let example = indoc! {
            r#"
      # /robots.txt for http://www.fict.org/
      # comments to webmaster@fict.org

      User-agent: unhipbot
      Disallow: /

      User-agent: webcrawler
      User-agent: excite
      Disallow:

      User-agent: *
      Disallow: /org/plans.html
      Allow: /org/
      Allow: /serv
      Allow: /~mak
      Disallow: /
            "#
        }
        .parse::<Robots>()
        .unwrap();

        assert!(!example.is_allowed(&"unhipbot".parse().unwrap(), "/org/plans.html"));
        assert!(example.is_allowed(&"unhipbot".parse().unwrap(), "/robots.txt"));

        // `Disallow:` with no path blocks nothing for webcrawler/excite.
        assert!(example.is_allowed(&"webcrawler".parse().unwrap(), "/org/plans.html"));
        assert!(DirectivePath::ANY.matches("/org/plans.html"));
        assert!(example.is_allowed(&"excite".parse().unwrap(), "/org/plans.html"));

        // Unlisted agents fall through to the wildcard group.
        assert!(example.is_allowed(&"googlebot".parse().unwrap(), "/org/about.html"));
        assert!(!example.is_allowed(&"googlebot".parse().unwrap(), "/org/plans.html"));
    }

    #[test]
    fn default_deny() {
        let robots = Robots::deny();
        assert!(!robots.is_allowed(&"googlebot".parse().unwrap(), "/"));
        assert!(!robots.is_allowed(&"googlebot".parse().unwrap(), "/foo"));
        assert!(!robots.is_allowed(&"googlebot".parse().unwrap(), "/foo/bar"));
        // /robots.txt is always reachable, even under deny-all.
        assert!(robots.is_allowed(&"googlebot".parse().unwrap(), "/robots.txt"));

        let expected = indoc! {
            r#"
            User-agent: *
            Disallow: /
            "#
        };

        assert_eq!(robots.to_string().trim(), expected.trim());
    }

    // Parses `$doc` as a robots.txt and asserts that `Display` reproduces
    // the input exactly (round-trip property).
    macro_rules! test_format {
        {$doc:tt} => {
            let expected = indoc! {
                $doc
            };

            let robots: Robots = expected.parse().unwrap();

            assert_eq!(robots.to_string(), expected);
        };
    }

    #[test]
    fn format_path() {
        test_format! {
            r#"User-agent: *
            Disallow: /foo/bar
            Allow: /hello
            "#
        };
    }

    #[test]
    fn format_blank_last() {
        test_format! {
            r#"User-agent: sus
            Allow: /boobytrap
            Disallow: /

            User-agent: cool
            Disallow: /secret
            Disallow:
            "#
        };
    }

    #[test]
    fn format_wildcard() {
        test_format! {
            r#"User-agent: sus
            Disallow: /

            User-agent: cool
            Allow:

            User-agent: *
            Disallow: /foo/bar
            Allow: /hello
            "#
        };
    }
}