#![cfg_attr(docsrs, feature(doc_cfg))]
//! Fast, zero-copy parsing and matching for `robots.txt` files.
//!
//! `fast-robots` parses the standardized `User-agent`, `Allow`, and
//! `Disallow` records used by crawlers, then evaluates paths using the RFC 9309
//! matching rules: exact user-agent groups are preferred over `*`, the longest
//! matching rule wins, and `Allow` wins ties.
//!
//! Parsed values borrow from the original input, so parsing avoids copying rule
//! strings, user agents, and extension metadata. Keep the input string or byte
//! buffer alive for as long as the returned [`RobotsTxt`] is used.
//!
//! # Quick Start
//!
//! ```
//! use fast_robots::RobotsTxt;
//!
//! let robots = RobotsTxt::parse(
//!     "User-agent: *\n\
//!      Disallow: /private/\n\
//!      Allow: /private/public/\n",
//! );
//!
//! assert!(!robots.is_allowed("ExampleBot", "/private/file.html"));
//! assert!(robots.is_allowed("ExampleBot", "/private/public/file.html"));
//! ```
//!
//! # Fallible Byte Parsing
//!
//! Use the byte APIs when reading directly from files or HTTP responses. They
//! reject invalid UTF-8 and inputs larger than [`DEFAULT_MAX_BYTES`] by default.
//!
//! ```
//! # fn main() -> Result<(), fast_robots::ParseError> {
//! use fast_robots::RobotsTxt;
//!
//! let robots = RobotsTxt::parse_bytes(b"User-agent: *\nDisallow: /tmp\n")?;
//! assert!(!robots.is_allowed("ExampleBot", "/tmp/cache"));
//! # Ok(())
//! # }
//! ```
//!
//! # Diagnostics
//!
//! The parser is tolerant by default and ignores malformed lines it can recover
//! from. Use diagnostics when you want validator-style warnings alongside the
//! parsed rules.
//!
//! ```rust
//! use fast_robots::{ParseWarningKind, RobotsTxt};
//!
//! let report = RobotsTxt::parse_with_diagnostics(
//!     "Disallow: /\nMissing separator\nUser-agent: *\nDisallow: /private\n",
//! );
//!
//! assert!(matches!(
//!     report.warnings[0].kind,
//!     ParseWarningKind::RuleBeforeUserAgent { .. }
//! ));
//! assert!(matches!(
//!     report.warnings[1].kind,
//!     ParseWarningKind::MissingSeparator { .. }
//! ));
//! assert!(!report.robots.is_allowed("ExampleBot", "/private"));
//! ```
//!
//! # Extension Metadata
//!
//! With the default `extensions` feature, non-core directives such as `Sitemap`
//! and `Crawl-delay` are preserved as metadata. Extension metadata never changes
//! [`RobotsTxt::is_allowed`] decisions.
//!
//! ```rust
//! # #[cfg(feature = "extensions")]
//! # {
//! use fast_robots::RobotsTxt;
//!
//! let robots = RobotsTxt::parse(
//!     "Sitemap: https://example.com/sitemap.xml\n\
//!      User-agent: SlowBot\n\
//!      Crawl-delay: 5\n\
//!      Disallow: /slow/\n",
//! );
//!
//! assert_eq!(robots.extensions.sitemaps, ["https://example.com/sitemap.xml"]);
//! assert_eq!(robots.extensions.crawl_delays[0].agents, ["SlowBot"]);
//! assert!(!robots.is_allowed("SlowBot", "/slow/page.html"));
//! # }
//! ```
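//!
//! # Repeated Checks
//!
//! When checking many URLs against the same file, build a [`RobotsMatcher`]
//! once with [`RobotsTxt::matcher`]; it indexes user-agent groups and
//! precompiles rule patterns so repeated lookups skip that work.
//!
//! ```
//! use fast_robots::RobotsTxt;
//!
//! let robots = RobotsTxt::parse("User-agent: *\nDisallow: /private/\n");
//! let matcher = robots.matcher();
//!
//! assert!(!matcher.is_allowed("ExampleBot", "/private/page.html"));
//! assert!(matcher.is_allowed("ExampleBot", "/public/page.html"));
//! ```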

use std::collections::HashMap;

use memchr::{memchr, memmem};
use thiserror::Error;

/// Default maximum accepted input size for fallible parsing APIs.
///
/// At 512 KiB, this sits just above the 500 KiB minimum parse limit that
/// RFC 9309 expects crawlers to support, and it is the value used by
/// [`ParseOptions::default`]. Set [`ParseOptions::max_bytes`] to `None`
/// to disable the limit.
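///
/// # Examples
///
/// The default options use this constant:
///
/// ```
/// use fast_robots::{DEFAULT_MAX_BYTES, ParseOptions};
///
/// assert_eq!(ParseOptions::default().max_bytes, Some(DEFAULT_MAX_BYTES));
/// ```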
pub const DEFAULT_MAX_BYTES: usize = 512 * 1024;

/// Errors returned by fallible parsing APIs.
///
/// Soft syntax issues, such as missing separators, are not hard errors because
/// crawlers are expected to recover from malformed `robots.txt` files where
/// possible. Use [`RobotsTxt::parse_with_diagnostics`] or
/// [`RobotsTxt::parse_bytes_with_diagnostics`] to collect those warnings.
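///
/// # Examples
///
/// Both variants can be matched directly when deciding how to handle a fetch:
///
/// ```
/// use fast_robots::{ParseError, RobotsTxt};
///
/// let error = RobotsTxt::parse_bytes(&[0xff]).unwrap_err();
/// assert!(matches!(error, ParseError::Utf8(_)));
/// ```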
#[derive(Debug, Error)]
pub enum ParseError {
    /// The input bytes were not valid UTF-8.
    #[error("robots.txt is not valid UTF-8")]
    Utf8(#[from] std::str::Utf8Error),

    /// The input length exceeded [`ParseOptions::max_bytes`].
    #[error("robots.txt is too large: {len} bytes exceeds limit of {max} bytes")]
    TooLarge {
        /// Actual input length in bytes.
        len: usize,
        /// Configured maximum input length in bytes.
        max: usize,
    },
}

/// Options shared by fallible parsing APIs.
///
/// # Examples
///
/// ```
/// # fn main() -> Result<(), fast_robots::ParseError> {
/// use fast_robots::{ParseOptions, RobotsTxt};
///
/// let robots = RobotsTxt::parse_with_options(
///     "User-agent: *\nDisallow: /private\n",
///     ParseOptions { max_bytes: Some(1024) },
/// )?;
///
/// assert!(!robots.is_allowed("ExampleBot", "/private"));
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseOptions {
    /// Maximum accepted input size in bytes.
    ///
    /// `Some(DEFAULT_MAX_BYTES)` is used by default. Set to `None` to disable
    /// size checks for trusted inputs.
    pub max_bytes: Option<usize>,
}

impl Default for ParseOptions {
    fn default() -> Self {
        Self {
            max_bytes: Some(DEFAULT_MAX_BYTES),
        }
    }
}

/// Parsed rules plus any diagnostics collected during parsing.
///
/// Returned by diagnostics APIs. The parser output remains available even when
/// warnings were emitted.
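///
/// # Examples
///
/// A report keeps the tolerant parse result next to the recorded warnings:
///
/// ```
/// use fast_robots::RobotsTxt;
///
/// let report = RobotsTxt::parse_with_diagnostics("User-agent: *\nDisallow /typo\n");
///
/// assert_eq!(report.warnings.len(), 1);
/// assert!(report.robots.is_allowed("ExampleBot", "/typo"));
/// ```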
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseReport<'a> {
    /// Parsed `robots.txt` rules and extension metadata.
    pub robots: RobotsTxt<'a>,
    /// Recoverable parse warnings in source order.
    pub warnings: Vec<ParseWarning<'a>>,
}

/// A recoverable parse issue with its one-based line number.
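///
/// # Examples
///
/// Line numbers count from one, so the warning below points at the second line:
///
/// ```
/// use fast_robots::{ParseWarningKind, RobotsTxt};
///
/// let report = RobotsTxt::parse_with_diagnostics("User-agent: *\noops\n");
///
/// assert_eq!(report.warnings[0].line, 2);
/// assert!(matches!(
///     report.warnings[0].kind,
///     ParseWarningKind::MissingSeparator { .. }
/// ));
/// ```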
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseWarning<'a> {
    /// One-based line number where the warning was found.
    pub line: usize,
    /// Warning category and borrowed source data, when relevant.
    pub kind: ParseWarningKind<'a>,
}

/// Recoverable parse warning categories.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseWarningKind<'a> {
    /// A non-empty, non-comment line did not contain a `:` separator.
    MissingSeparator {
        /// Trimmed line contents.
        line: &'a str,
    },
    /// A directive had a `:` separator but no key before it.
    EmptyDirectiveKey,
    /// A `User-agent` directive had an empty value.
    EmptyUserAgent,
    /// An `Allow` or `Disallow` directive appeared before any `User-agent`.
    RuleBeforeUserAgent {
        /// Directive key that appeared before a group was started.
        key: &'a str,
    },
}

/// Parsed `robots.txt` data.
///
/// Values inside this type borrow from the original input. Use
/// [`RobotsTxt::is_allowed`] for access checks and inspect [`RobotsTxt::groups`]
/// when you need the parsed rule structure.
///
/// # Examples
///
/// ```
/// use fast_robots::{RobotsTxt, RuleKind};
///
/// let robots = RobotsTxt::parse("User-agent: *\nDisallow: /admin\n");
///
/// assert_eq!(robots.groups[0].agents, ["*"]);
/// assert_eq!(robots.groups[0].rules[0].kind, RuleKind::Disallow);
/// assert_eq!(robots.groups[0].rules[0].pattern, "/admin");
/// ```
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RobotsTxt<'a> {
    /// Standard access-control groups in source order.
    pub groups: Vec<Group<'a>>,
    /// Non-core metadata collected when the `extensions` feature is enabled.
    #[cfg(feature = "extensions")]
    #[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
    pub extensions: Extensions<'a>,
}

/// A `robots.txt` group containing one or more user agents and their rules.
///
/// Consecutive `User-agent` records before the first rule belong to the same
/// group. A later `User-agent` starts a new group after any `Allow` or
/// `Disallow` record has been seen.
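///
/// # Examples
///
/// Both agents below share one group because no rule separates them:
///
/// ```
/// use fast_robots::RobotsTxt;
///
/// let robots = RobotsTxt::parse(
///     "User-agent: FooBot\n\
///      User-agent: BarBot\n\
///      Disallow: /shared\n",
/// );
///
/// assert_eq!(robots.groups.len(), 1);
/// assert_eq!(robots.groups[0].agents, ["FooBot", "BarBot"]);
/// ```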
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Group<'a> {
    /// User-agent product tokens covered by this group.
    pub agents: Vec<&'a str>,
    /// Access-control rules associated with [`Group::agents`].
    pub rules: Vec<Rule<'a>>,
}

/// A single `Allow` or `Disallow` rule.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Rule<'a> {
    /// Whether this rule allows or disallows matching paths.
    pub kind: RuleKind,
    /// Path pattern borrowed from the directive value.
    ///
    /// Patterns may contain `*` wildcards and a trailing `$` end anchor.
    pub pattern: &'a str,
}

/// Access-control directive kind.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RuleKind {
    /// An `Allow` directive.
    Allow,
    /// A `Disallow` directive.
    Disallow,
}

/// Precompiled matcher for repeated access checks against one [`RobotsTxt`].
///
/// Build this with [`RobotsTxt::matcher`] when checking many paths against the
/// same parsed file. Construction allocates an index and precomputes rule
/// metadata, so [`RobotsTxt::is_allowed`] remains the lower-overhead option for
/// one-off checks.
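///
/// # Examples
///
/// One matcher can serve any number of path checks:
///
/// ```
/// use fast_robots::RobotsTxt;
///
/// let robots = RobotsTxt::parse("User-agent: *\nDisallow: /private\n");
/// let matcher = robots.matcher();
///
/// for path in ["/private/a", "/private/b", "/public/a"] {
///     assert_eq!(matcher.is_allowed("ExampleBot", path), path.starts_with("/public"));
/// }
/// ```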
#[derive(Debug, Clone)]
pub struct RobotsMatcher<'a> {
    agent_groups: HashMap<String, Vec<usize>>,
    fallback_groups: Vec<usize>,
    compiled_rules: Vec<Vec<CompiledRule<'a>>>,
}

#[derive(Debug, Clone, Copy)]
struct CompiledRule<'a> {
    kind: RuleKind,
    pattern: &'a str,
    anchored: bool,
    has_wildcard: bool,
    specificity: usize,
}

/// Feature-gated metadata for common non-standard directives.
///
/// These values are collected for callers that need them, but they do not affect
/// access decisions returned by [`RobotsTxt::is_allowed`].
///
/// # Examples
///
/// ```
/// use fast_robots::RobotsTxt;
///
/// let robots = RobotsTxt::parse(
///     "Sitemap: https://example.com/sitemap.xml\n\
///      Host: example.com\n\
///      User-agent: *\n\
///      Crawl-delay: 10\n",
/// );
///
/// assert_eq!(robots.extensions.sitemaps, ["https://example.com/sitemap.xml"]);
/// assert_eq!(robots.extensions.hosts, ["example.com"]);
/// assert_eq!(robots.extensions.crawl_delays[0].value, "10");
/// ```
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct Extensions<'a> {
    /// `Sitemap` directive values.
    pub sitemaps: Vec<&'a str>,
    /// `Crawl-delay` directive values, including the current group agents.
    pub crawl_delays: Vec<CrawlDelay<'a>>,
    /// `Host` directive values.
    pub hosts: Vec<&'a str>,
    /// `Clean-param` directive values.
    pub clean_params: Vec<CleanParam<'a>>,
    /// Unknown non-core directives preserved as key/value pairs.
    pub other: Vec<Directive<'a>>,
}

/// A `Crawl-delay` directive and the group agents active when it appeared.
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CrawlDelay<'a> {
    /// Current group agents at the point where the directive appeared.
    ///
    /// This is empty when `Crawl-delay` appears before any `User-agent`.
    pub agents: Vec<&'a str>,
    /// Raw `Crawl-delay` value.
    pub value: &'a str,
}

/// A `Clean-param` directive value.
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct CleanParam<'a> {
    /// Raw `Clean-param` value.
    pub value: &'a str,
}

/// A non-core directive preserved as a raw key/value pair.
#[cfg(feature = "extensions")]
#[cfg_attr(docsrs, doc(cfg(feature = "extensions")))]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Directive<'a> {
    /// Directive key as written before the `:` separator, after ASCII trim.
    pub key: &'a str,
    /// Directive value as written after the `:` separator, after ASCII trim.
    pub value: &'a str,
}

impl<'a> RobotsTxt<'a> {
    /// Parses a UTF-8 `robots.txt` string into access rules.
    ///
    /// This is tolerant and infallible: malformed lines are ignored where the
    /// parser can recover. Use [`RobotsTxt::parse_with_diagnostics`] to collect
    /// warnings, or [`RobotsTxt::parse_with_options`] to enforce a size limit.
    ///
    /// # Examples
    ///
    /// ```
    /// use fast_robots::RobotsTxt;
    ///
    /// let robots = RobotsTxt::parse("User-agent: *\nDisallow: /private\n");
    ///
    /// assert!(!robots.is_allowed("ExampleBot", "/private/file.html"));
    /// assert!(robots.is_allowed("ExampleBot", "/public/file.html"));
    /// ```
    pub fn parse(input: &'a str) -> Self {
        parse_inner(input, false).robots
    }

    /// Parses UTF-8 bytes into access rules using [`ParseOptions::default`].
    ///
    /// Returns [`ParseError::Utf8`] for invalid UTF-8 and
    /// [`ParseError::TooLarge`] when the input is larger than
    /// [`DEFAULT_MAX_BYTES`].
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), fast_robots::ParseError> {
    /// use fast_robots::RobotsTxt;
    ///
    /// let robots = RobotsTxt::parse_bytes(b"User-agent: *\nDisallow: /tmp\n")?;
    /// assert!(!robots.is_allowed("ExampleBot", "/tmp/cache"));
    /// # Ok(())
    /// # }
    /// ```
    pub fn parse_bytes(input: &'a [u8]) -> Result<Self, ParseError> {
        Self::parse_bytes_with_options(input, ParseOptions::default())
    }

    /// Parses UTF-8 bytes into access rules with explicit options.
    ///
    /// Use this when reading raw bytes and you need a custom size limit.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), fast_robots::ParseError> {
    /// use fast_robots::{ParseOptions, RobotsTxt};
    ///
    /// let robots = RobotsTxt::parse_bytes_with_options(
    ///     b"User-agent: *\nDisallow: /cache\n",
    ///     ParseOptions { max_bytes: Some(1024) },
    /// )?;
    ///
    /// assert!(!robots.is_allowed("ExampleBot", "/cache/file"));
    /// # Ok(())
    /// # }
    /// ```
    pub fn parse_bytes_with_options(
        input: &'a [u8],
        options: ParseOptions,
    ) -> Result<Self, ParseError> {
        check_size(input.len(), options)?;
        let input = std::str::from_utf8(input)?;
        Ok(Self::parse(input))
    }

    /// Parses a UTF-8 string into access rules with explicit options.
    ///
    /// This is useful when the input is already a `str` but should still be
    /// checked against a maximum size.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), fast_robots::ParseError> {
    /// use fast_robots::{ParseOptions, RobotsTxt};
    ///
    /// let robots = RobotsTxt::parse_with_options(
    ///     "User-agent: *\nDisallow: /private\n",
    ///     ParseOptions { max_bytes: Some(1024) },
    /// )?;
    ///
    /// assert!(!robots.is_allowed("ExampleBot", "/private"));
    /// # Ok(())
    /// # }
    /// ```
    pub fn parse_with_options(input: &'a str, options: ParseOptions) -> Result<Self, ParseError> {
        check_size(input.len(), options)?;
        Ok(Self::parse(input))
    }

    /// Parses a UTF-8 string and records recoverable syntax warnings.
    ///
    /// Diagnostics do not change parser recovery behavior; they only expose the
    /// issues that tolerant parsing skipped.
    ///
    /// # Examples
    ///
    /// ```
    /// use fast_robots::{ParseWarningKind, RobotsTxt};
    ///
    /// let report = RobotsTxt::parse_with_diagnostics(
    ///     "Disallow: /\nMissing separator\nUser-agent: *\nDisallow: /private\n",
    /// );
    ///
    /// assert_eq!(report.warnings.len(), 2);
    /// assert!(matches!(
    ///     report.warnings[0].kind,
    ///     ParseWarningKind::RuleBeforeUserAgent { .. }
    /// ));
    /// assert!(!report.robots.is_allowed("ExampleBot", "/private"));
    /// ```
    pub fn parse_with_diagnostics(input: &'a str) -> ParseReport<'a> {
        parse_inner(input, true)
    }

    /// Parses a UTF-8 string with diagnostics and explicit options.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), fast_robots::ParseError> {
    /// use fast_robots::{ParseOptions, RobotsTxt};
    ///
    /// let report = RobotsTxt::parse_with_diagnostics_options(
    ///     "User-agent: *\nDisallow: /private\n",
    ///     ParseOptions { max_bytes: Some(1024) },
    /// )?;
    ///
    /// assert!(report.warnings.is_empty());
    /// assert!(!report.robots.is_allowed("ExampleBot", "/private"));
    /// # Ok(())
    /// # }
    /// ```
    pub fn parse_with_diagnostics_options(
        input: &'a str,
        options: ParseOptions,
    ) -> Result<ParseReport<'a>, ParseError> {
        check_size(input.len(), options)?;
        Ok(parse_inner(input, true))
    }

    /// Parses UTF-8 bytes and records recoverable syntax warnings.
    ///
    /// Uses [`ParseOptions::default`] for size checking.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), fast_robots::ParseError> {
    /// use fast_robots::RobotsTxt;
    ///
    /// let report = RobotsTxt::parse_bytes_with_diagnostics(
    ///     b"User-agent: *\nDisallow: /private\n",
    /// )?;
    ///
    /// assert!(report.warnings.is_empty());
    /// assert!(!report.robots.is_allowed("ExampleBot", "/private"));
    /// # Ok(())
    /// # }
    /// ```
    pub fn parse_bytes_with_diagnostics(input: &'a [u8]) -> Result<ParseReport<'a>, ParseError> {
        Self::parse_bytes_with_diagnostics_options(input, ParseOptions::default())
    }

    /// Parses UTF-8 bytes with diagnostics and explicit options.
    ///
    /// # Examples
    ///
    /// ```
    /// # fn main() -> Result<(), fast_robots::ParseError> {
    /// use fast_robots::{ParseOptions, RobotsTxt};
    ///
    /// let report = RobotsTxt::parse_bytes_with_diagnostics_options(
    ///     b"User-agent: *\nDisallow: /private\n",
    ///     ParseOptions { max_bytes: Some(1024) },
    /// )?;
    ///
    /// assert!(report.warnings.is_empty());
    /// # Ok(())
    /// # }
    /// ```
    pub fn parse_bytes_with_diagnostics_options(
        input: &'a [u8],
        options: ParseOptions,
    ) -> Result<ParseReport<'a>, ParseError> {
        check_size(input.len(), options)?;
        let input = std::str::from_utf8(input)?;
        Ok(parse_inner(input, true))
    }

    /// Builds an indexed matcher for repeated access checks.
    ///
    /// The returned matcher borrows this parsed file, indexes user-agent groups,
    /// and precomputes rule metadata. Use it when checking many URLs against the
    /// same `robots.txt`; for one-off checks, [`RobotsTxt::is_allowed`] avoids
    /// the upfront allocation cost.
    ///
    /// # Examples
    ///
    /// ```
    /// use fast_robots::RobotsTxt;
    ///
    /// let robots = RobotsTxt::parse("User-agent: *\nDisallow: /private\n");
    /// let matcher = robots.matcher();
    ///
    /// assert!(!matcher.is_allowed("ExampleBot", "/private/file"));
    /// assert!(matcher.is_allowed("ExampleBot", "/public/file"));
    /// ```
    pub fn matcher(&'a self) -> RobotsMatcher<'a> {
        RobotsMatcher::new(self)
    }

    /// Returns whether `user_agent` may crawl `path`.
    ///
    /// This check implements the core RFC 9309 access semantics used by this
    /// crate: groups whose user agents match exactly (ASCII case-insensitively)
    /// take precedence over the `*` fallback, all exact matching groups are
    /// merged, the longest matching pattern wins, and `Allow` wins ties.
    /// `/robots.txt` is always allowed.
    ///
    /// `path` should be the URL path and optional query string, not a full URL.
    ///
    /// # Examples
    ///
    /// ```
    /// use fast_robots::RobotsTxt;
    ///
    /// let robots = RobotsTxt::parse(
    ///     "User-agent: *\n\
    ///      Disallow: /private\n\
    ///      Allow: /private/public\n",
    /// );
    ///
    /// assert!(!robots.is_allowed("ExampleBot", "/private/file"));
    /// assert!(robots.is_allowed("ExampleBot", "/private/public/file"));
    /// assert!(robots.is_allowed("ExampleBot", "/robots.txt"));
    /// ```
    pub fn is_allowed(&self, user_agent: &str, path: &str) -> bool {
        if path == "/robots.txt" {
            return true;
        }

        let mut exact_match = false;
        let mut best: Option<(usize, RuleKind)> = None;

        for group in &self.groups {
            if group
                .agents
                .iter()
                .any(|agent| *agent != "*" && agent.eq_ignore_ascii_case(user_agent))
            {
                exact_match = true;
                apply_group_rules(group, path, &mut best);
            }
        }

        if !exact_match {
            for group in &self.groups {
                if group.agents.contains(&"*") {
                    apply_group_rules(group, path, &mut best);
                }
            }
        }

        rule_decision(best)
    }
}

impl<'a> RobotsMatcher<'a> {
    fn new(robots: &'a RobotsTxt<'a>) -> Self {
        let groups = robots.groups.as_slice();
        let mut agent_groups: HashMap<String, Vec<usize>> = HashMap::new();
        let mut fallback_groups = Vec::new();
        let mut compiled_rules = Vec::with_capacity(groups.len());

        for (group_index, group) in groups.iter().enumerate() {
            for agent in &group.agents {
                if *agent == "*" {
                    fallback_groups.push(group_index);
                } else {
                    let indexes = agent_groups.entry(agent.to_ascii_lowercase()).or_default();
                    if !indexes.contains(&group_index) {
                        indexes.push(group_index);
                    }
                }
            }

            compiled_rules.push(group.rules.iter().filter_map(CompiledRule::new).collect());
        }

        Self {
            agent_groups,
            fallback_groups,
            compiled_rules,
        }
    }

    /// Returns whether `user_agent` may crawl `path` using the prebuilt index.
    ///
    /// This has the same access semantics as [`RobotsTxt::is_allowed`]: exact
    /// user-agent precedence over `*`, merged exact groups, longest-match rule
    /// selection, `Allow` winning ties, and the implicit allowance of
    /// `/robots.txt`.
    pub fn is_allowed(&self, user_agent: &str, path: &str) -> bool {
        if path == "/robots.txt" {
            return true;
        }

        let mut best: Option<(usize, RuleKind)> = None;
        let agent = user_agent.to_ascii_lowercase();

        if let Some(group_indexes) = self.agent_groups.get(&agent) {
            self.apply_group_indexes(group_indexes, path, &mut best);
        } else {
            self.apply_group_indexes(&self.fallback_groups, path, &mut best);
        }

        rule_decision(best)
    }

    fn apply_group_indexes(
        &self,
        group_indexes: &[usize],
        path: &str,
        best: &mut Option<(usize, RuleKind)>,
    ) {
        for &group_index in group_indexes {
            apply_compiled_rules(&self.compiled_rules[group_index], path, best);
        }
    }
}

impl<'a> CompiledRule<'a> {
    fn new(rule: &Rule<'a>) -> Option<Self> {
        if rule.pattern.is_empty() {
            return None;
        }

        let (pattern, anchored) = strip_end_anchor(rule.pattern);

        Some(Self {
            kind: rule.kind,
            pattern,
            anchored,
            has_wildcard: pattern.as_bytes().contains(&b'*'),
            specificity: pattern.len(),
        })
    }

    fn matching_specificity(self, path: &str) -> Option<usize> {
        let matched = if self.has_wildcard {
            glob_matches(self.pattern.as_bytes(), path.as_bytes(), self.anchored)
        } else if self.anchored {
            path == self.pattern
        } else {
            path.starts_with(self.pattern)
        };

        matched.then_some(self.specificity)
    }
}

/// Checks an input length against the configured parser size limit.
fn check_size(len: usize, options: ParseOptions) -> Result<(), ParseError> {
    if let Some(max) = options.max_bytes {
        if len > max {
            return Err(ParseError::TooLarge { len, max });
        }
    }

    Ok(())
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DirectiveKind {
    UserAgent,
    Allow,
    Disallow,
    #[cfg(feature = "extensions")]
    Sitemap,
    #[cfg(feature = "extensions")]
    CrawlDelay,
    #[cfg(feature = "extensions")]
    Host,
    #[cfg(feature = "extensions")]
    CleanParam,
    Other,
}

fn classify_directive_key(key: &str) -> DirectiveKind {
    match key.as_bytes() {
        b"Allow" | b"allow" => return DirectiveKind::Allow,
        b"Disallow" | b"disallow" => return DirectiveKind::Disallow,
        b"User-agent" | b"user-agent" => return DirectiveKind::UserAgent,
        #[cfg(feature = "extensions")]
        b"Host" | b"host" => return DirectiveKind::Host,
        #[cfg(feature = "extensions")]
        b"Sitemap" | b"sitemap" => return DirectiveKind::Sitemap,
        #[cfg(feature = "extensions")]
        b"Crawl-delay" | b"crawl-delay" => return DirectiveKind::CrawlDelay,
        #[cfg(feature = "extensions")]
        b"Clean-param" | b"clean-param" => return DirectiveKind::CleanParam,
        _ => {}
    }

    classify_directive_key_ignore_case(key)
}

#[cold]
#[inline(never)]
fn classify_directive_key_ignore_case(key: &str) -> DirectiveKind {
    match key.len() {
        5 if key.eq_ignore_ascii_case("allow") => DirectiveKind::Allow,
        8 if key.eq_ignore_ascii_case("disallow") => DirectiveKind::Disallow,
        10 if key.eq_ignore_ascii_case("user-agent") => DirectiveKind::UserAgent,
        #[cfg(feature = "extensions")]
        4 if key.eq_ignore_ascii_case("host") => DirectiveKind::Host,
        #[cfg(feature = "extensions")]
        7 if key.eq_ignore_ascii_case("sitemap") => DirectiveKind::Sitemap,
        #[cfg(feature = "extensions")]
        11 if key.eq_ignore_ascii_case("crawl-delay") => DirectiveKind::CrawlDelay,
        #[cfg(feature = "extensions")]
        11 if key.eq_ignore_ascii_case("clean-param") => DirectiveKind::CleanParam,
        _ => DirectiveKind::Other,
    }
}

fn new_group<'a>(agent: &'a str) -> Group<'a> {
    Group {
        agents: vec![agent],
        rules: Vec::with_capacity(4),
    }
}

/// Shared parser implementation for tolerant and diagnostics-enabled parsing.
///
/// The parser walks the file one line at a time, strips comments and ASCII
/// whitespace, tracks the current user-agent group, and optionally records soft
/// failures as [`ParseWarning`] values.
fn parse_inner<'a>(input: &'a str, diagnostics: bool) -> ParseReport<'a> {
    let mut groups = vec![];
    let mut current: Option<Group<'a>> = None;
    let mut current_has_rules = false;
    let mut warnings = vec![];

    #[cfg(feature = "extensions")]
    let mut extensions = Extensions::default();

    for (line_number, line) in Lines::new(input) {
        let line = trim_ascii(strip_comment(line));
        if line.is_empty() {
            continue;
        }

        let Some((key, value)) = split_directive(line) else {
            if diagnostics {
                warnings.push(ParseWarning {
                    line: line_number,
                    kind: ParseWarningKind::MissingSeparator { line },
                });
            }
            continue;
        };

        let key = trim_ascii(key);
        let value = trim_ascii(value);
        if key.is_empty() {
            if diagnostics {
                warnings.push(ParseWarning {
                    line: line_number,
                    kind: ParseWarningKind::EmptyDirectiveKey,
                });
            }
            continue;
        }

        let directive = classify_directive_key(key);

        match directive {
            DirectiveKind::UserAgent => {
                if value.is_empty() {
                    if diagnostics {
                        warnings.push(ParseWarning {
                            line: line_number,
                            kind: ParseWarningKind::EmptyUserAgent,
                        });
                    }
                    continue;
                }

                match current.as_mut() {
                    Some(group) if !current_has_rules => group.agents.push(value),
                    Some(_) => {
                        groups.push(current.take().expect("current group exists"));
                        current = Some(new_group(value));
                        current_has_rules = false;
                    }
                    None => {
                        current = Some(new_group(value));
                    }
                }
            }
            DirectiveKind::Allow | DirectiveKind::Disallow => {
                let Some(group) = current.as_mut() else {
                    if diagnostics {
                        warnings.push(ParseWarning {
                            line: line_number,
                            kind: ParseWarningKind::RuleBeforeUserAgent { key },
                        });
                    }
                    continue;
                };

                let kind = match directive {
                    DirectiveKind::Allow => RuleKind::Allow,
                    DirectiveKind::Disallow => RuleKind::Disallow,
                    _ => unreachable!("only allow/disallow directives reach this branch"),
                };

                group.rules.push(Rule {
                    kind,
                    pattern: value,
                });
                current_has_rules = true;
            }
            _ => {
                #[cfg(feature = "extensions")]
                collect_extension(&mut extensions, current.as_ref(), directive, key, value);
            }
        }
    }

    if let Some(group) = current {
        groups.push(group);
    }

    ParseReport {
        robots: RobotsTxt {
            groups,
            #[cfg(feature = "extensions")]
            extensions,
        },
        warnings,
    }
}

/// Applies matching rules from a group to the current best access decision.
///
/// `best` stores the specificity and kind of the strongest matching rule seen
/// so far. More specific patterns replace less specific ones, and `Allow`
/// replaces `Disallow` on ties.
fn apply_group_rules(group: &Group<'_>, path: &str, best: &mut Option<(usize, RuleKind)>) {
    for rule in &group.rules {
        let Some(specificity) = matching_specificity(rule.pattern, path) else {
            continue;
        };

        apply_rule_decision(specificity, rule.kind, best);
    }
}

fn apply_compiled_rules(
    rules: &[CompiledRule<'_>],
    path: &str,
    best: &mut Option<(usize, RuleKind)>,
) {
    for rule in rules {
        let Some(specificity) = rule.matching_specificity(path) else {
            continue;
        };

        apply_rule_decision(specificity, rule.kind, best);
    }
}

fn apply_rule_decision(specificity: usize, kind: RuleKind, best: &mut Option<(usize, RuleKind)>) {
    let should_replace = !matches!(
        *best,
        Some((best_specificity, best_kind))
            if specificity < best_specificity
                || (specificity == best_specificity
                    && !(kind == RuleKind::Allow && best_kind == RuleKind::Disallow))
    );

    if should_replace {
        *best = Some((specificity, kind));
    }
}

fn rule_decision(best: Option<(usize, RuleKind)>) -> bool {
    match best {
        Some((_, RuleKind::Allow)) | None => true,
        Some((_, RuleKind::Disallow)) => false,
    }
}

/// Returns matching specificity for robots longest-match rule selection.
///
/// Patterns without wildcards use the common prefix fast path. A trailing `$`
/// requires the match to consume the whole path but does not increase
/// specificity.
fn matching_specificity(pattern: &str, path: &str) -> Option<usize> {
    if pattern.is_empty() {
        return None;
    }

    let (pattern, anchored) = strip_end_anchor(pattern);
    let matched = if pattern.as_bytes().contains(&b'*') {
        glob_matches(pattern.as_bytes(), path.as_bytes(), anchored)
    } else if anchored {
        path == pattern
    } else {
        path.starts_with(pattern)
    };

    matched.then_some(pattern.len())
}

/// Matches a `*` wildcard pattern against a path byte slice.
///
/// Callers only invoke this for patterns that contain at least one `*`. The
/// first pattern segment must match at the start of the path, and middle
/// segments are located in order with SIMD-backed substring search. For
/// anchored patterns, the final segment is matched against the end of the
/// path, so `/a*b$` matches `/ab_b` even though an earlier `b` exists.
fn glob_matches(pattern: &[u8], path: &[u8], anchored: bool) -> bool {
    if anchored {
        // Split off everything after the last `*`. Matching that suffix at
        // the end of the path, rather than at its first occurrence, avoids
        // false negatives when the final segment appears more than once.
        let split = pattern
            .iter()
            .rposition(|byte| *byte == b'*')
            .map_or(0, |index| index + 1);
        let (head, tail) = pattern.split_at(split);

        if !path.ends_with(tail) {
            return false;
        }

        // `head` is empty or ends with `*`, so the rest of the match is a
        // plain unanchored prefix match against the remaining path.
        return glob_matches(head, &path[..path.len() - tail.len()], false);
    }

    let mut parts = pattern.split(|byte| *byte == b'*');
    let Some(first) = parts.next() else {
        return true;
    };

    if !path.starts_with(first) {
        return false;
    }

    let mut offset = first.len();

    for part in parts {
        if part.is_empty() {
            continue;
        }

        let Some(found) = memmem::find(&path[offset..], part) else {
            return false;
        };
        offset += found + part.len();
    }

    true
}

fn strip_end_anchor(pattern: &str) -> (&str, bool) {
    match pattern.strip_suffix('$') {
        Some(pattern) => (pattern, true),
        None => (pattern, false),
    }
}

#[cfg(feature = "extensions")]
/// Stores a non-core directive in the feature-gated extension metadata.
///
/// Extension directives intentionally do not alter group boundaries or access
/// rules. `Crawl-delay` snapshots the current group agents so callers can
/// associate the value with the group where it appeared.
fn collect_extension<'a>(
    extensions: &mut Extensions<'a>,
    current: Option<&Group<'a>>,
    directive: DirectiveKind,
    key: &'a str,
    value: &'a str,
) {
    match directive {
        DirectiveKind::Sitemap => {
            if !value.is_empty() {
                extensions.sitemaps.push(value);
            }
        }
        DirectiveKind::CrawlDelay => {
            extensions.crawl_delays.push(CrawlDelay {
                agents: current
                    .map(|group| group.agents.clone())
                    .unwrap_or_default(),
                value,
            });
        }
        DirectiveKind::Host => {
            if !value.is_empty() {
                extensions.hosts.push(value);
            }
        }
        DirectiveKind::CleanParam => {
            if !value.is_empty() {
                extensions.clean_params.push(CleanParam { value });
            }
        }
        _ => {
            extensions.other.push(Directive { key, value });
        }
    }
}

/// Removes an inline `#` comment from a line.
fn strip_comment(line: &str) -> &str {
    match memchr(b'#', line.as_bytes()) {
        Some(index) => &line[..index],
        None => line,
    }
}

/// Splits a directive line into raw key and value slices.
///
/// Only the first `:` is structural; additional colons remain part of the value.
fn split_directive(line: &str) -> Option<(&str, &str)> {
    let index = memchr(b':', line.as_bytes())?;
    Some((&line[..index], &line[index + 1..]))
}

/// Trims ASCII spaces and tabs from both ends of a directive fragment.
///
/// Robots directives are byte-oriented, so this deliberately avoids full
/// Unicode whitespace handling.
fn trim_ascii(value: &str) -> &str {
    let bytes = value.as_bytes();
    let Some((&first, rest)) = bytes.split_first() else {
        return value;
    };
    let last = rest.last().copied().unwrap_or(first);

    if !matches!(first, b' ' | b'\t') && !matches!(last, b' ' | b'\t') {
        return value;
    }

    let mut start = 0;
    let mut end = bytes.len();

    while start < end && matches!(bytes[start], b' ' | b'\t') {
        start += 1;
    }
    while end > start && matches!(bytes[end - 1], b' ' | b'\t') {
        end -= 1;
    }

    &value[start..end]
}

/// Iterator over input lines with one-based source line numbers.
///
/// Handles both LF and CRLF endings while keeping returned line slices borrowed
/// from the original input.
struct Lines<'a> {
    input: &'a str,
    offset: usize,
    line: usize,
}

impl<'a> Lines<'a> {
    /// Creates a line iterator for `input`.
    fn new(input: &'a str) -> Self {
        Self {
            input,
            offset: 0,
            line: 1,
        }
    }
}

impl<'a> Iterator for Lines<'a> {
    type Item = (usize, &'a str);

    /// Returns the next line and its one-based line number.
    fn next(&mut self) -> Option<Self::Item> {
        if self.offset > self.input.len() {
            return None;
        }

        let remaining = &self.input[self.offset..];
        if remaining.is_empty() {
            self.offset += 1;
            return None;
        }

        let line_end = memchr(b'\n', remaining.as_bytes()).unwrap_or(remaining.len());
        let mut line = &remaining[..line_end];
        if let Some(stripped) = line.strip_suffix('\r') {
            line = stripped;
        }

        let line_number = self.line;
        self.line += 1;
        self.offset += line_end + 1;
        Some((line_number, line))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn parses_groups_comments_and_crlf() {
        let robots = RobotsTxt::parse(
            "# ignored\r\nUser-agent: FooBot\r\nUser-agent: BarBot # same group\r\nDisallow: /private\r\nAllow: /private/public\r\n",
        );

        assert_eq!(robots.groups.len(), 1);
        assert_eq!(robots.groups[0].agents, vec!["FooBot", "BarBot"]);
        assert_eq!(robots.groups[0].rules.len(), 2);
        assert!(!robots.is_allowed("FooBot", "/private/file"));
        assert!(robots.is_allowed("FooBot", "/private/public/file"));
    }

    #[test]
    fn parses_directive_keys_case_insensitively() {
        let robots =
            RobotsTxt::parse("uSeR-aGeNt: FooBot\nDiSaLlOw: /private\nAlLoW: /private/public\n");

        assert!(!robots.is_allowed("FooBot", "/private/file"));
        assert!(robots.is_allowed("FooBot", "/private/public/file"));
    }

    #[test]
    fn ignores_rules_before_first_user_agent() {
        let robots = RobotsTxt::parse("Disallow: /\nUser-agent: *\nAllow: /\n");

        assert!(robots.is_allowed("AnyBot", "/anything"));
    }

    #[test]
    fn starts_new_group_after_rules() {
        let robots = RobotsTxt::parse(
            "User-agent: FooBot\nDisallow: /foo\nUser-agent: BarBot\nDisallow: /bar\n",
        );

        assert_eq!(robots.groups.len(), 2);
        assert!(!robots.is_allowed("FooBot", "/foo"));
        assert!(robots.is_allowed("FooBot", "/bar"));
        assert!(!robots.is_allowed("BarBot", "/bar"));
    }

    #[test]
    fn merges_multiple_exact_matching_groups() {
        let robots = RobotsTxt::parse(
            "User-agent: FooBot\nDisallow: /foo\n\nUser-agent: FooBot\nDisallow: /bar\n",
        );

        assert!(!robots.is_allowed("FooBot", "/foo"));
        assert!(!robots.is_allowed("FooBot", "/bar"));
    }

    #[test]
    fn falls_back_to_star_group() {
        let robots =
            RobotsTxt::parse("User-agent: *\nDisallow: /all\nUser-agent: FooBot\nAllow: /\n");

        assert!(!robots.is_allowed("OtherBot", "/all"));
        assert!(robots.is_allowed("FooBot", "/all"));
    }

    #[test]
    fn longest_match_wins_and_allow_wins_ties() {
        let robots = RobotsTxt::parse(
            "User-agent: *\nDisallow: /example/\nAllow: /example/public\nDisallow: /tie\nAllow: /tie\n",
        );

        assert!(!robots.is_allowed("AnyBot", "/example/private"));
        assert!(robots.is_allowed("AnyBot", "/example/public/page"));
        assert!(robots.is_allowed("AnyBot", "/tie"));
    }

    #[test]
    fn supports_wildcard_and_end_anchor() {
        let robots = RobotsTxt::parse("User-agent: *\nDisallow: /*.gif$\nAllow: /public/*.gif$\n");

        assert!(!robots.is_allowed("AnyBot", "/images/a.gif"));
        assert!(robots.is_allowed("AnyBot", "/images/a.gif?size=large"));
        assert!(robots.is_allowed("AnyBot", "/public/a.gif"));
    }

    #[test]
    fn empty_disallow_does_not_block() {
        let robots = RobotsTxt::parse("User-agent: *\nDisallow:\n");

        assert!(robots.is_allowed("AnyBot", "/anything"));
    }

    #[test]
    fn robots_txt_is_implicitly_allowed() {
        let robots = RobotsTxt::parse("User-agent: *\nDisallow: /\n");

        assert!(robots.is_allowed("AnyBot", "/robots.txt"));
    }

    #[test]
    fn compiled_matcher_matches_regular_matcher_for_core_rules() {
        let robots = RobotsTxt::parse(
            "User-agent: FooBot\n\
            Disallow: /foo\n\
            \n\
            User-agent: FooBot\n\
            Disallow: /bar\n\
            Allow: /bar/public\n\
            Disallow: /tie\n\
            Allow: /tie\n\
            \n\
            User-agent: ImageBot\n\
            Disallow: /*.gif$\n\
            Allow: /public/*.gif$\n\
            \n\
            User-agent: *\n\
            Disallow: /fallback\n",
        );
        let matcher = robots.matcher();

        for (agent, path) in [
            ("FooBot", "/foo/page"),
            ("FooBot", "/bar/page"),
            ("FooBot", "/bar/public/page"),
            ("FooBot", "/tie"),
            ("ImageBot", "/images/a.gif"),
            ("ImageBot", "/images/a.gif?size=large"),
            ("ImageBot", "/public/a.gif"),
            ("OtherBot", "/fallback/page"),
            ("OtherBot", "/public/page"),
            ("OtherBot", "/robots.txt"),
        ] {
            assert_eq!(
                matcher.is_allowed(agent, path),
                robots.is_allowed(agent, path),
                "compiled matcher differed for {agent} {path}"
            );
        }
    }

    #[test]
    fn parse_bytes_rejects_invalid_utf8() {
        let error = RobotsTxt::parse_bytes(&[0xff]).expect_err("invalid UTF-8 should fail");

        assert!(matches!(error, ParseError::Utf8(_)));
    }

    #[test]
    fn parse_with_options_rejects_oversized_input() {
        let error =
            RobotsTxt::parse_with_options("User-agent: *\n", ParseOptions { max_bytes: Some(4) })
                .expect_err("oversized input should fail");

        assert!(matches!(error, ParseError::TooLarge { len: 14, max: 4 }));
    }

    #[test]
    fn parse_with_options_allows_disabled_limit() {
        let robots = RobotsTxt::parse_with_options(
            "User-agent: *\nDisallow: /private\n",
            ParseOptions { max_bytes: None },
        )
        .expect("disabled size limit should parse");

        assert!(!robots.is_allowed("AnyBot", "/private"));
    }

    #[test]
    fn diagnostics_report_soft_parse_issues() {
        let report = RobotsTxt::parse_with_diagnostics(
            "Disallow: /\nMissing separator\n: value\nUser-agent:\nUser-agent: *\nDisallow: /private\n",
        );

        assert_eq!(report.warnings.len(), 4);
        assert_eq!(
            report.warnings,
            vec![
                ParseWarning {
                    line: 1,
                    kind: ParseWarningKind::RuleBeforeUserAgent { key: "Disallow" },
                },
                ParseWarning {
                    line: 2,
                    kind: ParseWarningKind::MissingSeparator {
                        line: "Missing separator",
                    },
                },
                ParseWarning {
                    line: 3,
                    kind: ParseWarningKind::EmptyDirectiveKey,
                },
                ParseWarning {
                    line: 4,
                    kind: ParseWarningKind::EmptyUserAgent,
                },
            ]
        );
        assert!(!report.robots.is_allowed("AnyBot", "/private"));
    }

    #[cfg(feature = "extensions")]
    #[test]
    fn collects_extensions_without_changing_groups() {
        let robots = RobotsTxt::parse(
            "Sitemap: https://example.com/sitemap.xml\nUser-agent: Bingbot\nCrawl-delay: 5\nDisallow: /slow\nHost: example.com\nClean-param: ref /shop\nX-Test: value\n",
        );

        assert_eq!(
            robots.extensions.sitemaps,
            vec!["https://example.com/sitemap.xml"]
        );
        assert_eq!(robots.extensions.crawl_delays.len(), 1);
        assert_eq!(robots.extensions.crawl_delays[0].agents, vec!["Bingbot"]);
        assert_eq!(robots.extensions.crawl_delays[0].value, "5");
        assert_eq!(robots.extensions.hosts, vec!["example.com"]);
        assert_eq!(robots.extensions.clean_params[0].value, "ref /shop");
        assert_eq!(robots.extensions.other[0].key, "X-Test");
        assert!(!robots.is_allowed("Bingbot", "/slow"));
    }
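
    // A small sanity test for the ASCII-only trimming helper; non-ASCII
    // whitespace is deliberately left in place.
    #[test]
    fn trims_only_ascii_spaces_and_tabs() {
        assert_eq!(trim_ascii("  value\t"), "value");
        assert_eq!(trim_ascii(""), "");
        assert_eq!(trim_ascii(" \t"), "");
        assert_eq!(trim_ascii("\u{a0}value"), "\u{a0}value");
    }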
}