robots_parser/
lib.rs

//! A robots.txt parser and applicability checker for Rust
//!
//! The parser is implemented and tested against
//! <http://www.robotstxt.org/norobots-rfc.txt>
//!
//! # Usage
//!
//! Add it to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! robots-parser = "0.10"
//! ```
//!
//! # Example
//!
//! Fetching and checking a live robots.txt requires the `web` feature:
//!
//! ```rust,ignore
//! use robots::RobotsParser;
//! use url::Url;
//!
//! fn main() -> Result<(), Box<dyn std::error::Error>> {
//!     let url = Url::parse("https://www.google.com/robots.txt")?;
//!     let parsed = RobotsParser::parse_url(url)?;
//!     assert!(parsed.can_fetch("*", "https://www.google.com/search/about"));
//!     Ok(())
//! }
//! ```
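//!
//! Parsing from an in-memory string needs no extra features. A minimal
//! sketch with hypothetical content (record lines are CRLF-terminated):
//!
//! ```rust,ignore
//! use robots::RobotsParser;
//!
//! let robots = RobotsParser::parse("User-Agent: *\r\nDisallow: /private/\r\n").unwrap();
//! assert!(!robots.can_fetch("*", "https://example.com/private/data"));
//! ```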

use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case, take_until, take_while, take_while1};
use nom::combinator::{cond, map_opt, opt};
use nom::sequence::tuple;
use nom::IResult;

use url::percent_encoding::percent_decode;
use url::Url;

use std::fs;
use std::path::Path;

#[derive(Debug, Eq, PartialEq, Clone)]
pub struct RobotsParser {
    rules: Vec<Robots>,
}

#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Robots {
    UserAgent(String, Vec<Rule>),
    GlobalRule(Rule),
}

impl Robots {
    fn is_applicable(&self, agent: &str, path: &str) -> bool {
        match self {
            Robots::UserAgent(s, _) => {
                // Agent names are matched case-insensitively on the product
                // token, e.g. "Googlebot/2.1" is reduced to "googlebot"
                let cleaned_user_agent = agent.split('/').next().unwrap_or("");
                s == "*" || s.to_lowercase() == cleaned_user_agent.to_lowercase()
            }
            Robots::GlobalRule(rule) => rule.is_applicable(path),
        }
    }

    // Precondition: applicability has been proven
    fn is_allowed(&self, path: &str) -> bool {
        match self {
            Robots::UserAgent(_, rules) => {
                for rule in rules {
                    if rule.is_applicable(path) {
                        return rule.allowed();
                    }
                }
            }
            Robots::GlobalRule(rule) => return rule.allowed(),
        }
        false
    }
}

#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Rule {
    Allow(String),
    Disallow(String),
    Extension,
}

impl Rule {
    fn is_applicable(&self, path: &str) -> bool {
        let own_path = match self {
            Rule::Allow(s) | Rule::Disallow(s) => s,
            _ => "",
        };

        own_path == "*" || path.starts_with(own_path)
    }

    // Precondition: applicability has been proven
    fn allowed(&self) -> bool {
        match self {
            Rule::Allow(_) => true,
            _ => false,
        }
    }
}

impl RobotsParser {
    /// Creates a new `RobotsParser` from the given `Robots` rules.
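    ///
    /// A minimal sketch of assembling a parser by hand (hypothetical rules):
    ///
    /// ```rust,ignore
    /// let robots = RobotsParser::new(vec![Robots::UserAgent(
    ///     "*".to_owned(),
    ///     vec![Rule::Disallow("/private".to_owned())],
    /// )]);
    /// ```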
    pub fn new(rules: Vec<Robots>) -> RobotsParser {
        RobotsParser { rules }
    }

    /// Parses a robots.txt input string.
    pub fn parse(input: &str) -> Result<RobotsParser, &'static str> {
        let mut rules = vec![];
        let mut input = input;

        // Always add an Allow(/robots.txt) rule at the start
        rules.push(Robots::GlobalRule(Rule::Allow("/robots.txt".to_owned())));

        loop {
            let parse_result = alt((
                RobotsParser::comment_line_parser(),
                map_opt(RobotsParser::crlf_parse(), |_| Some(None::<Robots>)),
                RobotsParser::parse_user_agent(),
                map_opt(RobotsParser::parse_rule(), |rule| {
                    Some(Some(Robots::GlobalRule(rule)))
                }),
            ))(input);
            input = match parse_result {
                Ok((input, Some(rule))) => {
                    rules.push(rule);
                    input
                }
                Ok((input, None)) => input,
                Err(_) => {
                    return Err("Could not parse robots.txt");
                }
            };

            // No more input -> return
            if input.is_empty() {
                break;
            }
        }

        Ok(RobotsParser { rules })
    }

    /// Parses a robots.txt file from the given path.
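    ///
    /// A minimal sketch (hypothetical file path):
    ///
    /// ```rust,ignore
    /// let robots = RobotsParser::parse_path("testdata/robots.txt")?;
    /// ```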
    pub fn parse_path<P: AsRef<Path>>(path: P) -> Result<RobotsParser, &'static str> {
        let data = fs::read_to_string(path).map_err(|_| "Unable to read file")?;
        RobotsParser::parse(&data)
    }

    /// Parses a robots.txt file fetched from the given URL
    /// (requires the `web` feature).
    #[cfg(feature = "web")]
    pub fn parse_url<U: Into<Url>>(url: U) -> Result<RobotsParser, &'static str> {
        let data = reqwest::get(url.into())
            .map_err(|_| "Unable to fetch file from url")?
            .text()
            .map_err(|_| "Unable to read file from url")?;
        RobotsParser::parse(&data)
    }

    /// Parses a run of spaces and tabs (possibly empty)
    fn space_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, &'a str> {
        take_while(|c| c == ' ' || c == '\t')
    }

    /// Parses an alphanumeric token or `*`
    fn token_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, &'a str> {
        take_while1(|c: char| c.is_ascii_alphanumeric() || c == '*')
    }

    /// Parses a comment; does not consume the line break
    fn comment_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, (&'a str, &'a str)> {
        tuple((tag("#"), take_until("\r\n")))
    }

    /// Parses a line break
    fn crlf_parse<'a>() -> impl Fn(&'a str) -> IResult<&'a str, &'a str> {
        tag("\r\n")
    }

    /// Parses a comment line and returns an empty robots.txt line
    fn comment_line_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Option<Robots>> {
        map_opt(
            tuple((RobotsParser::comment_parser(), RobotsParser::crlf_parse())),
            |_| Some(None),
        )
    }

    fn parse_user_agent<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Option<Robots>> {
        move |input: &'a str| {
            let (input, _) = tag_no_case("user-agent:")(input)?;
            let (input, _) = RobotsParser::space_parser()(input)?;
            let (input, agent) = RobotsParser::token_parser()(input)?;
            // Parses an optional comment after the agent name
            let (input, _) = opt(RobotsParser::comment_parser())(input).unwrap_or((input, None));
            let (input, _) = RobotsParser::crlf_parse()(input)?;

            let (input, rules) = RobotsParser::parse_rules()(input)?;

            let rules = if rules.is_empty() {
                // A second User-Agent line may follow directly; both agents
                // then share one rule set. The lookahead result is discarded,
                // so the caller parses the next agent line again itself.
                match RobotsParser::parse_user_agent()(input) {
                    Ok((_, next_agent)) => match next_agent.unwrap() {
                        Robots::UserAgent(_, rules) => rules,
                        _ => panic!("parse_user_agent only returns a User-Agent"),
                    },
                    _ => rules,
                }
            } else {
                rules
            };
            Ok((input, Some(Robots::UserAgent(agent.to_owned(), rules))))
        }
    }

    /// Parses as many rules as it can find
    fn parse_rules<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Vec<Rule>> {
        move |input: &'a str| {
            let mut rules = vec![];
            let mut input = input;
            loop {
                input = match RobotsParser::parse_rule()(input) {
                    Ok((input, rule)) => {
                        rules.push(rule);
                        input
                    }
                    // Comment lines between rules are skipped
                    Err(_) => match RobotsParser::comment_line_parser()(input) {
                        Ok((input, _)) => input,
                        Err(_) => return Ok((input, rules)),
                    },
                };
            }
        }
    }

    /// Parses exactly one rule
    fn parse_rule<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Rule> {
        move |input: &'a str| {
            let (input, allowance) = alt((tag("Allow:"), tag("Disallow:")))(input)?;
            let (input, _) = RobotsParser::space_parser()(input)?;
            let (input, path) = RobotsParser::parse_file_path(input)?;

            // Parses an optional comment after the path
            let (input, _) = opt(RobotsParser::comment_parser())(input).unwrap_or((input, None));

            // The CRLF is optional when the remaining input is empty
            let (input, _) = cond(!input.is_empty(), RobotsParser::crlf_parse())(input)?;

            // An empty Disallow means allow all
            if allowance == "Disallow:" && path.is_empty() {
                return Ok((input, Rule::Allow("*".to_owned())));
            }

            match allowance {
                "Allow:" => Ok((input, Rule::Allow(path))),
                "Disallow:" => Ok((input, Rule::Disallow(path))),
                _ => panic!("Rule must either be allowed or disallowed"),
            }
        }
    }

    /// Parses a path as specified.
    /// Paths do not include `#`, as it indicates a comment.
    fn parse_file_path<'a>(input: &'a str) -> IResult<&'a str, String> {
        let (input, path) = take_while(|c: char| !c.is_whitespace() && c != '#')(input)?;
        Ok((input, path.to_owned()))
    }

    /// Decides whether a path can be fetched by the given agent.
    /// `path` must be an absolute URL; inputs that fail URL parsing are
    /// treated as not fetchable.
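    ///
    /// A minimal sketch, assuming `robots` was parsed from
    /// `"User-Agent: *\r\nAllow: /\r\n"`:
    ///
    /// ```rust,ignore
    /// assert!(robots.can_fetch("*", "https://example.com/anything"));
    /// assert!(!robots.can_fetch("*", "not a url"));
    /// ```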
    pub fn can_fetch(&self, agent: &str, path: &str) -> bool {
        let url = Url::parse(path);
        match url {
            Ok(url) => {
                // Rules are matched against the percent-decoded URL path
                let path = percent_decode(url.path().as_bytes()).decode_utf8().unwrap();
                for rule in &self.rules {
                    if rule.is_applicable(agent, &path) {
                        return rule.is_allowed(&path);
                    }
                }
                false
            }
            Err(_) => false,
        }
    }
}

#[test]
fn user_agent_different_spellings() {
    assert!(RobotsParser::parse_user_agent()("User-Agent: test\r\n\r\n").is_ok());
    assert!(RobotsParser::parse_user_agent()("user-agent: test\r\n\r\n").is_ok());
    assert!(RobotsParser::parse_user_agent()("USER-AGENT: test\r\n\r\n").is_ok());
}

#[test]
fn user_agent_empty() {
    assert!(RobotsParser::parse_user_agent()("User-Agent:\r\n").is_err());
}

#[test]
fn comment() {
    assert!(RobotsParser::comment_parser()("# testtest\r\n").is_ok());
    assert!(RobotsParser::comment_parser()("testtest\r\n").is_err());
    assert!(RobotsParser::comment_parser()("#testtest").is_err());
    assert!(RobotsParser::comment_line_parser()("# testtest\r\n").is_ok());
    assert!(RobotsParser::comment_line_parser()("testtest\r\n").is_err());
    assert!(RobotsParser::comment_line_parser()("#testtest").is_err());
}

#[test]
fn rule() {
    assert!(RobotsParser::parse_rule()("Allow: /\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: /\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Allow: /#1234 \r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: \r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: /org/plans.html\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: /org/\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Allow: /serv\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Allow: /~mak\r\n").is_ok());
}

#[test]
fn rules() {
    let rules = "Disallow: /index.html?\r\nDisallow: /?\r
Allow: /?hl=\r
Disallow: /?hl=*&\r
Allow: /?hl=*&gws_rd=ssl$\r
Disallow: /?hl=*&*&gws_rd=ssl\r
Allow: /?gws_rd=ssl$";
    let result = vec![
        Rule::Disallow("/index.html?".to_owned()),
        Rule::Disallow("/?".to_owned()),
        Rule::Allow("/?hl=".to_owned()),
        Rule::Disallow("/?hl=*&".to_owned()),
        Rule::Allow("/?hl=*&gws_rd=ssl$".to_owned()),
        Rule::Disallow("/?hl=*&*&gws_rd=ssl".to_owned()),
        Rule::Allow("/?gws_rd=ssl$".to_owned()),
    ];
    let parsed = RobotsParser::parse_rules()(rules);
    assert!(parsed.is_ok());
    let (_, parsed) = parsed.unwrap();
    assert_eq!(parsed, result);
}
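
// The tests below are illustrative sketches added alongside the originals;
// they exercise documented behaviour with hypothetical robots.txt content.

// An empty `Disallow:` line is documented to mean "allow everything".
#[test]
fn empty_disallow_allows_all() {
    let (rest, rule) = RobotsParser::parse_rule()("Disallow: \r\n").unwrap();
    assert_eq!(rest, "");
    assert_eq!(rule, Rule::Allow("*".to_owned()));
}

// End-to-end sketch: parse a small robots.txt and query it. The implicit
// global `Allow: /robots.txt` rule keeps robots.txt itself fetchable.
#[test]
fn parse_and_can_fetch_sketch() {
    let robots = RobotsParser::parse("User-Agent: *\r\nDisallow: /private/\r\n").unwrap();
    assert!(!robots.can_fetch("*", "https://example.com/private/data"));
    assert!(robots.can_fetch("*", "https://example.com/robots.txt"));
}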