use nom::branch::alt;
use nom::bytes::complete::tag;
use nom::bytes::complete::tag_no_case;
use nom::bytes::complete::take_until;
use nom::bytes::complete::take_while;
use nom::bytes::complete::take_while1;
use nom::combinator::cond;
use nom::combinator::map_opt;
use nom::combinator::opt;
use nom::sequence::tuple;
use nom::IResult;

use url::percent_encoding::percent_decode;
use url::Url;

use std::fs;
use std::path::Path;

/// A parsed robots.txt document: an ordered list of entries.
#[derive(Debug, Eq, PartialEq, Clone)]
pub struct RobotsParser {
    rules: Vec<Robots>,
}

/// One robots.txt entry: either a `User-agent` group with its rules,
/// or a single rule that applies globally.
#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Robots {
    UserAgent(String, Vec<Rule>),
    GlobalRule(Rule),
}

impl Robots {
    /// Returns `true` if this entry applies to the given agent and path.
    fn is_applicable(&self, agent: &str, path: &str) -> bool {
        match self {
            Robots::UserAgent(s, _) => {
                // An agent like "Googlebot/2.1" is matched by its name alone,
                // case-insensitively; "*" matches every agent.
                let cleaned_user_agent = agent.split('/').next().unwrap_or("");
                s == "*" || *s == cleaned_user_agent.to_lowercase()
            }
            Robots::GlobalRule(rule) => rule.is_applicable(path),
        }
    }

    /// Returns `true` if the first rule matching `path` allows it.
    /// Paths matched by no rule are denied.
    fn is_allowed(&self, path: &str) -> bool {
        match self {
            Robots::UserAgent(_, rules) => {
                for rule in rules {
                    if rule.is_applicable(path) {
                        return rule.allowed();
                    }
                }
            }
            Robots::GlobalRule(rule) => return rule.allowed(),
        }
        false
    }
}
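
// A small sketch of the matching behaviour implemented above; the agent
// strings are made up for illustration. An agent is matched by the name
// before any "/version" suffix, case-insensitively, with "*" matching all.
#[test]
fn user_agent_matching_examples() {
    let named = Robots::UserAgent("googlebot".to_owned(), vec![]);
    assert!(named.is_applicable("Googlebot/2.1", "/"));
    assert!(!named.is_applicable("otherbot", "/"));

    let wildcard = Robots::UserAgent("*".to_owned(), vec![]);
    assert!(wildcard.is_applicable("anybot", "/"));
}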

#[derive(Debug, Eq, PartialEq, Clone)]
pub enum Rule {
    Allow(String),
    Disallow(String),
    /// Extension lines; recognized in the type but never built by the parser.
    Extension,
}

impl Rule {
    /// A rule applies when its path is a literal prefix of `path`,
    /// or when the rule path is the "*" wildcard.
    fn is_applicable(&self, path: &str) -> bool {
        let own_path = match self {
            Rule::Allow(s) | Rule::Disallow(s) => s,
            _ => "",
        };

        own_path == "*" || path.starts_with(own_path)
    }

    fn allowed(&self) -> bool {
        match self {
            Rule::Allow(_) => true,
            _ => false,
        }
    }
}
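
// A sketch of the prefix semantics of `Rule`: a rule applies when its path
// is a literal prefix of the request path, and "*" applies everywhere.
// The paths here are made up for illustration.
#[test]
fn rule_prefix_matching_examples() {
    assert!(Rule::Disallow("/private".to_owned()).is_applicable("/private/data"));
    assert!(!Rule::Disallow("/private".to_owned()).is_applicable("/public"));
    assert!(Rule::Allow("*".to_owned()).is_applicable("/anything"));
    assert!(Rule::Allow("/".to_owned()).allowed());
    assert!(!Rule::Disallow("/".to_owned()).allowed());
}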

impl RobotsParser {
    pub fn new(rules: Vec<Robots>) -> RobotsParser {
        RobotsParser { rules }
    }

    /// Parses a complete robots.txt document. Lines must be CRLF-terminated.
    pub fn parse(input: &str) -> Result<RobotsParser, &'static str> {
        let mut rules = vec![];
        let mut input = input;

        // robots.txt itself must always be fetchable.
        rules.push(Robots::GlobalRule(Rule::Allow("/robots.txt".to_owned())));

        loop {
            // Each iteration consumes one item: a comment line, a blank line,
            // a user-agent group (with its rules), or a global rule.
            let rulers = alt((
                RobotsParser::comment_line_parser(),
                map_opt(RobotsParser::crlf_parse(), |_| Some(None::<Robots>)),
                RobotsParser::parse_user_agent(),
                map_opt(RobotsParser::parse_rule(), |rule| {
                    Some(Some(Robots::GlobalRule(rule)))
                }),
            ))(input);
            input = match rulers {
                Ok((input, Some(rule))) => {
                    rules.push(rule);
                    input
                }
                Ok((input, None)) => input,
                Err(_) => {
                    return Err("Could not parse robots.txt");
                }
            };

            if input.is_empty() {
                break;
            }
        }

        Ok(RobotsParser { rules })
    }

    pub fn parse_path<P: AsRef<Path>>(path: P) -> Result<RobotsParser, &'static str> {
        let data = fs::read_to_string(path).map_err(|_| "Unable to read file")?;
        RobotsParser::parse(&data)
    }

    #[cfg(feature = "web")]
    pub fn parse_url<U: Into<Url>>(url: U) -> Result<RobotsParser, &'static str> {
        let data = reqwest::get(url.into())
            .map_err(|_| "Unable to read file from url")?
            .text()
            .map_err(|_| "Unable to read file from url")?;
        RobotsParser::parse(&data)
    }

    /// Consumes spaces and tabs (possibly none).
    fn space_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, &'a str> {
        take_while(|c| c == ' ' || c == '\t')
    }

    /// Consumes a user-agent token: ASCII alphanumerics or the "*" wildcard.
    fn token_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, &'a str> {
        take_while1(|c: char| c.is_ascii_alphanumeric() || c == '*')
    }

    /// Consumes a "#" comment up to, but not including, the line break.
    fn comment_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, (&'a str, &'a str)> {
        tuple((tag("#"), take_until("\r\n")))
    }

    fn crlf_parse<'a>() -> impl Fn(&'a str) -> IResult<&'a str, &'a str> {
        tag("\r\n")
    }

    /// Consumes an entire comment line, yielding no rule.
    fn comment_line_parser<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Option<Robots>> {
        map_opt(
            tuple((RobotsParser::comment_parser(), RobotsParser::crlf_parse())),
            |_| Some(None),
        )
    }

    fn parse_user_agent<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Option<Robots>> {
        move |input: &'a str| {
            let (input, _) = tag_no_case("user-agent:")(input)?;
            let (input, _) = RobotsParser::space_parser()(input)?;
            let (input, agent) = RobotsParser::token_parser()(input)?;
            let (input, _) = opt(RobotsParser::comment_parser())(input).unwrap_or((input, None));
            let (input, _) = RobotsParser::crlf_parse()(input)?;

            let (input, rules) = RobotsParser::parse_rules()(input)?;

            // A group without rules of its own shares the rules of the
            // user-agent group that immediately follows it.
            let rules = if rules.is_empty() {
                let user_agent = RobotsParser::parse_user_agent()(input);

                match user_agent {
                    Ok((_, agent)) => match agent.unwrap() {
                        Robots::UserAgent(_, rules) => rules,
                        _ => panic!("parse_user_agent only returns a User-Agent"),
                    },
                    _ => rules,
                }
            } else {
                rules
            };
            Ok((input, Some(Robots::UserAgent(agent.to_owned(), rules))))
        }
    }

    /// Consumes consecutive rule lines and comment lines until neither parses.
    fn parse_rules<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Vec<Rule>> {
        move |input: &'a str| {
            let mut rules = vec![];
            let mut input = input;
            loop {
                input = match RobotsParser::parse_rule()(input) {
                    Ok((input, rule)) => {
                        rules.push(rule);
                        input
                    }
                    Err(_) => match RobotsParser::comment_line_parser()(input) {
                        Ok((input, _)) => input,
                        Err(_) => return Ok((input, rules)),
                    },
                };
            }
        }
    }

    fn parse_rule<'a>() -> impl Fn(&'a str) -> IResult<&'a str, Rule> {
        move |input: &'a str| {
            let (input, allowance) = alt((tag("Allow:"), tag("Disallow:")))(input)?;
            let (input, _) = RobotsParser::space_parser()(input)?;
            let (input, path) = RobotsParser::parse_file_path(input)?;

            let (input, _) = opt(RobotsParser::comment_parser())(input).unwrap_or((input, None));

            // The final line of a file may omit its trailing CRLF.
            let (input, _) = cond(!input.is_empty(), RobotsParser::crlf_parse())(input)?;

            // An empty Disallow means everything is allowed.
            if allowance == "Disallow:" && path.is_empty() {
                return Ok((input, Rule::Allow("*".to_owned())));
            }

            match allowance {
                "Allow:" => Ok((input, Rule::Allow(path))),
                "Disallow:" => Ok((input, Rule::Disallow(path))),
                _ => panic!("Rule must either be allowed or disallowed"),
            }
        }
    }

    fn parse_file_path(input: &str) -> IResult<&str, String> {
        let (input, path) = take_while(|c: char| !c.is_whitespace() && c != '#')(input)?;
        Ok((input, path.to_owned()))
    }

    /// Checks whether `agent` may fetch the given URL. The first applicable
    /// entry decides; URLs with no applicable entry are denied.
    pub fn can_fetch(&self, agent: &str, path: &str) -> bool {
        let url = Url::parse(path);
        match url {
            Ok(url) => {
                let path = percent_decode(url.path().as_bytes()).decode_utf8().unwrap();
                for rule in &self.rules {
                    if rule.is_applicable(agent, &path) {
                        return rule.is_allowed(&path);
                    }
                }
                false
            }
            Err(_) => false,
        }
    }
}
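
// An end-to-end sketch of `parse` + `can_fetch`; the robots.txt body and
// URLs are made up for illustration. Note that input lines must be
// CRLF-terminated, matching the line parsers above.
#[test]
fn can_fetch_end_to_end() {
    let robots_txt = "User-Agent: *\r\nDisallow: /private\r\nAllow: /\r\n";
    let parser = RobotsParser::parse(robots_txt).unwrap();
    assert!(parser.can_fetch("testbot", "http://example.com/index.html"));
    assert!(!parser.can_fetch("testbot", "http://example.com/private/data"));
    // The implicit global rule always permits fetching robots.txt itself.
    assert!(parser.can_fetch("testbot", "http://example.com/robots.txt"));
}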

#[test]
fn user_agent_different_spellings() {
    assert!(RobotsParser::parse_user_agent()("User-Agent: test\r\n\r\n").is_ok());
    assert!(RobotsParser::parse_user_agent()("user-agent: test\r\n\r\n").is_ok());
    assert!(RobotsParser::parse_user_agent()("USER-AGENT: test\r\n\r\n").is_ok());
}

#[test]
fn user_agent_empty() {
    assert!(RobotsParser::parse_user_agent()("User-Agent:\r\n").is_err());
}
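
// A sketch showing that a user-agent group captures the rules that follow
// it; the agent name and path are made up for illustration.
#[test]
fn user_agent_with_rules() {
    let parsed = RobotsParser::parse_user_agent()("User-Agent: test\r\nDisallow: /tmp\r\n");
    let expected = Robots::UserAgent("test".to_owned(), vec![Rule::Disallow("/tmp".to_owned())]);
    assert_eq!(parsed, Ok(("", Some(expected))));
}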

#[test]
fn comment() {
    assert!(RobotsParser::comment_parser()("# testtest\r\n").is_ok());
    assert!(RobotsParser::comment_parser()("testtest\r\n").is_err());
    assert!(RobotsParser::comment_parser()("#testtest").is_err());
    assert!(RobotsParser::comment_line_parser()("# testtest\r\n").is_ok());
    assert!(RobotsParser::comment_line_parser()("testtest\r\n").is_err());
    assert!(RobotsParser::comment_line_parser()("#testtest").is_err());
}

#[test]
fn rule() {
    assert!(RobotsParser::parse_rule()("Allow: /\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: /\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Allow: /#1234 \r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: \r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: /org/plans.html\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Disallow: /org/\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Allow: /serv\r\n").is_ok());
    assert!(RobotsParser::parse_rule()("Allow: /~mak\r\n").is_ok());
}
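
// Per the parser above, a bare "Disallow:" with an empty path is
// normalized to a rule that allows everything.
#[test]
fn empty_disallow_allows_everything() {
    let parsed = RobotsParser::parse_rule()("Disallow: \r\n");
    assert_eq!(parsed, Ok(("", Rule::Allow("*".to_owned()))));
}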

#[test]
fn rules() {
    let rules = "Disallow: /index.html?\r\nDisallow: /?\r
Allow: /?hl=\r
Disallow: /?hl=*&\r
Allow: /?hl=*&gws_rd=ssl$\r
Disallow: /?hl=*&*&gws_rd=ssl\r
Allow: /?gws_rd=ssl$";
    let result = vec![
        Rule::Disallow("/index.html?".to_owned()),
        Rule::Disallow("/?".to_owned()),
        Rule::Allow("/?hl=".to_owned()),
        Rule::Disallow("/?hl=*&".to_owned()),
        Rule::Allow("/?hl=*&gws_rd=ssl$".to_owned()),
        Rule::Disallow("/?hl=*&*&gws_rd=ssl".to_owned()),
        Rule::Allow("/?gws_rd=ssl$".to_owned()),
    ];
    let parsed = RobotsParser::parse_rules()(rules);
    assert!(parsed.is_ok());
    let (_, parsed) = parsed.unwrap();
    assert_eq!(parsed, result);
}
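
// A sketch of per-agent matching; the agent names and URLs are made up for
// illustration. Only the named agent's group applies to it, and `can_fetch`
// denies requests for which no entry is applicable.
#[test]
fn can_fetch_specific_agent() {
    let robots_txt = "User-Agent: googlebot\r\nDisallow: /search\r\n";
    let parser = RobotsParser::parse(robots_txt).unwrap();
    assert!(!parser.can_fetch("Googlebot/2.1", "http://example.com/search?q=x"));
    assert!(!parser.can_fetch("otherbot", "http://example.com/search"));
}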