1use crate::RobotsParseHandler;
18
/// The kind of directive a robots.txt key was recognized as.
///
/// The variants carry their default discriminants (`0..=3`) except for
/// [`ParseKeyType::Unknown`], which is pinned to `128`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParseKeyType {
    /// A `user-agent` key.
    UserAgent,
    /// A `sitemap` key.
    Sitemap,

    /// An `allow` key.
    Allow,
    /// A `disallow` key.
    Disallow,

    /// Any key that matched none of the known directives.
    Unknown = 128,
}
34
/// A robots.txt line key together with the directive kind it was classified as.
///
/// Create one with `ParsedRobotsKey::default()` and classify a key with
/// `parse`.
pub struct ParsedRobotsKey {
    /// Directive kind assigned by `parse`; `Unknown` until a key is recognized.
    type_: ParseKeyType,
    /// The key text as given to `parse`, stored when the key is not recognized.
    key_text: String,
    /// When `true`, known misspellings of a directive are accepted as well.
    allow_typo: bool,
}
48
49impl Default for ParsedRobotsKey {
50 fn default() -> Self {
51 ParsedRobotsKey {
52 type_: ParseKeyType::Unknown,
53 allow_typo: true,
54 key_text: String::new(),
55 }
56 }
57}
58
59impl ParsedRobotsKey {
60 pub fn parse(&mut self, key: &str) {
63 if self.validate_key(key, &["user-agent"], Some(&["useragent", "user agent"])) {
64 self.type_ = ParseKeyType::UserAgent;
65 } else if self.validate_key(key, &["allow"], None) {
66 self.type_ = ParseKeyType::Allow;
67 } else if self.validate_key(
68 key,
69 &["disallow"],
70 Some(&["dissallow", "dissalow", "disalow", "diasllow", "disallaw"]),
71 ) {
72 self.type_ = ParseKeyType::Disallow;
73 } else if self.validate_key(key, &["sitemap", "site-map"], None) {
74 self.type_ = ParseKeyType::Sitemap;
75 } else {
76 self.type_ = ParseKeyType::Unknown;
77 self.key_text = key.to_string();
78 }
79 }
80
81 pub fn get_type(&self) -> &ParseKeyType {
83 &self.type_
84 }
85
86 pub fn get_unknown_text(&self) -> String {
88 self.key_text.to_string()
89 }
90
91 fn validate_key(&self, key: &str, targets: &[&str], typo_targets: Option<&[&str]>) -> bool {
92 let key = key.to_lowercase();
93 let check = |target: &&str| key.starts_with(&target.to_lowercase());
94 targets.iter().any(check)
95 || (typo_targets.is_some()
96 && self.allow_typo
97 && typo_targets.unwrap().iter().any(check))
98 }
99}
100
/// Parser that walks a robots.txt body line by line and reports each
/// recognized key/value pair to a [`RobotsParseHandler`].
pub struct RobotsTxtParser<'a, Handler: RobotsParseHandler> {
    /// The robots.txt text to parse.
    robots_body: &'a str,
    /// Receiver of the start/end notifications and per-line directive callbacks.
    handler: &'a mut Handler,
}
106
107impl<'a, Handler: RobotsParseHandler> RobotsTxtParser<'a, Handler> {
108 pub fn new(robots_body: &'a str, handler: &'a mut Handler) -> Self {
109 RobotsTxtParser {
110 robots_body,
111 handler,
112 }
113 }
114
115 pub fn parse(&mut self) {
121 let utf_bom = [0xEF, 0xBB, 0xBF];
122 let max_line_len = 2083 * 8;
128 let mut line_num = 0;
129 let mut bom_pos = 0;
130 let mut last_was_carriage_return = false;
131 self.handler.handle_robots_start();
132
133 let mut start = 0;
134 let mut end = 0;
135 let mut skip_exceed = 0;
138 for (ch, char_len_utf8) in self
139 .robots_body
140 .chars()
141 .map(|ch| (ch as usize, ch.len_utf8()))
142 {
143 if bom_pos < utf_bom.len() && ch == utf_bom[bom_pos] {
147 bom_pos += 1;
148 start += char_len_utf8;
149 end += char_len_utf8;
150 continue;
151 }
152 bom_pos = utf_bom.len();
153
154 if ch != 0x0A && ch != 0x0D {
155 if (end - start) < max_line_len - 1 {
158 end += char_len_utf8;
159 } else {
160 skip_exceed += 1;
161 }
162 } else {
163 let is_crlf_continuation = end == start && last_was_carriage_return && ch == 0x0A;
167 if !is_crlf_continuation {
168 line_num += 1;
169 self.parse_and_emit_line(line_num, &self.robots_body[start..end]);
170 }
171 end += skip_exceed + char_len_utf8;
173 start = end;
174 last_was_carriage_return = ch == 0x0D;
175 skip_exceed = 0;
176 }
177 }
178 line_num += 1;
179 self.parse_and_emit_line(line_num, &self.robots_body[start..end]);
180 self.handler.handle_robots_end();
181 }
182
183 pub fn parse_key_value(line: &str) -> (&str, &str, bool) {
188 let mut line = line;
189 if let Some(comment) = line.find('#') {
191 line = &line[..comment].trim();
192 }
193
194 let mut sep = line.find(':');
197 if sep.is_none() {
198 let white = " \t";
201
202 sep = line.find(|c| white.contains(c));
203 if let Some(sep) = sep {
204 let val = &line[sep..].trim();
205 if val.is_empty() || val.find(|c| white.contains(c)).is_some() {
206 return ("", "", false);
211 }
212 }
213 }
214
215 if let Some(sep) = sep {
216 let key = &line[..sep];
218 if key.is_empty() {
219 return ("", "", false);
220 }
221
222 let value = &line[(sep + 1)..];
224 (key.trim(), value.trim(), true)
225 } else {
226 ("", "", false)
228 }
229 }
230
231 pub fn need_escape_value_for_key(key: &ParsedRobotsKey) -> bool {
232 !matches!(
233 key.get_type(),
234 ParseKeyType::UserAgent | ParseKeyType::Sitemap
235 )
236 }
237
238 fn parse_and_emit_line(&mut self, current_line: u32, line: &str) {
239 match Self::parse_key_value(line) {
240 (_, _, false) => {}
241 (string_key, value, true) => {
242 let mut key = ParsedRobotsKey::default();
243 key.parse(string_key);
244 if Self::need_escape_value_for_key(&key) {
245 let value = escape_pattern(value);
246 self.emit(current_line, &key, &value);
247 } else {
248 self.emit(current_line, &key, value);
249 }
250 }
251 }
252 }
253
254 fn emit(&mut self, line: u32, key: &ParsedRobotsKey, value: &str) {
255 match key.get_type() {
256 ParseKeyType::UserAgent => self.handler.handle_user_agent(line, value),
257 ParseKeyType::Sitemap => self.handler.handle_sitemap(line, value),
258 ParseKeyType::Allow => self.handler.handle_allow(line, value),
259 ParseKeyType::Disallow => self.handler.handle_disallow(line, value),
260 ParseKeyType::Unknown => {
261 self.handler
262 .handle_unknown_action(line, &key.get_unknown_text(), value)
263 }
264 }
265 }
266}
267
/// Upper-case hexadecimal digits, indexed by nibble value.
const HEX_DIGITS: [char; 16] = [
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
];

/// Canonicalizes `path` for matching and returns the result.
///
/// * A `%` followed by two hexadecimal digits is kept, with both digits
///   upper-cased (`%2f` becomes `%2F`).
/// * Every byte with the high bit set (`>= 0x80`) is replaced by `%XX`, so
///   non-ASCII characters are escaped byte by byte in their UTF-8 encoding.
/// * Everything else is copied unchanged. This includes a `%` that is not
///   followed by two hex digits, and the bytes after it, which are processed
///   normally rather than being swallowed.
pub fn escape_pattern(path: &str) -> String {
    let mut dest = String::with_capacity(path.len());
    let mut rest = path.as_bytes();

    while let Some((&byte, tail)) = rest.split_first() {
        match tail {
            // Well-formed escape sequence: normalize the digit case and skip
            // all three bytes at once.
            [hi, lo, after @ ..]
                if byte == b'%' && hi.is_ascii_hexdigit() && lo.is_ascii_hexdigit() =>
            {
                dest.push('%');
                dest.push(char::from(hi.to_ascii_uppercase()));
                dest.push(char::from(lo.to_ascii_uppercase()));
                rest = after;
            }
            _ => {
                if byte >= 0x80 {
                    dest.push('%');
                    dest.push(HEX_DIGITS[usize::from(byte >> 4)]);
                    dest.push(HEX_DIGITS[usize::from(byte & 0x0f)]);
                } else {
                    // ASCII byte, including a `%` that starts no valid escape.
                    dest.push(char::from(byte));
                }
                rest = tail;
            }
        }
    }
    dest
}
358
#[cfg(test)]
mod tests {
    use crate::parser::*;
    use crate::RobotsParseHandler;

    /// Handler stub used only to name a concrete `RobotsTxtParser` type.
    /// Every callback is left unimplemented and panics if called.
    struct FooHandler;

    impl RobotsParseHandler for FooHandler {
        fn handle_robots_start(&mut self) {
            unimplemented!()
        }

        fn handle_robots_end(&mut self) {
            unimplemented!()
        }

        fn handle_user_agent(&mut self, _: u32, _: &str) {
            unimplemented!()
        }

        fn handle_allow(&mut self, _: u32, _: &str) {
            unimplemented!()
        }

        fn handle_disallow(&mut self, _: u32, _: &str) {
            unimplemented!()
        }

        fn handle_sitemap(&mut self, _: u32, _: &str) {
            unimplemented!()
        }

        fn handle_unknown_action(&mut self, _: u32, _: &str, _: &str) {
            unimplemented!()
        }
    }

    /// Comment-only lines are rejected; colon- and whitespace-separated
    /// lines, with or without a trailing comment, yield the same pair.
    #[test]
    fn test_parse_key_value() {
        type Parser<'a> = RobotsTxtParser<'a, FooHandler>;

        let rejected = ("", "", false);
        for line in &["# ", "# User-agent: Googlebot"] {
            assert_eq!(rejected, Parser::parse_key_value(line), "line: {:?}", line);
        }

        let accepted = ("User-agent", "Googlebot", true);
        let accepted_lines = [
            "User-agent: Googlebot",
            "User-agent Googlebot",
            "User-agent \t Googlebot",
            "User-agent\tGooglebot",
            "User-agent: Googlebot # 123",
            "User-agent\tGooglebot # 123",
        ];
        for line in &accepted_lines {
            assert_eq!(accepted, Parser::parse_key_value(line), "line: {:?}", line);
        }
    }

    /// Plain ASCII passes through, escape digits are upper-cased and
    /// non-ASCII characters are percent-encoded.
    #[test]
    fn test_escape_pattern() {
        let cases = [
            ("http://www.example.com", "http://www.example.com"),
            ("/a/b/c", "/a/b/c"),
            ("%aa", "%AA"),
            ("%aA", "%AA"),
            ("/SanjoséSellers", "/Sanjos%C3%A9Sellers"),
            ("á", "%C3%A1"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(*expected, escape_pattern(input).as_str(), "input: {:?}", input);
        }
    }
}