// spider/packages/robotparser/parser.rs
use crate::compact_str::CompactString;
use crate::Client;
#[cfg(feature = "regex")]
use hashbrown::HashSet;
#[cfg(feature = "regex")]
use regex::RegexSet;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
35
/// A single `Allow` / `Disallow` rule of a robots.txt group.
///
/// This variant is compiled when the `regex` feature is off and keeps the
/// rule path as plain text.
#[derive(Debug, Eq, PartialEq, Clone)]
#[cfg(not(feature = "regex"))]
pub struct RuleLine {
    /// Path text taken from the rule's value.
    pub path: String,
    /// `true` when a matching path may be fetched, `false` when it may not.
    pub allowance: bool,
}
46
/// A single `Allow` / `Disallow` rule of a robots.txt group.
///
/// This variant is compiled when the `regex` feature is on and keeps the rule
/// path as a compiled regular expression.
#[derive(Debug, Clone)]
#[cfg(feature = "regex")]
pub struct RuleLine {
    /// Compiled rule path; `None` when the rule text was not a valid regex.
    pub path: Option<regex::Regex>,
    /// `true` when a matching path may be fetched, `false` when it may not.
    pub allowance: bool,
}
57
/// Value of a `Request-rate: <requests>/<seconds>` directive.
#[derive(Debug, Eq, PartialEq, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct RequestRate {
    /// Number written before the `/`.
    pub requests: usize,
    /// Number written after the `/`.
    pub seconds: usize,
}
67
68#[derive(Debug, Clone)]
70#[cfg_attr(not(feature = "regex"), derive(Eq, PartialEq))]
71pub struct Entry {
72 pub useragents: Vec<String>,
74 pub rulelines: Vec<RuleLine>,
76 pub crawl_delay: Option<Duration>,
78 pub req_rate: Option<RequestRate>,
80}
81
82#[derive(Debug, Clone)]
84#[cfg_attr(not(feature = "regex"), derive(Eq, PartialEq))]
85pub struct RobotFileParser {
86 entries: Vec<Entry>,
88 default_entry: Entry,
90 pub disallow_all: bool,
92 pub allow_all: bool,
94 pub last_checked: i64,
96 #[cfg(feature = "regex")]
98 pub disallow_paths_regex: RegexSet,
99 #[cfg(feature = "regex")]
101 pub disallow_paths: HashSet<String>,
102 #[cfg(feature = "regex")]
104 pub disallow_agents_regex: RegexSet,
105 #[cfg(feature = "regex")]
107 pub wild_card_agent: bool,
108 #[cfg(feature = "regex")]
110 pub disallow_agents: HashSet<String>,
111}
112
113impl RuleLine {
114 #[cfg(feature = "regex")]
115 fn new(path: &str, allowance: bool) -> RuleLine {
116 use regex::Regex;
117
118 RuleLine {
119 path: match Regex::new(path) {
120 Ok(r) => Some(r),
121 _ => None,
122 },
123 allowance: path.is_empty() && !allowance || allowance,
124 }
125 }
126
127 #[cfg(not(feature = "regex"))]
128 fn new(path: &str, allowance: bool) -> RuleLine {
129 RuleLine {
130 path: path.into(),
131 allowance: path.is_empty() && !allowance || allowance,
132 }
133 }
134
135 #[cfg(not(feature = "regex"))]
136 fn applies_to(&self, pathname: &str) -> bool {
137 if self.path == "*"
138 || self.path == "/" && pathname == "/"
139 || self.path.ends_with("/") && pathname.starts_with(&self.path)
140 {
141 true
142 } else {
143 self.path
144 .strip_suffix('*')
145 .map_or(false, |prefix| pathname.starts_with(prefix))
146 || pathname == self.path
147 }
148 }
149
150 #[cfg(feature = "regex")]
151 fn applies_to(&self, pathname: &str) -> bool {
152 match self.path {
153 Some(ref regex) => regex.is_match(pathname),
154 _ => false,
155 }
156 }
157}
158
159impl Entry {
160 fn new() -> Entry {
162 Entry {
163 useragents: vec![],
164 rulelines: vec![],
165 crawl_delay: None,
166 req_rate: None,
167 }
168 }
169
170 fn applies_to(&self, useragent: &str) -> bool {
172 let ua = useragent
173 .split('/')
174 .nth(0)
175 .unwrap_or_default()
176 .to_lowercase();
177
178 for agent in &self.useragents {
179 if agent == "*" || ua.contains(agent) {
180 return true;
181 }
182 }
183
184 false
185 }
186
187 fn allowance(&self, filename: &str) -> bool {
191 for line in &self.rulelines {
192 if line.applies_to(filename) {
193 return line.allowance;
194 }
195 }
196 true
197 }
198
199 fn push_useragent(&mut self, useragent: &str) {
201 self.useragents.push(useragent.to_lowercase());
202 }
203
204 fn push_ruleline(&mut self, ruleline: RuleLine) {
206 self.rulelines.push(ruleline);
207 }
208
209 fn has_useragent(&self) -> bool {
211 self.useragents.iter().any(|a| a == "*")
212 }
213
214 fn is_empty(&self) -> bool {
216 self.useragents.is_empty() && self.rulelines.is_empty()
217 }
218
219 fn set_crawl_delay(&mut self, delay: Duration) {
221 self.crawl_delay = Some(delay);
222 }
223
224 fn get_crawl_delay(&self) -> Option<Duration> {
226 self.crawl_delay
227 }
228
229 fn set_req_rate(&mut self, req_rate: RequestRate) {
231 self.req_rate = Some(req_rate);
232 }
233
234 fn get_req_rate(&self) -> Option<RequestRate> {
236 self.req_rate.clone()
237 }
238}
239
240impl Default for Entry {
241 fn default() -> Entry {
242 Entry::new()
243 }
244}
245
/// Returns the path component of `url`, without query string or fragment.
///
/// A leading `http://` or `https://` (any letter case) is skipped, and the
/// path starts at the first `/` after it. When there is no path, including for
/// an empty `url`, `"/"` is returned.
fn extract_path(url: &str) -> &str {
    // Cut the query and fragment off first, so that a '/' inside them can
    // never be mistaken for the start of the path.
    let end = url.find(['?', '#']).unwrap_or(url.len());
    let without_query = &url[..end];

    // `get` returns `None` when the URL is shorter than the scheme or the cut
    // would land inside a multi-byte character, so this cannot panic.
    let after_scheme = ["https://", "http://"]
        .iter()
        .find_map(|scheme| {
            without_query
                .get(..scheme.len())
                .filter(|head| head.eq_ignore_ascii_case(scheme))
                .map(|_| &without_query[scheme.len()..])
        })
        .unwrap_or(without_query);

    match after_scheme.find('/') {
        Some(start) => &after_scheme[start..],
        None => "/",
    }
}
274
275impl RobotFileParser {
276 #[cfg(not(feature = "regex"))]
278 pub fn new() -> Box<RobotFileParser> {
279 RobotFileParser {
280 entries: vec![],
281 default_entry: Entry::new(),
282 disallow_all: false,
283 allow_all: false,
284 last_checked: 0i64,
285 }
286 .into()
287 }
288
289 #[cfg(feature = "regex")]
291 pub fn new() -> Box<RobotFileParser> {
292 RobotFileParser {
293 entries: vec![],
294 default_entry: Entry::new(),
295 disallow_all: false,
296 disallow_paths_regex: RegexSet::default(),
297 disallow_agents_regex: RegexSet::default(),
298 disallow_paths: Default::default(),
299 disallow_agents: Default::default(),
300 wild_card_agent: false,
301 allow_all: false,
302 last_checked: 0i64,
303 }
304 .into()
305 }
306
307 pub fn mtime(&self) -> i64 {
312 self.last_checked
313 }
314
315 pub fn modified(&mut self) {
318 if let Ok(time) = SystemTime::now().duration_since(UNIX_EPOCH) {
319 self.last_checked = time.as_secs() as i64;
320 }
321 }
322
323 pub fn get_entries(&self) -> &Vec<Entry> {
325 &self.entries
326 }
327
328 pub fn get_base_entry(&self) -> &Entry {
330 &self.default_entry
331 }
332
333 pub async fn read(&mut self, client: &Client, url: &str) {
335 use crate::client::StatusCode;
336 self.modified();
337
338 let request = client.get(string_concat!(url, "robots.txt"));
339
340 let res = match request.send().await {
341 Ok(res) => res,
342 Err(_) => {
343 return;
344 }
345 };
346 let status = res.status();
347
348 match status {
349 StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
350 self.disallow_all = true;
351 }
352 status
353 if status >= StatusCode::BAD_REQUEST
354 && status < StatusCode::INTERNAL_SERVER_ERROR =>
355 {
356 self.allow_all = true;
357 }
358 StatusCode::OK => self.from_response(res).await,
359 _ => (),
360 }
361 }
362
363 pub async fn from_response(&mut self, response: crate::client::Response) {
365 match response.text().await {
366 Ok(buf) => {
367 let lines: Vec<&str> = buf.split('\n').collect();
368
369 self.parse(&lines);
370 }
371 _ => {
372 self.allow_all = true;
373 }
374 }
375 }
376
377 fn _add_entry(&mut self, entry: Entry) {
378 if entry.has_useragent() {
379 if self.default_entry.is_empty() {
381 self.default_entry = entry;
383 }
384 } else {
385 self.entries.push(entry);
386 }
387 }
388
389 pub fn parse<T: AsRef<str>>(&mut self, lines: &[T]) {
396 use percent_encoding::percent_decode;
397
398 let mut state = 0;
403 let mut entry = Entry::new();
404
405 for line in lines {
406 let mut ln = line.as_ref();
407 if ln.is_empty() {
408 match state {
409 1 => {
410 entry = Entry::new();
411 state = 0;
412 }
413 2 => {
414 self._add_entry(entry);
415 entry = Entry::new();
416 state = 0;
417 }
418 _ => {}
419 }
420 }
421 if let Some(i) = ln.find('#') {
423 ln = &ln[0..i];
424 }
425 ln = ln.trim();
426 if ln.is_empty() {
427 continue;
428 }
429 let parts: Vec<&str> = ln.splitn(2, ':').collect();
430
431 if parts.len() == 2 {
432 let part0 = parts[0].trim().to_lowercase();
433 let part1 = String::from_utf8(percent_decode(parts[1].trim().as_bytes()).collect())
434 .unwrap_or_default();
435 match part0 {
436 ref x if x.to_lowercase() == "user-agent" => {
437 if state == 2 {
438 self._add_entry(entry);
439 entry = Entry::new();
440 }
441 entry.push_useragent(&part1);
442 state = 1;
443 self.set_disallow_agents_list(&part1);
444 }
445 ref x if x.to_lowercase() == "disallow" => {
446 if state != 0 {
447 entry.push_ruleline(RuleLine::new(&part1, false));
448 state = 2;
449 self.set_disallow_list(&part1);
450 }
451 }
452 ref x if x.to_lowercase() == "allow" => {
453 if state != 0 {
454 entry.push_ruleline(RuleLine::new(&part1, true));
455 state = 2;
456 }
457 }
458 ref x if x.to_lowercase() == "crawl-delay" => {
459 if state != 0 {
460 if let Ok(delay) = part1.parse::<f64>() {
461 let delay_seconds = delay.trunc();
462 let delay_nanoseconds = delay.fract() * 10f64.powi(9);
463 let delay =
464 Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
465 entry.set_crawl_delay(delay);
466 }
467 state = 2;
468 }
469 }
470 ref x if x.to_lowercase() == "sitemap" => {
471 if state != 0 {
472 state = 2;
473 }
474 }
475 ref x if x.to_lowercase() == "request-rate" => {
476 if state != 0 {
477 let numbers: Vec<Result<usize, _>> =
478 part1.split('/').map(|x| x.parse::<usize>()).collect();
479 if numbers.len() == 2 && numbers[0].is_ok() && numbers[1].is_ok() {
480 let req_rate = RequestRate {
481 requests: numbers[0].clone().unwrap(),
482 seconds: numbers[1].clone().unwrap(),
483 };
484 entry.set_req_rate(req_rate);
485 }
486 state = 2;
487 }
488 }
489 _ => {}
490 }
491 }
492 }
493
494 if state == 2 {
495 self._add_entry(entry);
496 }
497
498 self.build_disallow_list()
499 }
500
501 #[cfg(not(feature = "regex"))]
503 pub fn set_disallow_list(&mut self, _path: &str) {}
504
505 #[cfg(feature = "regex")]
507 pub fn set_disallow_list(&mut self, path: &str) {
508 if !path.is_empty() {
509 self.disallow_paths.insert(path.into());
510 }
511 }
512
513 #[cfg(not(feature = "regex"))]
515 pub fn set_disallow_agents_list(&mut self, _agent: &str) {}
516
517 #[cfg(feature = "regex")]
519 pub fn set_disallow_agents_list(&mut self, agent: &str) {
520 if !agent.is_empty() {
521 if agent == "*" {
522 self.wild_card_agent = true;
523 }
524 self.disallow_agents.insert(agent.into());
525 }
526 }
527
528 #[cfg(not(feature = "regex"))]
530 pub fn build_disallow_list(&mut self) {}
531
532 #[cfg(feature = "regex")]
534 pub fn build_disallow_list(&mut self) {
535 if !self.disallow_paths.is_empty() {
536 match RegexSet::new(&self.disallow_paths) {
537 Ok(s) => self.disallow_paths_regex = s,
538 _ => (),
539 }
540 }
541 if !self.disallow_agents.is_empty() {
542 match RegexSet::new(&self.disallow_agents) {
543 Ok(s) => self.disallow_agents_regex = s,
544 _ => (),
545 }
546 }
547 }
548
549 pub fn can_fetch<T: AsRef<str>>(&self, useragent: T, url: &str) -> bool {
551 if self.allow_all {
556 true
557 } else if self.last_checked == 0 || self.disallow_all {
558 false
559 } else {
560 let url_str = extract_path(url);
563
564 if self.entry_allowed(&useragent, url_str) {
565 true
566 } else {
567 let default_entry = &self.default_entry;
569
570 if !default_entry.is_empty() {
571 default_entry.allowance(url_str)
572 } else {
573 true
575 }
576 }
577 }
578 }
579
580 #[cfg(not(feature = "regex"))]
582 pub fn entry_allowed<T: AsRef<str>>(&self, useragent: &T, url_str: &str) -> bool {
583 for entry in &self.entries {
584 if entry.applies_to(useragent.as_ref()) {
585 return entry.allowance(url_str);
586 }
587 }
588 false
589 }
590
591 #[cfg(feature = "regex")]
593 pub fn entry_allowed<T: AsRef<str>>(&self, useragent: &T, url_str: &str) -> bool {
594 let agent_checked =
595 self.wild_card_agent || self.disallow_agents_regex.is_match(useragent.as_ref());
596 let disallow = agent_checked && self.disallow_paths_regex.is_match(url_str);
597
598 !disallow
599 }
600
601 pub fn get_crawl_delay(&self, useragent: &Option<Box<CompactString>>) -> Option<Duration> {
603 if self.last_checked == 0 {
604 None
605 } else {
606 let useragent = useragent.as_ref();
607 let crawl_delay: Option<Duration> = match useragent {
608 Some(ua) => {
609 for entry in &self.entries {
610 if entry.applies_to(ua) {
611 return entry.get_crawl_delay();
612 }
613 }
614 None
615 }
616 _ => None,
617 };
618
619 if crawl_delay.is_some() {
620 crawl_delay
621 } else {
622 let default_entry = &self.default_entry;
623
624 if !default_entry.is_empty() {
625 return default_entry.get_crawl_delay();
626 }
627
628 None
629 }
630 }
631 }
632
633 pub fn get_req_rate<T: AsRef<str>>(&self, useragent: T) -> Option<RequestRate> {
635 let useragent = useragent.as_ref();
636 if self.last_checked == 0 {
637 return None;
638 }
639 for entry in &self.entries {
640 if entry.applies_to(useragent) {
641 return entry.get_req_rate();
642 }
643 }
644 None
645 }
646}