robotparser_fork/service/
robots_txt.rs1use crate::model::Path;
2use crate::model::RequestRate;
3use crate::model::RobotsTxt;
4use crate::service::RobotsTxtService;
5use std::time::Duration;
6use url::Url;
7
8impl RobotsTxtService for RobotsTxt {
9 fn can_fetch(&self, user_agent: &str, url: &Url) -> bool {
10 if url.origin() != *self.get_origin() {
11 return false;
12 }
13 let path = Path::from_url(url);
14 let rule_decision = self.find_in_group(user_agent, |group| {
15 let rules = group.get_rules_sorted_by_path_len_desc();
16 for rule in rules.iter() {
17 if rule.applies_to(&path) {
18 return Some(rule.get_allowance());
19 }
20 }
21 None
22 });
23 if let Some(rule_decision) = rule_decision {
24 return rule_decision;
25 }
26 true
28 }
29
30 fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> {
31 self.find_in_group(user_agent, |group| group.get_crawl_delay())
32 }
33
34 fn normalize_url(&self, url: &mut Url) -> bool {
35 if url.origin() != *self.get_origin() {
36 return false;
37 }
38 self.normalize_url_ignore_origin(url);
39 true
40 }
41
42 fn normalize_url_ignore_origin(&self, url: &mut Url) {
43 if url.query().is_none() {
44 return;
45 }
46 let mut query_params_to_filter = Vec::new();
47 let path = Path::from_url(url);
48 for clean_params in self.get_clean_params().iter() {
49 if clean_params.get_path_pattern().applies_to(&path) {
50 query_params_to_filter.extend_from_slice(clean_params.get_params())
51 }
52 }
53 let mut pairs: Vec<(String, String)> = url
54 .query_pairs()
55 .map(|(key, value)| (key.into(), value.into()))
56 .collect();
57 {
58 let mut query_pairs_mut = url.query_pairs_mut();
59 query_pairs_mut.clear();
60 for (key, value) in pairs.drain(..) {
61 if !query_params_to_filter.contains(&key) {
62 query_pairs_mut.append_pair(&key, &value);
63 }
64 }
65 }
66 if url.query() == Some("") {
67 url.set_query(None);
68 }
69 }
70
71 fn get_sitemaps(&self) -> &[Url] {
72 self.get_sitemaps_slice()
73 }
74
75 fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> {
76 self.find_in_group(user_agent, |group| group.get_req_rate())
77 }
78}