robotparser_fork/service/robots_txt.rs

1use crate::model::Path;
2use crate::model::RequestRate;
3use crate::model::RobotsTxt;
4use crate::service::RobotsTxtService;
5use std::time::Duration;
6use url::Url;
7
8impl RobotsTxtService for RobotsTxt {
9    fn can_fetch(&self, user_agent: &str, url: &Url) -> bool {
10        if url.origin() != *self.get_origin() {
11            return false;
12        }
13        let path = Path::from_url(url);
14        let rule_decision = self.find_in_group(user_agent, |group| {
15            let rules = group.get_rules_sorted_by_path_len_desc();
16            for rule in rules.iter() {
17                if rule.applies_to(&path) {
18                    return Some(rule.get_allowance());
19                }
20            }
21            None
22        });
23        if let Some(rule_decision) = rule_decision {
24            return rule_decision;
25        }
26        // Empty robots.txt allows crawling. Everything that was not denied must be allowed.
27        true
28    }
29
30    fn get_crawl_delay(&self, user_agent: &str) -> Option<Duration> {
31        self.find_in_group(user_agent, |group| group.get_crawl_delay())
32    }
33
34    fn normalize_url(&self, url: &mut Url) -> bool {
35        if url.origin() != *self.get_origin() {
36            return false;
37        }
38        self.normalize_url_ignore_origin(url);
39        true
40    }
41
42    fn normalize_url_ignore_origin(&self, url: &mut Url) {
43        if url.query().is_none() {
44            return;
45        }
46        let mut query_params_to_filter = Vec::new();
47        let path = Path::from_url(url);
48        for clean_params in self.get_clean_params().iter() {
49            if clean_params.get_path_pattern().applies_to(&path) {
50                query_params_to_filter.extend_from_slice(clean_params.get_params())
51            }
52        }
53        let mut pairs: Vec<(String, String)> = url
54            .query_pairs()
55            .map(|(key, value)| (key.into(), value.into()))
56            .collect();
57        {
58            let mut query_pairs_mut = url.query_pairs_mut();
59            query_pairs_mut.clear();
60            for (key, value) in pairs.drain(..) {
61                if !query_params_to_filter.contains(&key) {
62                    query_pairs_mut.append_pair(&key, &value);
63                }
64            }
65        }
66        if url.query() == Some("") {
67            url.set_query(None);
68        }
69    }
70
71    fn get_sitemaps(&self) -> &[Url] {
72        self.get_sitemaps_slice()
73    }
74
75    fn get_req_rate(&self, user_agent: &str) -> Option<RequestRate> {
76        self.find_in_group(user_agent, |group| group.get_req_rate())
77    }
78}