robotparser_fork/parser/
robots_txt_parser.rs

1use crate::model::{CleanParams, PathPattern, RequestRate, RobotsTxt, Rule};
2use crate::parser::line::Line;
3use crate::parser::parse_result::ParseResult;
4use crate::parser::warning::ParseWarning;
5use std::time::Duration;
6use url::{Origin, Url};
7mod directive;
8use self::directive::Directive;
9mod group_builder;
10pub use self::group_builder::GroupBuilder;
11
/// Character that starts a comment; the parser drops everything from it to the end of the line.
const COMMENT_BEGIN_CHAR: char = '#';
/// Separator between a directive key and its value.
const KV_SEPARATOR: &str = ":";
14
15/// Parses the text of the robots.txt file located in the specified origin.
16pub fn parse(origin: Origin, input: &str) -> ParseResult<RobotsTxt> {
17    let parser = Parser::new(origin);
18    parser.parse(input)
19}
20
21struct Parser {
22    result: RobotsTxt,
23    group_builder: GroupBuilder,
24    warnings: Vec<ParseWarning>,
25}
26
27impl Parser {
28    pub fn new(origin: Origin) -> Parser {
29        Parser {
30            result: RobotsTxt::new(origin),
31            group_builder: GroupBuilder::new(),
32            warnings: Vec::new(),
33        }
34    }
35
36    pub fn parse(mut self, input: &str) -> ParseResult<RobotsTxt> {
37        let input = ignore_bom(input);
38        for (line_no, line) in input.lines().enumerate() {
39            let line = Line::new(line, line_no + 1);
40            match Self::parse_line(&line) {
41                Ok(Some(line_value)) => {
42                    self.process_line_value(&line, &line_value);
43                }
44                Err(warning) => {
45                    self.warnings.push(warning);
46                }
47                _ => {}
48            }
49        }
50        self.group_builder.fill_entries(&mut self.result);
51        ParseResult::new_with_warnings(self.result, self.warnings)
52    }
53
54    fn parse_line<'a>(line: &'a Line) -> Result<Option<Directive<'a>>, ParseWarning> {
55        let mut kv_part = line.get_line_text();
56        if let Some(comment_separator_position) = line.get_line_text().find(COMMENT_BEGIN_CHAR) {
57            kv_part = &kv_part[0..comment_separator_position];
58        }
59        if kv_part.is_empty() {
60            return Ok(None);
61        }
62        let separator_index = kv_part
63            .find(KV_SEPARATOR)
64            .ok_or_else(|| ParseWarning::invalid_directive_format(line))?;
65        if separator_index >= kv_part.len() {
66            return Err(ParseWarning::invalid_directive_format(line));
67        }
68        let key = &kv_part[0..separator_index];
69        let key = key.trim();
70        if key.is_empty() {
71            return Err(ParseWarning::directive_key_is_empty(line));
72        }
73        let value = &kv_part[separator_index + 1..];
74        let value = value.trim();
75        let result = Directive::new(key, value);
76        Ok(Some(result))
77    }
78
79    fn process_line_value(&mut self, line: &Line, directive: &Directive) {
80        let key = directive.get_key_lowercase();
81        match key.as_str() {
82            // Group specific directives
83            "user-agent" => {
84                self.process_directive_user_agent(line, directive);
85            }
86            "allow" => {
87                self.process_directive_allow(line, directive);
88            }
89            "disallow" => {
90                self.process_directive_disallow(line, directive);
91            }
92            "crawl-delay" => {
93                self.process_directive_crawl_delay(line, directive);
94            }
95            "request-rate" => {
96                self.process_directive_request_rate(line, directive);
97            }
98            // Non-group directives
99            "sitemap" => {
100                self.process_directive_sitemap(line, directive);
101            }
102            "clean-param" => {
103                self.process_directive_clean_param(line, directive);
104            }
105            _ => {
106                self.warnings.push(ParseWarning::unsupported_directive_key(line, key));
107            }
108        }
109    }
110
111    fn process_directive_user_agent(&mut self, line: &Line, directive: &Directive) {
112        let user_agent = directive.get_value();
113        if user_agent.is_empty() {
114            self.warnings.push(ParseWarning::user_agent_cannot_be_empty(line));
115            return;
116        }
117        self.group_builder.handle_user_agent(user_agent);
118    }
119
120    fn process_directive_allow(&mut self, line: &Line, directive: &Directive) {
121        if let Some(group) = self.group_builder.get_mut_active_group() {
122            if directive.get_value() == "" {
123                // Nothing to do. Ignoring.
124            } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
125                group.push_rule(Rule::new(directive.get_value(), true));
126            } else {
127                self.warnings.push(ParseWarning::wrong_path_format(line));
128            }
129        } else {
130            self.warnings.push(ParseWarning::directive_without_user_agent(line));
131        }
132    }
133
134    fn process_directive_disallow(&mut self, line: &Line, directive: &Directive) {
135        if let Some(group) = self.group_builder.get_mut_active_group() {
136            if directive.get_value() == "" {
137                // Allow all.
138                group.push_rule(Rule::new(PathPattern::all(), true));
139            } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
140                group.push_rule(Rule::new(directive.get_value(), false));
141            } else {
142                self.warnings.push(ParseWarning::wrong_path_format(line));
143            }
144        } else {
145            self.warnings.push(ParseWarning::directive_without_user_agent(line));
146        }
147    }
148
149    fn process_directive_crawl_delay(&mut self, line: &Line, directive: &Directive) {
150        if let Some(group) = self.group_builder.get_mut_active_group() {
151            match directive.get_value().parse::<f64>() {
152                Ok(delay) => {
153                    let delay_seconds = delay.trunc();
154                    let delay_nanoseconds = delay.fract() * 10f64.powi(9);
155                    let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
156                    group.set_crawl_delay(delay);
157                }
158                Err(error) => {
159                    self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error));
160                }
161            }
162        } else {
163            self.warnings.push(ParseWarning::directive_without_user_agent(line));
164        }
165    }
166
167    fn process_directive_request_rate(&mut self, line: &Line, directive: &Directive) {
168        if let Some(group) = self.group_builder.get_mut_active_group() {
169            let numbers: Vec<&str> = directive.get_value().split('/').collect();
170            if numbers.len() != 2 {
171                self.warnings.push(ParseWarning::wrong_request_rate_format(line));
172                return;
173            }
174            let requests = match numbers[0].parse::<usize>() {
175                Ok(requests) => requests,
176                Err(error) => {
177                    self.warnings.push(ParseWarning::parse_request_rate(line, error));
178                    return;
179                }
180            };
181            let seconds = match numbers[1].parse::<usize>() {
182                Ok(seconds) => seconds,
183                Err(error) => {
184                    self.warnings.push(ParseWarning::parse_request_rate(line, error));
185                    return;
186                }
187            };
188            group.set_req_rate(RequestRate { requests, seconds });
189        } else {
190            self.warnings.push(ParseWarning::directive_without_user_agent(line));
191        }
192    }
193
194    fn process_directive_sitemap(&mut self, line: &Line, directive: &Directive) {
195        match Url::parse(directive.get_value()) {
196            Ok(sitemap_url) => {
197                self.result.add_sitemap(sitemap_url);
198            }
199            Err(error) => {
200                self.warnings.push(ParseWarning::parse_url(line, error));
201            }
202        }
203    }
204
205    fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) {
206        let parts: Vec<&str> = directive.get_value().split_whitespace().collect();
207        if parts.len() >= 3 || parts.is_empty() {
208            self.warnings.push(ParseWarning::wrong_clean_param_format(line));
209            return;
210        }
211        if parts[0].is_empty() {
212            self.warnings.push(ParseWarning::wrong_clean_param_format(line));
213            return;
214        }
215        let clean_params_path_pattern;
216        let clean_params;
217        if let Some(second_param) = parts.get(1) {
218            if second_param.is_empty() {
219                self.warnings.push(ParseWarning::wrong_clean_param_format(line));
220                return;
221            }
222            clean_params_path_pattern = PathPattern::new(parts[0]);
223            clean_params = *second_param;
224        } else {
225            clean_params_path_pattern = PathPattern::all();
226            clean_params = parts[0];
227        }
228        let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params);
229        if !invalid_clean_params.is_empty() {
230            self.warnings
231                .push(ParseWarning::ignored_clean_params(line, invalid_clean_params));
232        }
233        self.result
234            .add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params));
235    }
236
237    fn parse_clean_params(clean_params: &str) -> (Vec<String>, Vec<String>) {
238        let mut valid = Vec::new();
239        let mut invalid = Vec::new();
240        for clean_param in clean_params.split('&') {
241            if !clean_param.is_empty() {
242                if Self::is_valid_clean_param(clean_param) {
243                    valid.push(clean_param.into());
244                } else {
245                    invalid.push(clean_param.into());
246                }
247            }
248        }
249        (valid, invalid)
250    }
251
252    fn is_valid_clean_param(clean_param: &str) -> bool {
253        for c in clean_param.chars() {
254            let is_valid = ('A'..'Z').contains(&c)
255                || ('a'..'z').contains(&c)
256                || ('0'..'9').contains(&c)
257                || c == '.'
258                || c == '-'
259                || c == '_';
260            if !is_valid {
261                return false;
262            }
263        }
264        true
265    }
266}
267
/// Returns `input` without any leading U+FEFF byte order mark characters.
///
/// Only the start of the string is affected; later occurrences are kept.
fn ignore_bom(input: &str) -> &str {
    input.trim_start_matches('\u{feff}')
}