robotparser_fork/parser/
robots_txt_parser.rs1use crate::model::{CleanParams, PathPattern, RequestRate, RobotsTxt, Rule};
2use crate::parser::line::Line;
3use crate::parser::parse_result::ParseResult;
4use crate::parser::warning::ParseWarning;
5use std::time::Duration;
6use url::{Origin, Url};
7mod directive;
8use self::directive::Directive;
9mod group_builder;
10pub use self::group_builder::GroupBuilder;
11
/// Separator between a directive key and its value; `parse_line` splits at its first occurrence.
const KV_SEPARATOR: &str = ":";
/// Character that opens a comment; `parse_line` discards it and the rest of the line.
const COMMENT_BEGIN_CHAR: char = '#';
14
15pub fn parse(origin: Origin, input: &str) -> ParseResult<RobotsTxt> {
17 let parser = Parser::new(origin);
18 parser.parse(input)
19}
20
21struct Parser {
22 result: RobotsTxt,
23 group_builder: GroupBuilder,
24 warnings: Vec<ParseWarning>,
25}
26
27impl Parser {
28 pub fn new(origin: Origin) -> Parser {
29 Parser {
30 result: RobotsTxt::new(origin),
31 group_builder: GroupBuilder::new(),
32 warnings: Vec::new(),
33 }
34 }
35
36 pub fn parse(mut self, input: &str) -> ParseResult<RobotsTxt> {
37 let input = ignore_bom(input);
38 for (line_no, line) in input.lines().enumerate() {
39 let line = Line::new(line, line_no + 1);
40 match Self::parse_line(&line) {
41 Ok(Some(line_value)) => {
42 self.process_line_value(&line, &line_value);
43 }
44 Err(warning) => {
45 self.warnings.push(warning);
46 }
47 _ => {}
48 }
49 }
50 self.group_builder.fill_entries(&mut self.result);
51 ParseResult::new_with_warnings(self.result, self.warnings)
52 }
53
54 fn parse_line<'a>(line: &'a Line) -> Result<Option<Directive<'a>>, ParseWarning> {
55 let mut kv_part = line.get_line_text();
56 if let Some(comment_separator_position) = line.get_line_text().find(COMMENT_BEGIN_CHAR) {
57 kv_part = &kv_part[0..comment_separator_position];
58 }
59 if kv_part.is_empty() {
60 return Ok(None);
61 }
62 let separator_index = kv_part
63 .find(KV_SEPARATOR)
64 .ok_or_else(|| ParseWarning::invalid_directive_format(line))?;
65 if separator_index >= kv_part.len() {
66 return Err(ParseWarning::invalid_directive_format(line));
67 }
68 let key = &kv_part[0..separator_index];
69 let key = key.trim();
70 if key.is_empty() {
71 return Err(ParseWarning::directive_key_is_empty(line));
72 }
73 let value = &kv_part[separator_index + 1..];
74 let value = value.trim();
75 let result = Directive::new(key, value);
76 Ok(Some(result))
77 }
78
79 fn process_line_value(&mut self, line: &Line, directive: &Directive) {
80 let key = directive.get_key_lowercase();
81 match key.as_str() {
82 "user-agent" => {
84 self.process_directive_user_agent(line, directive);
85 }
86 "allow" => {
87 self.process_directive_allow(line, directive);
88 }
89 "disallow" => {
90 self.process_directive_disallow(line, directive);
91 }
92 "crawl-delay" => {
93 self.process_directive_crawl_delay(line, directive);
94 }
95 "request-rate" => {
96 self.process_directive_request_rate(line, directive);
97 }
98 "sitemap" => {
100 self.process_directive_sitemap(line, directive);
101 }
102 "clean-param" => {
103 self.process_directive_clean_param(line, directive);
104 }
105 _ => {
106 self.warnings.push(ParseWarning::unsupported_directive_key(line, key));
107 }
108 }
109 }
110
111 fn process_directive_user_agent(&mut self, line: &Line, directive: &Directive) {
112 let user_agent = directive.get_value();
113 if user_agent.is_empty() {
114 self.warnings.push(ParseWarning::user_agent_cannot_be_empty(line));
115 return;
116 }
117 self.group_builder.handle_user_agent(user_agent);
118 }
119
120 fn process_directive_allow(&mut self, line: &Line, directive: &Directive) {
121 if let Some(group) = self.group_builder.get_mut_active_group() {
122 if directive.get_value() == "" {
123 } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
125 group.push_rule(Rule::new(directive.get_value(), true));
126 } else {
127 self.warnings.push(ParseWarning::wrong_path_format(line));
128 }
129 } else {
130 self.warnings.push(ParseWarning::directive_without_user_agent(line));
131 }
132 }
133
134 fn process_directive_disallow(&mut self, line: &Line, directive: &Directive) {
135 if let Some(group) = self.group_builder.get_mut_active_group() {
136 if directive.get_value() == "" {
137 group.push_rule(Rule::new(PathPattern::all(), true));
139 } else if directive.get_value().starts_with('*') || directive.get_value().starts_with('/') {
140 group.push_rule(Rule::new(directive.get_value(), false));
141 } else {
142 self.warnings.push(ParseWarning::wrong_path_format(line));
143 }
144 } else {
145 self.warnings.push(ParseWarning::directive_without_user_agent(line));
146 }
147 }
148
149 fn process_directive_crawl_delay(&mut self, line: &Line, directive: &Directive) {
150 if let Some(group) = self.group_builder.get_mut_active_group() {
151 match directive.get_value().parse::<f64>() {
152 Ok(delay) => {
153 let delay_seconds = delay.trunc();
154 let delay_nanoseconds = delay.fract() * 10f64.powi(9);
155 let delay = Duration::new(delay_seconds as u64, delay_nanoseconds as u32);
156 group.set_crawl_delay(delay);
157 }
158 Err(error) => {
159 self.warnings.push(ParseWarning::parse_crawl_delay_error(line, error));
160 }
161 }
162 } else {
163 self.warnings.push(ParseWarning::directive_without_user_agent(line));
164 }
165 }
166
167 fn process_directive_request_rate(&mut self, line: &Line, directive: &Directive) {
168 if let Some(group) = self.group_builder.get_mut_active_group() {
169 let numbers: Vec<&str> = directive.get_value().split('/').collect();
170 if numbers.len() != 2 {
171 self.warnings.push(ParseWarning::wrong_request_rate_format(line));
172 return;
173 }
174 let requests = match numbers[0].parse::<usize>() {
175 Ok(requests) => requests,
176 Err(error) => {
177 self.warnings.push(ParseWarning::parse_request_rate(line, error));
178 return;
179 }
180 };
181 let seconds = match numbers[1].parse::<usize>() {
182 Ok(seconds) => seconds,
183 Err(error) => {
184 self.warnings.push(ParseWarning::parse_request_rate(line, error));
185 return;
186 }
187 };
188 group.set_req_rate(RequestRate { requests, seconds });
189 } else {
190 self.warnings.push(ParseWarning::directive_without_user_agent(line));
191 }
192 }
193
194 fn process_directive_sitemap(&mut self, line: &Line, directive: &Directive) {
195 match Url::parse(directive.get_value()) {
196 Ok(sitemap_url) => {
197 self.result.add_sitemap(sitemap_url);
198 }
199 Err(error) => {
200 self.warnings.push(ParseWarning::parse_url(line, error));
201 }
202 }
203 }
204
205 fn process_directive_clean_param(&mut self, line: &Line, directive: &Directive) {
206 let parts: Vec<&str> = directive.get_value().split_whitespace().collect();
207 if parts.len() >= 3 || parts.is_empty() {
208 self.warnings.push(ParseWarning::wrong_clean_param_format(line));
209 return;
210 }
211 if parts[0].is_empty() {
212 self.warnings.push(ParseWarning::wrong_clean_param_format(line));
213 return;
214 }
215 let clean_params_path_pattern;
216 let clean_params;
217 if let Some(second_param) = parts.get(1) {
218 if second_param.is_empty() {
219 self.warnings.push(ParseWarning::wrong_clean_param_format(line));
220 return;
221 }
222 clean_params_path_pattern = PathPattern::new(parts[0]);
223 clean_params = *second_param;
224 } else {
225 clean_params_path_pattern = PathPattern::all();
226 clean_params = parts[0];
227 }
228 let (valid_clean_params, invalid_clean_params) = Self::parse_clean_params(clean_params);
229 if !invalid_clean_params.is_empty() {
230 self.warnings
231 .push(ParseWarning::ignored_clean_params(line, invalid_clean_params));
232 }
233 self.result
234 .add_clean_params(CleanParams::new(clean_params_path_pattern, valid_clean_params));
235 }
236
237 fn parse_clean_params(clean_params: &str) -> (Vec<String>, Vec<String>) {
238 let mut valid = Vec::new();
239 let mut invalid = Vec::new();
240 for clean_param in clean_params.split('&') {
241 if !clean_param.is_empty() {
242 if Self::is_valid_clean_param(clean_param) {
243 valid.push(clean_param.into());
244 } else {
245 invalid.push(clean_param.into());
246 }
247 }
248 }
249 (valid, invalid)
250 }
251
/// Returns `true` when every character of `clean_param` is an ASCII letter,
/// an ASCII digit, `.`, `-` or `_`.
///
/// An empty string is considered valid, because no character violates the
/// rule.
fn is_valid_clean_param(clean_param: &str) -> bool {
    // `is_ascii_alphanumeric` covers the full A-Z, a-z and 0-9 ranges,
    // including the end points that half-open ranges would leave out.
    clean_param
        .chars()
        .all(|c| c.is_ascii_alphanumeric() || c == '.' || c == '-' || c == '_')
}
266}
267
/// Returns `input` with every leading U+FEFF byte-order mark removed.
///
/// Marks that appear after the first other character are left in place.
fn ignore_bom(input: &str) -> &str {
    const BOM: char = '\u{feff}';
    input.trim_start_matches(BOM)
}