1use crate::nodes::{
7 Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace,
8 SimpleWhitespace, TrailingWhitespace,
9};
10use memchr::{memchr2, memchr2_iter};
11use thiserror::Error;
12
13use crate::Token;
14
15use super::TokType;
16
17#[allow(clippy::upper_case_acronyms, clippy::enum_variant_names)]
18#[derive(Error, Debug, PartialEq, Eq)]
19pub enum WhitespaceError {
20 #[error("WTF")]
21 WTF,
22 #[error("Internal error while parsing whitespace: {0}")]
23 InternalError(String),
24 #[error("Failed to parse mandatory trailing whitespace")]
25 TrailingWhitespaceError,
26}
27
28type Result<T> = std::result::Result<T, WhitespaceError>;
29
/// Cursor of the whitespace parser within the input described by a `Config`.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct State<'a> {
    /// 1-based number of the current line.
    pub line: usize,
    /// Position within the current line, counted in characters.
    pub column: usize,
    /// Position within the current line, counted in bytes.
    pub column_byte: usize,
    /// Indentation string that lines are expected to start with at the current nesting level.
    pub absolute_indent: &'a str,
    /// Whether the cursor is inside parentheses, where whitespace may span several lines.
    pub is_parenthesized: bool,
    /// Position within the whole input, counted in bytes.
    pub byte_offset: usize,
}
39
40impl<'a> Default for State<'a> {
41 fn default() -> Self {
42 Self {
43 line: 1,
44 column: 0,
45 column_byte: 0,
46 absolute_indent: "",
47 is_parenthesized: false,
48 byte_offset: 0,
49 }
50 }
51}
52
/// Immutable description of the input that the whitespace parser works on.
pub struct Config<'a> {
    /// The complete source text.
    pub input: &'a str,
    /// `input` split into lines; each entry keeps its line terminator, if it has one.
    pub lines: Vec<&'a str>,
    /// Newline sequence treated as the default; newlines equal to it are not stored explicitly.
    pub default_newline: &'a str,
    /// Indentation string treated as the default for this input.
    pub default_indent: &'a str,
}
60
61impl<'a> Config<'a> {
62 pub fn new(input: &'a str, tokens: &[Token<'a>]) -> Self {
63 let mut default_indent = " ";
64 for tok in tokens {
65 if tok.r#type == TokType::Indent {
66 default_indent = tok.relative_indent.unwrap();
67 break;
68 }
69 }
70
71 let mut lines = Vec::new();
72 let mut start = 0;
73 let mut newline_positions = memchr2_iter(b'\n', b'\r', input.as_bytes());
74
75 while let Some(newline_position) = newline_positions.next() {
76 let newline_character = input.as_bytes()[newline_position] as char;
77
78 let len = if newline_character == '\r'
79 && input.as_bytes().get(newline_position + 1) == Some(&b'\n')
80 {
81 newline_positions.next();
83 2
84 } else {
85 1
86 };
87
88 let end = newline_position + len;
89 lines.push(&input[start..end]);
90 start = end;
91 }
92
93 if start < input.len() {
95 lines.push(&input[start..]);
96 }
97
98 let default_newline = match lines.first().map(|line| line.as_bytes()).unwrap_or(&[]) {
99 [.., b'\r', b'\n'] => "\r\n",
100 [.., b'\n'] => "\n",
101 [.., b'\r'] => "\r",
102 _ => "\n",
103 };
104
105 Self {
106 input,
107 lines,
108 default_newline,
109 default_indent,
110 }
111 }
112
113 pub fn has_trailing_newline(&self) -> bool {
114 self.input.ends_with('\n')
115 && !self.input.ends_with("\\\n")
116 && !self.input.ends_with("\\\r\n")
117 }
118
119 fn get_line(&self, line_number: usize) -> Result<&'a str> {
120 let err_fn = || {
121 WhitespaceError::InternalError(format!(
122 "tried to get line {} which is out of range",
123 line_number
124 ))
125 };
126 self.lines
127 .get(line_number.checked_sub(1).ok_or_else(err_fn)?)
128 .map(|l| &l[..])
129 .ok_or_else(err_fn)
130 }
131
132 fn get_line_after_column(&self, line_number: usize, column_index: usize) -> Result<&'a str> {
133 self.get_line(line_number)?
134 .get(column_index..)
135 .ok_or_else(|| {
136 WhitespaceError::InternalError(format!(
137 "Column index {} out of range for line {}",
138 column_index, line_number
139 ))
140 })
141 }
142}
143
144#[derive(Debug)]
145enum ParsedEmptyLine<'a> {
146 NoIndent,
147 Line(EmptyLine<'a>),
148}
149
150fn parse_empty_line<'a>(
151 config: &Config<'a>,
152 state: &mut State,
153 override_absolute_indent: Option<&'a str>,
154) -> Result<ParsedEmptyLine<'a>> {
155 let mut speculative_state = state.clone();
156 if let Ok(indent) = parse_indent(config, &mut speculative_state, override_absolute_indent) {
157 let whitespace = parse_simple_whitespace(config, &mut speculative_state)?;
158 let comment = parse_comment(config, &mut speculative_state)?;
159 if let Some(newline) = parse_newline(config, &mut speculative_state)? {
160 *state = speculative_state;
161 return Ok(ParsedEmptyLine::Line(EmptyLine {
162 indent,
163 whitespace,
164 comment,
165 newline,
166 }));
167 }
168 }
169 Ok(ParsedEmptyLine::NoIndent)
170}
171
172fn _parse_empty_lines<'a>(
173 config: &Config<'a>,
174 state: &mut State<'a>,
175 override_absolute_indent: Option<&'a str>,
176) -> Result<Vec<(State<'a>, EmptyLine<'a>)>> {
177 let mut lines = vec![];
178 loop {
179 let last_state = state.clone();
180 let parsed_line = parse_empty_line(config, state, override_absolute_indent)?;
181 if *state == last_state {
182 break;
183 }
184 match parsed_line {
185 ParsedEmptyLine::NoIndent => break,
186 ParsedEmptyLine::Line(l) => lines.push((state.clone(), l)),
187 }
188 }
189 Ok(lines)
190}
191
192pub fn parse_empty_lines<'a>(
193 config: &Config<'a>,
194 state: &mut State<'a>,
195 override_absolute_indent: Option<&'a str>,
196) -> Result<Vec<EmptyLine<'a>>> {
197 let mut speculative_state = state.clone();
205 let mut lines = _parse_empty_lines(config, &mut speculative_state, override_absolute_indent)?;
206
207 if override_absolute_indent.is_some() {
208 while let Some((_, empty_line)) = lines.last() {
210 if empty_line.indent {
211 break;
212 }
213 lines.pop();
214 }
215 }
216
217 if let Some((final_state, _)) = lines.last() {
218 *state = final_state.clone();
220 }
221
222 Ok(lines.into_iter().map(|(_, e)| e).collect())
223}
224
225pub fn parse_comment<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Comment<'a>>> {
226 let newline_after = config.get_line_after_column(state.line, state.column_byte)?;
227 if newline_after.as_bytes().first() != Some(&b'#') {
228 return Ok(None);
229 }
230 let comment_str = if let Some(idx) = memchr2(b'\n', b'\r', newline_after.as_bytes()) {
231 &newline_after[..idx]
232 } else {
233 newline_after
234 };
235 advance_this_line(
236 config,
237 state,
238 comment_str.chars().count(),
239 comment_str.len(),
240 )?;
241 Ok(Some(Comment(comment_str)))
242}
243
244pub fn parse_newline<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Newline<'a>>> {
245 let newline_after = config.get_line_after_column(state.line, state.column_byte)?;
246 let len = match newline_after.as_bytes() {
247 [b'\n', ..] => 1,
248 [b'\r', b'\n', ..] => 2,
249 [b'\r', ..] => 1,
250 _ => 0,
251 };
252 if len > 0 {
253 let newline_str = &newline_after[..len];
254 advance_this_line(config, state, len, len)?;
255 if state.column_byte != config.get_line(state.line)?.len() {
256 return Err(WhitespaceError::InternalError(format!(
257 "Found newline at ({}, {}) but it's not EOL",
258 state.line, state.column
259 )));
260 }
261 if state.line < config.lines.len() {
262 advance_to_next_line(config, state)?;
263 }
264 return Ok(Some(Newline(
265 if newline_str == config.default_newline {
266 None
267 } else {
268 Some(newline_str)
269 },
270 Fakeness::Real,
271 )));
272 }
273
274 if state.byte_offset == config.input.len() && state.column_byte != 0 {
277 return Ok(Some(Newline(None, Fakeness::Fake)));
278 }
279 Ok(None)
280}
281
282pub fn parse_optional_trailing_whitespace<'a>(
283 config: &Config<'a>,
284 state: &mut State,
285) -> Result<Option<TrailingWhitespace<'a>>> {
286 let mut speculative_state = state.clone();
287 let whitespace = parse_simple_whitespace(config, &mut speculative_state)?;
288 let comment = parse_comment(config, &mut speculative_state)?;
289 if let Some(newline) = parse_newline(config, &mut speculative_state)? {
290 *state = speculative_state;
291 Ok(Some(TrailingWhitespace {
292 whitespace,
293 comment,
294 newline,
295 }))
296 } else {
297 Ok(None)
298 }
299}
300
301pub fn parse_trailing_whitespace<'a>(
302 config: &Config<'a>,
303 state: &mut State,
304) -> Result<TrailingWhitespace<'a>> {
305 match parse_optional_trailing_whitespace(config, state)? {
306 Some(ws) => Ok(ws),
307 _ => Err(WhitespaceError::TrailingWhitespaceError),
308 }
309}
310
311fn parse_indent<'a>(
312 config: &Config<'a>,
313 state: &mut State,
314 override_absolute_indent: Option<&'a str>,
315) -> Result<bool> {
316 let absolute_indent = override_absolute_indent.unwrap_or(state.absolute_indent);
317 if state.column_byte != 0 {
318 if state.column_byte == config.get_line(state.line)?.len()
319 && state.line == config.lines.len()
320 {
321 Ok(false)
322 } else {
323 Err(WhitespaceError::InternalError(
324 "Column should not be 0 when parsing an index".to_string(),
325 ))
326 }
327 } else {
328 Ok(
329 if config
330 .get_line_after_column(state.line, state.column_byte)?
331 .starts_with(absolute_indent)
332 {
333 state.column_byte += absolute_indent.len();
334 state.column += absolute_indent.chars().count();
335 state.byte_offset += absolute_indent.len();
336 true
337 } else {
338 false
339 },
340 )
341 }
342}
343
344fn advance_to_next_line<'a>(config: &Config<'a>, state: &mut State) -> Result<()> {
345 let cur_line = config.get_line(state.line)?;
346 state.byte_offset += cur_line.len() - state.column_byte;
347 state.column = 0;
348 state.column_byte = 0;
349 state.line += 1;
350 Ok(())
351}
352
353fn advance_this_line<'a>(
354 config: &Config<'a>,
355 state: &mut State,
356 char_count: usize,
357 offset: usize,
358) -> Result<()> {
359 let cur_line = config.get_line(state.line)?;
360 if cur_line.len() < state.column_byte + offset {
361 return Err(WhitespaceError::InternalError(format!(
362 "Tried to advance past line {}'s end",
363 state.line
364 )));
365 }
366 state.column += char_count;
367 state.column_byte += offset;
368 state.byte_offset += offset;
369 Ok(())
370}
371
372pub fn parse_simple_whitespace<'a>(
373 config: &Config<'a>,
374 state: &mut State,
375) -> Result<SimpleWhitespace<'a>> {
376 let capture_ws = |line, col| -> Result<&'a str> {
377 let line = config.get_line_after_column(line, col)?;
378 let bytes = line.as_bytes();
379 let mut idx = 0;
380 while idx < bytes.len() {
381 match bytes[idx..] {
382 [b' ' | b'\t' | b'\x0c', ..] => idx += 1,
383 [b'\\', b'\r', b'\n', ..] => idx += 3,
384 [b'\\', b'\r' | b'\n', ..] => idx += 2,
385 _ => break,
386 }
387 }
388 Ok(&line[..idx])
389 };
390 let start_offset = state.byte_offset;
391 let mut prev_line: &str;
392 loop {
393 prev_line = capture_ws(state.line, state.column_byte)?;
394 if !prev_line.contains('\\') {
395 break;
396 }
397 advance_to_next_line(config, state)?;
398 }
399 advance_this_line(config, state, prev_line.chars().count(), prev_line.len())?;
400
401 Ok(SimpleWhitespace(
402 &config.input[start_offset..state.byte_offset],
403 ))
404}
405
406pub fn parse_parenthesizable_whitespace<'a>(
407 config: &Config<'a>,
408 state: &mut State<'a>,
409) -> Result<ParenthesizableWhitespace<'a>> {
410 if state.is_parenthesized {
411 if let Some(ws) = parse_parenthesized_whitespace(config, state)? {
412 return Ok(ParenthesizableWhitespace::ParenthesizedWhitespace(ws));
413 }
414 }
415 parse_simple_whitespace(config, state).map(ParenthesizableWhitespace::SimpleWhitespace)
416}
417
418pub fn parse_parenthesized_whitespace<'a>(
419 config: &Config<'a>,
420 state: &mut State<'a>,
421) -> Result<Option<ParenthesizedWhitespace<'a>>> {
422 if let Some(first_line) = parse_optional_trailing_whitespace(config, state)? {
423 let empty_lines = _parse_empty_lines(config, state, None)?
424 .into_iter()
425 .map(|(_, line)| line)
426 .collect();
427 let indent = parse_indent(config, state, None)?;
428 let last_line = parse_simple_whitespace(config, state)?;
429 Ok(Some(ParenthesizedWhitespace {
430 first_line,
431 empty_lines,
432 indent,
433 last_line,
434 }))
435 } else {
436 Ok(None)
437 }
438}
439
#[cfg(test)]
mod tests {
    use super::{parse_comment, parse_simple_whitespace};
    use crate::{tokenize, Comment, Config, Result, SimpleWhitespace};

    /// `Config::new` must split lines on `\n`, `\r` and `\r\n` alike, keeping the terminators.
    #[test]
    fn config_mixed_newlines() -> Result<'static, ()> {
        let source = "'' % {\n'test1': '',\r 'test2': '',\r\n}";
        let tokens = tokenize(source)?;
        let config = Config::new(source, &tokens);

        let expected = ["'' % {\n", "'test1': '',\r", " 'test2': '',\r\n", "}"];
        assert_eq!(&config.lines, &expected);

        Ok(())
    }

    /// Parses simple whitespace from the very start of `src`.
    fn _parse_simple_whitespace(src: &str) -> Result<SimpleWhitespace> {
        let tokens = tokenize(src)?;
        let config = Config::new(src, &tokens);
        let mut state = super::State::default();
        let parsed = parse_simple_whitespace(&config, &mut state)?;
        Ok(parsed)
    }

    /// Line continuations with every kind of terminator belong to the whitespace.
    #[test]
    fn simple_whitespace_line_continuations() -> Result<'static, ()> {
        let cases = [
            (" \\\n # foo", " \\\n "),
            (" \\\r # foo", " \\\r "),
            (" \\\r\n # foo", " \\\r\n "),
            (" \\\r\n\\\n # foo", " \\\r\n\\\n "),
        ];
        for &(src, expected) in &cases {
            assert_eq!(_parse_simple_whitespace(src)?, SimpleWhitespace(expected));
        }

        Ok(())
    }

    /// Spaces, tabs and form feeds are all simple whitespace.
    #[test]
    fn simple_whitespace_mixed() -> Result<'static, ()> {
        let parsed = _parse_simple_whitespace(" \t\x0clol")?;
        assert_eq!(parsed, SimpleWhitespace(" \t\x0c"));

        Ok(())
    }

    /// Parses a comment from the very start of `src`.
    fn _parse_comment(src: &str) -> Result<Option<Comment>> {
        let tokens = tokenize(src)?;
        let config = Config::new(src, &tokens);
        let mut state = super::State::default();
        let parsed = parse_comment(&config, &mut state)?;
        Ok(parsed)
    }

    /// A comment stops at the end of its line.
    #[test]
    fn single_comment() -> Result<'static, ()> {
        let parsed = _parse_comment("# foo\n# bar")?;
        assert_eq!(parsed, Some(Comment("# foo")));
        Ok(())
    }

    /// A comment without a terminating newline runs to the end of the input.
    #[test]
    fn comment_until_eof() -> Result<'static, ()> {
        let parsed = _parse_comment("#")?;
        assert_eq!(parsed, Some(Comment("#")));
        Ok(())
    }

    /// Input that does not start with `#` yields no comment.
    #[test]
    fn no_comment() -> Result<'static, ()> {
        for &src in &["foo", "\n"] {
            assert_eq!(_parse_comment(src)?, None);
        }
        Ok(())
    }
}