libcst_native/tokenizer/
whitespace_parser.rs

1// Copyright (c) Meta Platforms, Inc. and affiliates.
2//
3// This source code is licensed under the MIT license found in the
4// LICENSE file in the root directory of this source tree
5
6use crate::nodes::{
7    Comment, EmptyLine, Fakeness, Newline, ParenthesizableWhitespace, ParenthesizedWhitespace,
8    SimpleWhitespace, TrailingWhitespace,
9};
10use memchr::{memchr2, memchr2_iter};
11use thiserror::Error;
12
13use crate::Token;
14
15use super::TokType;
16
17#[allow(clippy::upper_case_acronyms, clippy::enum_variant_names)]
18#[derive(Error, Debug, PartialEq, Eq)]
19pub enum WhitespaceError {
20    #[error("WTF")]
21    WTF,
22    #[error("Internal error while parsing whitespace: {0}")]
23    InternalError(String),
24    #[error("Failed to parse mandatory trailing whitespace")]
25    TrailingWhitespaceError,
26}
27
28type Result<T> = std::result::Result<T, WhitespaceError>;
29
/// Cursor into the source text that the whitespace parsing functions read and advance.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct State<'a> {
    /// Current line number; one-indexed (to match parso's behavior).
    pub line: usize,
    /// Current column, counted in characters; zero-indexed (to match parso's behavior).
    pub column: usize,
    /// Current column, counted in bytes from the start of the current line.
    pub column_byte: usize,
    /// Indentation string expected at the start of a line when no override is supplied.
    pub absolute_indent: &'a str,
    /// When set, whitespace is first attempted as parenthesized (multi-line) whitespace.
    pub is_parenthesized: bool,
    /// Position of the cursor as a byte offset into the whole input.
    pub byte_offset: usize,
}
39
40impl<'a> Default for State<'a> {
41    fn default() -> Self {
42        Self {
43            line: 1,
44            column: 0,
45            column_byte: 0,
46            absolute_indent: "",
47            is_parenthesized: false,
48            byte_offset: 0,
49        }
50    }
51}
52
// TODO
/// Immutable, per-input data shared by the whitespace parsing functions.
pub struct Config<'a> {
    /// The complete source text.
    pub input: &'a str,
    /// `input` split into lines; each line keeps its terminating newline sequence, if any.
    pub lines: Vec<&'a str>,
    /// Newline sequence treated as the default one for this input.
    pub default_newline: &'a str,
    /// Indentation string treated as the default one for this input.
    pub default_indent: &'a str,
}
60
61impl<'a> Config<'a> {
62    pub fn new(input: &'a str, tokens: &[Token<'a>]) -> Self {
63        let mut default_indent = "    ";
64        for tok in tokens {
65            if tok.r#type == TokType::Indent {
66                default_indent = tok.relative_indent.unwrap();
67                break;
68            }
69        }
70
71        let mut lines = Vec::new();
72        let mut start = 0;
73        let mut newline_positions = memchr2_iter(b'\n', b'\r', input.as_bytes());
74
75        while let Some(newline_position) = newline_positions.next() {
76            let newline_character = input.as_bytes()[newline_position] as char;
77
78            let len = if newline_character == '\r'
79                && input.as_bytes().get(newline_position + 1) == Some(&b'\n')
80            {
81                // Skip the next '\n'
82                newline_positions.next();
83                2
84            } else {
85                1
86            };
87
88            let end = newline_position + len;
89            lines.push(&input[start..end]);
90            start = end;
91        }
92
93        // Push the last line if it isn't terminated by a newline character
94        if start < input.len() {
95            lines.push(&input[start..]);
96        }
97
98        let default_newline = match lines.first().map(|line| line.as_bytes()).unwrap_or(&[]) {
99            [.., b'\r', b'\n'] => "\r\n",
100            [.., b'\n'] => "\n",
101            [.., b'\r'] => "\r",
102            _ => "\n",
103        };
104
105        Self {
106            input,
107            lines,
108            default_newline,
109            default_indent,
110        }
111    }
112
113    pub fn has_trailing_newline(&self) -> bool {
114        self.input.ends_with('\n')
115            && !self.input.ends_with("\\\n")
116            && !self.input.ends_with("\\\r\n")
117    }
118
119    fn get_line(&self, line_number: usize) -> Result<&'a str> {
120        let err_fn = || {
121            WhitespaceError::InternalError(format!(
122                "tried to get line {} which is out of range",
123                line_number
124            ))
125        };
126        self.lines
127            .get(line_number.checked_sub(1).ok_or_else(err_fn)?)
128            .map(|l| &l[..])
129            .ok_or_else(err_fn)
130    }
131
132    fn get_line_after_column(&self, line_number: usize, column_index: usize) -> Result<&'a str> {
133        self.get_line(line_number)?
134            .get(column_index..)
135            .ok_or_else(|| {
136                WhitespaceError::InternalError(format!(
137                    "Column index {} out of range for line {}",
138                    column_index, line_number
139                ))
140            })
141    }
142}
143
144#[derive(Debug)]
145enum ParsedEmptyLine<'a> {
146    NoIndent,
147    Line(EmptyLine<'a>),
148}
149
150fn parse_empty_line<'a>(
151    config: &Config<'a>,
152    state: &mut State,
153    override_absolute_indent: Option<&'a str>,
154) -> Result<ParsedEmptyLine<'a>> {
155    let mut speculative_state = state.clone();
156    if let Ok(indent) = parse_indent(config, &mut speculative_state, override_absolute_indent) {
157        let whitespace = parse_simple_whitespace(config, &mut speculative_state)?;
158        let comment = parse_comment(config, &mut speculative_state)?;
159        if let Some(newline) = parse_newline(config, &mut speculative_state)? {
160            *state = speculative_state;
161            return Ok(ParsedEmptyLine::Line(EmptyLine {
162                indent,
163                whitespace,
164                comment,
165                newline,
166            }));
167        }
168    }
169    Ok(ParsedEmptyLine::NoIndent)
170}
171
172fn _parse_empty_lines<'a>(
173    config: &Config<'a>,
174    state: &mut State<'a>,
175    override_absolute_indent: Option<&'a str>,
176) -> Result<Vec<(State<'a>, EmptyLine<'a>)>> {
177    let mut lines = vec![];
178    loop {
179        let last_state = state.clone();
180        let parsed_line = parse_empty_line(config, state, override_absolute_indent)?;
181        if *state == last_state {
182            break;
183        }
184        match parsed_line {
185            ParsedEmptyLine::NoIndent => break,
186            ParsedEmptyLine::Line(l) => lines.push((state.clone(), l)),
187        }
188    }
189    Ok(lines)
190}
191
192pub fn parse_empty_lines<'a>(
193    config: &Config<'a>,
194    state: &mut State<'a>,
195    override_absolute_indent: Option<&'a str>,
196) -> Result<Vec<EmptyLine<'a>>> {
197    // If override_absolute_indent is Some, then we need to parse all lines up to and including the
198    // last line that is indented at our level. These all belong to the footer and not to the next
199    // line's leading_lines.
200    //
201    // We don't know what the last line with indent=True is, and there could be indent=False lines
202    // interspersed with indent=True lines, so we need to speculatively parse all possible empty
203    // lines, and then unwind to find the last empty line with indent=True.
204    let mut speculative_state = state.clone();
205    let mut lines = _parse_empty_lines(config, &mut speculative_state, override_absolute_indent)?;
206
207    if override_absolute_indent.is_some() {
208        // Remove elements from the end until we find an indented line.
209        while let Some((_, empty_line)) = lines.last() {
210            if empty_line.indent {
211                break;
212            }
213            lines.pop();
214        }
215    }
216
217    if let Some((final_state, _)) = lines.last() {
218        // update the state to match the last line that we captured
219        *state = final_state.clone();
220    }
221
222    Ok(lines.into_iter().map(|(_, e)| e).collect())
223}
224
225pub fn parse_comment<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Comment<'a>>> {
226    let newline_after = config.get_line_after_column(state.line, state.column_byte)?;
227    if newline_after.as_bytes().first() != Some(&b'#') {
228        return Ok(None);
229    }
230    let comment_str = if let Some(idx) = memchr2(b'\n', b'\r', newline_after.as_bytes()) {
231        &newline_after[..idx]
232    } else {
233        newline_after
234    };
235    advance_this_line(
236        config,
237        state,
238        comment_str.chars().count(),
239        comment_str.len(),
240    )?;
241    Ok(Some(Comment(comment_str)))
242}
243
244pub fn parse_newline<'a>(config: &Config<'a>, state: &mut State) -> Result<Option<Newline<'a>>> {
245    let newline_after = config.get_line_after_column(state.line, state.column_byte)?;
246    let len = match newline_after.as_bytes() {
247        [b'\n', ..] => 1,
248        [b'\r', b'\n', ..] => 2,
249        [b'\r', ..] => 1,
250        _ => 0,
251    };
252    if len > 0 {
253        let newline_str = &newline_after[..len];
254        advance_this_line(config, state, len, len)?;
255        if state.column_byte != config.get_line(state.line)?.len() {
256            return Err(WhitespaceError::InternalError(format!(
257                "Found newline at ({}, {}) but it's not EOL",
258                state.line, state.column
259            )));
260        }
261        if state.line < config.lines.len() {
262            advance_to_next_line(config, state)?;
263        }
264        return Ok(Some(Newline(
265            if newline_str == config.default_newline {
266                None
267            } else {
268                Some(newline_str)
269            },
270            Fakeness::Real,
271        )));
272    }
273
274    // If we're at the end of the file but not on BOL, that means this is the fake
275    // newline inserted by the tokenizer.
276    if state.byte_offset == config.input.len() && state.column_byte != 0 {
277        return Ok(Some(Newline(None, Fakeness::Fake)));
278    }
279    Ok(None)
280}
281
282pub fn parse_optional_trailing_whitespace<'a>(
283    config: &Config<'a>,
284    state: &mut State,
285) -> Result<Option<TrailingWhitespace<'a>>> {
286    let mut speculative_state = state.clone();
287    let whitespace = parse_simple_whitespace(config, &mut speculative_state)?;
288    let comment = parse_comment(config, &mut speculative_state)?;
289    if let Some(newline) = parse_newline(config, &mut speculative_state)? {
290        *state = speculative_state;
291        Ok(Some(TrailingWhitespace {
292            whitespace,
293            comment,
294            newline,
295        }))
296    } else {
297        Ok(None)
298    }
299}
300
301pub fn parse_trailing_whitespace<'a>(
302    config: &Config<'a>,
303    state: &mut State,
304) -> Result<TrailingWhitespace<'a>> {
305    match parse_optional_trailing_whitespace(config, state)? {
306        Some(ws) => Ok(ws),
307        _ => Err(WhitespaceError::TrailingWhitespaceError),
308    }
309}
310
311fn parse_indent<'a>(
312    config: &Config<'a>,
313    state: &mut State,
314    override_absolute_indent: Option<&'a str>,
315) -> Result<bool> {
316    let absolute_indent = override_absolute_indent.unwrap_or(state.absolute_indent);
317    if state.column_byte != 0 {
318        if state.column_byte == config.get_line(state.line)?.len()
319            && state.line == config.lines.len()
320        {
321            Ok(false)
322        } else {
323            Err(WhitespaceError::InternalError(
324                "Column should not be 0 when parsing an index".to_string(),
325            ))
326        }
327    } else {
328        Ok(
329            if config
330                .get_line_after_column(state.line, state.column_byte)?
331                .starts_with(absolute_indent)
332            {
333                state.column_byte += absolute_indent.len();
334                state.column += absolute_indent.chars().count();
335                state.byte_offset += absolute_indent.len();
336                true
337            } else {
338                false
339            },
340        )
341    }
342}
343
344fn advance_to_next_line<'a>(config: &Config<'a>, state: &mut State) -> Result<()> {
345    let cur_line = config.get_line(state.line)?;
346    state.byte_offset += cur_line.len() - state.column_byte;
347    state.column = 0;
348    state.column_byte = 0;
349    state.line += 1;
350    Ok(())
351}
352
353fn advance_this_line<'a>(
354    config: &Config<'a>,
355    state: &mut State,
356    char_count: usize,
357    offset: usize,
358) -> Result<()> {
359    let cur_line = config.get_line(state.line)?;
360    if cur_line.len() < state.column_byte + offset {
361        return Err(WhitespaceError::InternalError(format!(
362            "Tried to advance past line {}'s end",
363            state.line
364        )));
365    }
366    state.column += char_count;
367    state.column_byte += offset;
368    state.byte_offset += offset;
369    Ok(())
370}
371
372pub fn parse_simple_whitespace<'a>(
373    config: &Config<'a>,
374    state: &mut State,
375) -> Result<SimpleWhitespace<'a>> {
376    let capture_ws = |line, col| -> Result<&'a str> {
377        let line = config.get_line_after_column(line, col)?;
378        let bytes = line.as_bytes();
379        let mut idx = 0;
380        while idx < bytes.len() {
381            match bytes[idx..] {
382                [b' ' | b'\t' | b'\x0c', ..] => idx += 1,
383                [b'\\', b'\r', b'\n', ..] => idx += 3,
384                [b'\\', b'\r' | b'\n', ..] => idx += 2,
385                _ => break,
386            }
387        }
388        Ok(&line[..idx])
389    };
390    let start_offset = state.byte_offset;
391    let mut prev_line: &str;
392    loop {
393        prev_line = capture_ws(state.line, state.column_byte)?;
394        if !prev_line.contains('\\') {
395            break;
396        }
397        advance_to_next_line(config, state)?;
398    }
399    advance_this_line(config, state, prev_line.chars().count(), prev_line.len())?;
400
401    Ok(SimpleWhitespace(
402        &config.input[start_offset..state.byte_offset],
403    ))
404}
405
406pub fn parse_parenthesizable_whitespace<'a>(
407    config: &Config<'a>,
408    state: &mut State<'a>,
409) -> Result<ParenthesizableWhitespace<'a>> {
410    if state.is_parenthesized {
411        if let Some(ws) = parse_parenthesized_whitespace(config, state)? {
412            return Ok(ParenthesizableWhitespace::ParenthesizedWhitespace(ws));
413        }
414    }
415    parse_simple_whitespace(config, state).map(ParenthesizableWhitespace::SimpleWhitespace)
416}
417
418pub fn parse_parenthesized_whitespace<'a>(
419    config: &Config<'a>,
420    state: &mut State<'a>,
421) -> Result<Option<ParenthesizedWhitespace<'a>>> {
422    if let Some(first_line) = parse_optional_trailing_whitespace(config, state)? {
423        let empty_lines = _parse_empty_lines(config, state, None)?
424            .into_iter()
425            .map(|(_, line)| line)
426            .collect();
427        let indent = parse_indent(config, state, None)?;
428        let last_line = parse_simple_whitespace(config, state)?;
429        Ok(Some(ParenthesizedWhitespace {
430            first_line,
431            empty_lines,
432            indent,
433            last_line,
434        }))
435    } else {
436        Ok(None)
437    }
438}
439
#[cfg(test)]
mod tests {
    use crate::{tokenize, Comment, Config, Result, SimpleWhitespace};

    use super::{parse_comment, parse_simple_whitespace};

    /// Lines are split on `\n`, `\r` and `\r\n` alike, each keeping its own terminator.
    #[test]
    fn config_mixed_newlines() -> Result<'static, ()> {
        let source = "'' % {\n'test1': '',\r  'test2': '',\r\n}";
        let tokens = tokenize(source)?;
        let config = Config::new(source, &tokens);

        let expected = ["'' % {\n", "'test1': '',\r", "  'test2': '',\r\n", "}"];
        assert_eq!(&config.lines, &expected);

        Ok(())
    }

    /// Parses simple whitespace from the very start of `src`.
    fn _parse_simple_whitespace(src: &str) -> Result<SimpleWhitespace> {
        let tokens = tokenize(src)?;
        let config = Config::new(src, &tokens);
        let mut state = Default::default();
        let whitespace = parse_simple_whitespace(&config, &mut state)?;
        Ok(whitespace)
    }

    /// Backslash continuations are part of the whitespace for every newline flavour.
    #[test]
    fn simple_whitespace_line_continuations() -> Result<'static, ()> {
        let cases = [
            ("  \\\n  # foo", "  \\\n  "),
            ("  \\\r  # foo", "  \\\r  "),
            ("  \\\r\n  # foo", "  \\\r\n  "),
            ("  \\\r\n\\\n  # foo", "  \\\r\n\\\n  "),
        ];

        for &(source, expected) in cases.iter() {
            assert_eq!(
                _parse_simple_whitespace(source)?,
                SimpleWhitespace(expected)
            );
        }

        Ok(())
    }

    /// Space, tab and form feed all count as simple whitespace.
    #[test]
    fn simple_whitespace_mixed() -> Result<'static, ()> {
        let parsed = _parse_simple_whitespace(" \t\x0clol")?;
        assert_eq!(parsed, SimpleWhitespace(" \t\x0c"));

        Ok(())
    }

    /// Parses a comment from the very start of `src`.
    fn _parse_comment(src: &str) -> Result<Option<Comment>> {
        let tokens = tokenize(src)?;
        let config = Config::new(src, &tokens);
        let mut state = Default::default();
        let comment = parse_comment(&config, &mut state)?;
        Ok(comment)
    }

    /// Only the first line's comment is returned, without its newline.
    #[test]
    fn single_comment() -> Result<'static, ()> {
        let parsed = _parse_comment("# foo\n# bar")?;
        assert_eq!(parsed, Some(Comment("# foo")));
        Ok(())
    }

    /// A comment may run up to the end of the input.
    #[test]
    fn comment_until_eof() -> Result<'static, ()> {
        let parsed = _parse_comment("#")?;
        assert_eq!(parsed, Some(Comment("#")));
        Ok(())
    }

    /// Input that does not start with `#` yields no comment.
    #[test]
    fn no_comment() -> Result<'static, ()> {
        for &source in ["foo", "\n"].iter() {
            assert_eq!(_parse_comment(source)?, None);
        }
        Ok(())
    }
}