rustpython_parser_vendored/source_location/
newlines.rs

1use crate::text_size::{TextLen, TextRange, TextSize};
2use memchr::{memchr2, memrchr2};
3use std::iter::FusedIterator;
4use std::ops::Deref;
5
/// Extension trait for [`str`] that provides a [`UniversalNewlineIterator`].
pub trait StrExt {
    /// Returns a [`UniversalNewlineIterator`] over the lines of `self`.
    fn universal_newlines(&self) -> UniversalNewlineIterator<'_>;
}
10
11impl StrExt for str {
12    fn universal_newlines(&self) -> UniversalNewlineIterator<'_> {
13        UniversalNewlineIterator::from(self)
14    }
15}
16
/// Like [`str::lines`], but accommodates LF, CRLF, and CR line endings,
/// the latter of which are not supported by [`str::lines`].
///
/// Unlike [`str::lines`], every yielded [`Line`] keeps its line terminator and
/// carries the offset at which it starts.
///
/// ## Examples
///
/// ```rust
/// # use rustpython_parser_vendored::text_size::TextSize;
/// # use rustpython_parser_vendored::source_location::newlines::{Line, UniversalNewlineIterator};
/// let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");
///
/// assert_eq!(lines.next_back(), Some(Line::new("bop", TextSize::from(14))));
/// assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
/// assert_eq!(lines.next_back(), Some(Line::new("baz\r", TextSize::from(10))));
/// assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
/// assert_eq!(lines.next_back(), Some(Line::new("\r\n", TextSize::from(8))));
/// assert_eq!(lines.next(), None);
/// ```
pub struct UniversalNewlineIterator<'a> {
    // Text that has not been yielded yet; shrinks from the front on `next` and
    // from the back on `next_back`.
    text: &'a str,
    // Offset of the first byte of `text`; advanced by forward iteration.
    offset: TextSize,
    // Offset just past the last byte of `text`; moved back by backward iteration.
    offset_back: TextSize,
}
39
40impl<'a> UniversalNewlineIterator<'a> {
41    pub fn with_offset(text: &'a str, offset: TextSize) -> UniversalNewlineIterator<'a> {
42        UniversalNewlineIterator {
43            text,
44            offset,
45            offset_back: offset + text.text_len(),
46        }
47    }
48
49    pub fn from(text: &'a str) -> UniversalNewlineIterator<'a> {
50        Self::with_offset(text, TextSize::default())
51    }
52}
53
54/// Finds the next newline character. Returns its position and the [`LineEnding`].
55#[inline]
56pub fn find_newline(text: &str) -> Option<(usize, LineEnding)> {
57    let bytes = text.as_bytes();
58    if let Some(position) = memchr2(b'\n', b'\r', bytes) {
59        // SAFETY: memchr guarantees to return valid positions
60        #[allow(unsafe_code)]
61        let newline_character = unsafe { *bytes.get_unchecked(position) };
62
63        let line_ending = match newline_character {
64            // Explicit branch for `\n` as this is the most likely path
65            b'\n' => LineEnding::Lf,
66            // '\r\n'
67            b'\r' if bytes.get(position.saturating_add(1)) == Some(&b'\n') => LineEnding::CrLf,
68            // '\r'
69            _ => LineEnding::Cr,
70        };
71
72        Some((position, line_ending))
73    } else {
74        None
75    }
76}
77
78impl<'a> Iterator for UniversalNewlineIterator<'a> {
79    type Item = Line<'a>;
80
81    #[inline]
82    fn next(&mut self) -> Option<Line<'a>> {
83        if self.text.is_empty() {
84            return None;
85        }
86
87        let line = if let Some((newline_position, line_ending)) = find_newline(self.text) {
88            let (text, remainder) = self.text.split_at(newline_position + line_ending.len());
89
90            let line = Line {
91                offset: self.offset,
92                text,
93            };
94
95            self.text = remainder;
96            self.offset += text.text_len();
97
98            line
99        }
100        // Last line
101        else {
102            Line {
103                offset: self.offset,
104                text: std::mem::take(&mut self.text),
105            }
106        };
107
108        Some(line)
109    }
110
111    fn last(mut self) -> Option<Self::Item> {
112        self.next_back()
113    }
114}
115
116impl DoubleEndedIterator for UniversalNewlineIterator<'_> {
117    #[inline]
118    fn next_back(&mut self) -> Option<Self::Item> {
119        if self.text.is_empty() {
120            return None;
121        }
122
123        let len = self.text.len();
124
125        // Trim any trailing newlines.
126        let haystack = match self.text.as_bytes()[len - 1] {
127            b'\n' if len > 1 && self.text.as_bytes()[len - 2] == b'\r' => &self.text[..len - 2],
128            b'\n' | b'\r' => &self.text[..len - 1],
129            _ => self.text,
130        };
131
132        // Find the end of the previous line. The previous line is the text up to, but not including
133        // the newline character.
134        let line = if let Some(line_end) = memrchr2(b'\n', b'\r', haystack.as_bytes()) {
135            // '\n' or '\r' or '\r\n'
136            let (remainder, line) = self.text.split_at(line_end + 1);
137            self.text = remainder;
138            self.offset_back -= line.text_len();
139
140            Line {
141                text: line,
142                offset: self.offset_back,
143            }
144        } else {
145            // Last line
146            let offset = self.offset_back - self.text.text_len();
147            Line {
148                text: std::mem::take(&mut self.text),
149                offset,
150            }
151        };
152
153        Some(line)
154    }
155}
156
// Both `next` and `next_back` return `None` whenever the remaining text is empty and
// never refill it, so the iterator keeps returning `None` once exhausted.
impl FusedIterator for UniversalNewlineIterator<'_> {}
158
/// Like [`UniversalNewlineIterator`], but includes a trailing newline as an empty line.
///
/// When the input ends with `\r` or `\n`, one extra empty [`Line`] positioned at the
/// end of the input is yielded after all regular lines.
pub struct NewlineWithTrailingNewline<'a> {
    // The extra empty line, present only when the input ends with a line terminator;
    // taken (and thereby cleared) once `underlying` is exhausted.
    trailing: Option<Line<'a>>,
    // Produces the regular lines of the input.
    underlying: UniversalNewlineIterator<'a>,
}
164
165impl<'a> NewlineWithTrailingNewline<'a> {
166    pub fn from(input: &'a str) -> NewlineWithTrailingNewline<'a> {
167        Self::with_offset(input, TextSize::default())
168    }
169
170    pub fn with_offset(input: &'a str, offset: TextSize) -> Self {
171        NewlineWithTrailingNewline {
172            underlying: UniversalNewlineIterator::with_offset(input, offset),
173            trailing: if input.ends_with(['\r', '\n']) {
174                Some(Line {
175                    text: "",
176                    offset: offset + input.text_len(),
177                })
178            } else {
179                None
180            },
181        }
182    }
183}
184
185impl<'a> Iterator for NewlineWithTrailingNewline<'a> {
186    type Item = Line<'a>;
187
188    #[inline]
189    fn next(&mut self) -> Option<Line<'a>> {
190        self.underlying.next().or_else(|| self.trailing.take())
191    }
192}
193
/// A single line of text together with the offset at which it starts.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct Line<'a> {
    // Full text of the line, including its terminating newline character(s) if present.
    text: &'a str,
    // Offset of the first byte of `text`.
    offset: TextSize,
}
199
200impl<'a> Line<'a> {
201    pub fn new(text: &'a str, offset: TextSize) -> Self {
202        Self { text, offset }
203    }
204
205    #[inline]
206    pub const fn start(&self) -> TextSize {
207        self.offset
208    }
209
210    /// Returns the byte offset where the line ends, including its terminating new line character.
211    #[inline]
212    pub fn full_end(&self) -> TextSize {
213        self.offset + self.full_text_len()
214    }
215
216    /// Returns the byte offset where the line ends, excluding its new line character
217    #[inline]
218    pub fn end(&self) -> TextSize {
219        self.offset + self.as_str().text_len()
220    }
221
222    /// Returns the range of the line, including its terminating new line character.
223    #[inline]
224    pub fn full_range(&self) -> TextRange {
225        TextRange::at(self.offset, self.text.text_len())
226    }
227
228    /// Returns the range of the line, excluding its terminating new line character
229    #[inline]
230    pub fn range(&self) -> TextRange {
231        TextRange::new(self.start(), self.end())
232    }
233
234    /// Returns the text of the line, excluding the terminating new line character.
235    #[inline]
236    pub fn as_str(&self) -> &'a str {
237        let mut bytes = self.text.bytes().rev();
238
239        let newline_len = match bytes.next() {
240            Some(b'\n') => {
241                if bytes.next() == Some(b'\r') {
242                    2
243                } else {
244                    1
245                }
246            }
247            Some(b'\r') => 1,
248            _ => 0,
249        };
250
251        &self.text[..self.text.len() - newline_len]
252    }
253
254    /// Returns the line's text, including the terminating new line character.
255    #[inline]
256    pub fn as_full_str(&self) -> &'a str {
257        self.text
258    }
259
260    #[inline]
261    pub fn full_text_len(&self) -> TextSize {
262        self.text.text_len()
263    }
264}
265
266impl Deref for Line<'_> {
267    type Target = str;
268
269    fn deref(&self) -> &Self::Target {
270        self.as_str()
271    }
272}
273
274impl PartialEq<&str> for Line<'_> {
275    fn eq(&self, other: &&str) -> bool {
276        self.as_str() == *other
277    }
278}
279
280impl PartialEq<Line<'_>> for &str {
281    fn eq(&self, other: &Line<'_>) -> bool {
282        *self == other.as_str()
283    }
284}
285
/// The line ending style used in Python source code.
/// See <https://docs.python.org/3/reference/lexical_analysis.html#physical-lines>
#[derive(Debug, PartialEq, Eq, Copy, Clone)]
pub enum LineEnding {
    /// Line feed, `\n`.
    Lf,
    /// Carriage return, `\r`.
    Cr,
    /// Carriage return followed by line feed, `\r\n`.
    CrLf,
}
294
295impl Default for LineEnding {
296    fn default() -> Self {
297        if cfg!(windows) {
298            LineEnding::CrLf
299        } else {
300            LineEnding::Lf
301        }
302    }
303}
304
305impl LineEnding {
306    pub const fn as_str(&self) -> &'static str {
307        match self {
308            LineEnding::Lf => "\n",
309            LineEnding::CrLf => "\r\n",
310            LineEnding::Cr => "\r",
311        }
312    }
313
314    #[allow(clippy::len_without_is_empty)]
315    pub const fn len(&self) -> usize {
316        match self {
317            LineEnding::Lf | LineEnding::Cr => 1,
318            LineEnding::CrLf => 2,
319        }
320    }
321
322    pub const fn text_len(&self) -> TextSize {
323        match self {
324            LineEnding::Lf | LineEnding::Cr => TextSize::new(1),
325            LineEnding::CrLf => TextSize::new(2),
326        }
327    }
328}
329
330impl Deref for LineEnding {
331    type Target = str;
332
333    fn deref(&self) -> &Self::Target {
334        self.as_str()
335    }
336}
337
#[cfg(test)]
mod tests {
    use super::Line;
    use super::UniversalNewlineIterator;
    use crate::text_size::TextSize;

    /// An empty input yields no lines in either direction.
    #[test]
    fn universal_newlines_empty_str() {
        let lines: Vec<_> = UniversalNewlineIterator::from("").collect();
        assert_eq!(lines, Vec::<Line>::new());

        let lines: Vec<_> = UniversalNewlineIterator::from("").rev().collect();
        assert_eq!(lines, Vec::<Line>::new());
    }

    /// Forward iteration keeps terminators and reports the start offset of every line.
    #[test]
    fn universal_newlines_forward() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop", TextSize::from(14)),
            ]
        );

        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
            ]
        );

        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop\n\n").collect();
        assert_eq!(
            lines,
            vec![
                Line::new("foo\n", TextSize::from(0)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("bop\n", TextSize::from(14)),
                Line::new("\n", TextSize::from(18)),
            ]
        );
    }

    /// Backward iteration yields the same lines as forward iteration, in reverse order.
    #[test]
    fn universal_newlines_backwards() {
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop")
            .rev()
            .collect();
        assert_eq!(
            lines,
            vec![
                Line::new("bop", TextSize::from(14)),
                Line::new("baz\r", TextSize::from(10)),
                Line::new("\r\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );

        // Compare full `Line` values so that terminators and offsets are verified. Mapping to
        // `as_str()` first would compare through the `&str == Line` impl, which strips the
        // terminator and ignores the offset.
        let lines: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\nbaz\rbop\n")
            .rev()
            .collect();

        assert_eq!(
            lines,
            vec![
                Line::new("bop\n", TextSize::from(13)),
                Line::new("baz\r", TextSize::from(9)),
                Line::new("\n", TextSize::from(8)),
                Line::new("bar\n", TextSize::from(4)),
                Line::new("foo\n", TextSize::from(0)),
            ]
        );

        // The text of each line, without its terminator.
        let texts: Vec<_> = UniversalNewlineIterator::from("foo\nbar\n\nbaz\rbop\n")
            .rev()
            .map(|line| line.as_str())
            .collect();

        assert_eq!(texts, vec!["bop", "baz", "", "bar", "foo"]);
    }

    /// Alternating between both ends never yields a line twice.
    #[test]
    fn universal_newlines_mixed() {
        let mut lines = UniversalNewlineIterator::from("foo\nbar\n\r\nbaz\rbop");

        assert_eq!(
            lines.next_back(),
            Some(Line::new("bop", TextSize::from(14)))
        );
        assert_eq!(lines.next(), Some(Line::new("foo\n", TextSize::from(0))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("baz\r", TextSize::from(10)))
        );
        assert_eq!(lines.next(), Some(Line::new("bar\n", TextSize::from(4))));
        assert_eq!(
            lines.next_back(),
            Some(Line::new("\r\n", TextSize::from(8)))
        );
        assert_eq!(lines.next(), None);
    }
}