rustpython_parser_vendored/source_location/
line_index.rs

1use super::SourceLocation;
2use crate::text_size::{TextLen, TextRange, TextSize};
3#[cfg(feature = "serde")]
4use serde::{Deserialize, Serialize};
5use std::fmt;
6use std::fmt::{Debug, Formatter};
7use std::num::NonZeroU32;
8use std::ops::Deref;
9use std::sync::Arc;
10
11/// Index for fast [byte offset](TextSize) to [`SourceLocation`] conversions.
12///
13/// Cloning a [`LineIndex`] is cheap because it only requires bumping a reference count.
14#[derive(Clone)]
15pub struct LineIndex {
16    inner: Arc<LineIndexInner>,
17}
18
19struct LineIndexInner {
20    line_starts: Vec<TextSize>,
21    kind: IndexKind,
22}
23
24impl LineIndex {
25    /// Builds the [`LineIndex`] from the source text of a file.
26    pub fn from_source_text(text: &str) -> Self {
27        let mut line_starts: Vec<TextSize> = Vec::with_capacity(text.len() / 88);
28        line_starts.push(TextSize::default());
29
30        let bytes = text.as_bytes();
31        let mut utf8 = false;
32
33        assert!(u32::try_from(bytes.len()).is_ok());
34
35        for (i, byte) in bytes.iter().enumerate() {
36            utf8 |= !byte.is_ascii();
37
38            match byte {
39                // Only track one line break for `\r\n`.
40                b'\r' if bytes.get(i + 1) == Some(&b'\n') => continue,
41                b'\n' | b'\r' => {
42                    // SAFETY: Assertion above guarantees `i <= u32::MAX`
43                    #[allow(clippy::cast_possible_truncation)]
44                    line_starts.push(TextSize::from(i as u32) + TextSize::from(1));
45                }
46                _ => {}
47            }
48        }
49
50        let kind = if utf8 {
51            IndexKind::Utf8
52        } else {
53            IndexKind::Ascii
54        };
55
56        Self {
57            inner: Arc::new(LineIndexInner { line_starts, kind }),
58        }
59    }
60
61    fn kind(&self) -> IndexKind {
62        self.inner.kind
63    }
64
65    /// Returns the row and column index for an offset.
66    ///
67    /// ## Examples
68    ///
69    /// ```
70    /// # use rustpython_parser_vendored::text_size::TextSize;
71    /// # use rustpython_parser_vendored::source_location::{LineIndex, OneIndexed, SourceLocation};
72    /// let source = "def a():\n    pass";
73    /// let index = LineIndex::from_source_text(source);
74    ///
75    /// assert_eq!(
76    ///     index.source_location(TextSize::from(0), source),
77    ///     SourceLocation { row: OneIndexed::from_zero_indexed(0), column: OneIndexed::from_zero_indexed(0) }
78    /// );
79    ///
80    /// assert_eq!(
81    ///     index.source_location(TextSize::from(4), source),
82    ///     SourceLocation { row: OneIndexed::from_zero_indexed(0), column: OneIndexed::from_zero_indexed(4) }
83    /// );
84    /// assert_eq!(
85    ///     index.source_location(TextSize::from(13), source),
86    ///     SourceLocation { row: OneIndexed::from_zero_indexed(1), column: OneIndexed::from_zero_indexed(4) }
87    /// );
88    /// ```
89    ///
90    /// ## Panics
91    ///
92    /// If the offset is out of bounds.
93    pub fn source_location(&self, offset: TextSize, content: &str) -> SourceLocation {
94        match self.binary_search_line(&offset) {
95            // Offset is at the start of a line
96            Ok(row) => SourceLocation {
97                row: OneIndexed::from_zero_indexed(row),
98                column: OneIndexed::from_zero_indexed(0),
99            },
100            Err(next_row) => {
101                // SAFETY: Safe because the index always contains an entry for the offset 0
102                let row = next_row - 1;
103                let mut line_start = self.line_starts()[row as usize];
104
105                let column = if self.kind().is_ascii() {
106                    u32::from(offset - line_start)
107                } else {
108                    // Don't count the BOM character as a column.
109                    if line_start == TextSize::from(0) && content.starts_with('\u{feff}') {
110                        line_start = '\u{feff}'.text_len();
111                    }
112
113                    let range = TextRange::new(line_start, offset);
114                    content[range].chars().count().try_into().unwrap()
115                };
116
117                SourceLocation {
118                    row: OneIndexed::from_zero_indexed(row),
119                    column: OneIndexed::from_zero_indexed(column),
120                }
121            }
122        }
123    }
124
125    /// Return the number of lines in the source code.
126    pub(crate) fn line_count(&self) -> usize {
127        self.line_starts().len()
128    }
129
130    /// Returns the row number for a given offset.
131    ///
132    /// ## Examples
133    ///
134    /// ```
135    /// # use rustpython_parser_vendored::text_size::TextSize;
136    /// # use rustpython_parser_vendored::source_location::{LineIndex, OneIndexed, SourceLocation};
137    /// let source = "def a():\n    pass";
138    /// let index = LineIndex::from_source_text(source);
139    ///
140    /// assert_eq!(index.line_index(TextSize::from(0)), OneIndexed::from_zero_indexed(0));
141    /// assert_eq!(index.line_index(TextSize::from(4)), OneIndexed::from_zero_indexed(0));
142    /// assert_eq!(index.line_index(TextSize::from(13)), OneIndexed::from_zero_indexed(1));
143    /// ```
144    ///
145    /// ## Panics
146    ///
147    /// If the offset is out of bounds.
148    pub fn line_index(&self, offset: TextSize) -> OneIndexed {
149        match self.binary_search_line(&offset) {
150            // Offset is at the start of a line
151            Ok(row) => OneIndexed::from_zero_indexed(row),
152            Err(row) => {
153                // SAFETY: Safe because the index always contains an entry for the offset 0
154                OneIndexed::from_zero_indexed(row - 1)
155            }
156        }
157    }
158
159    /// Returns the [byte offset](TextSize) for the `line` with the given index.
160    pub(crate) fn line_start(&self, line: OneIndexed, contents: &str) -> TextSize {
161        let row_index = line.to_zero_indexed_usize();
162        let starts = self.line_starts();
163
164        // If start-of-line position after last line
165        if row_index == starts.len() {
166            contents.text_len()
167        } else {
168            starts[row_index]
169        }
170    }
171
172    /// Returns the [byte offset](TextSize) of the `line`'s end.
173    /// The offset is the end of the line, up to and including the newline character ending the line (if any).
174    pub(crate) fn line_end(&self, line: OneIndexed, contents: &str) -> TextSize {
175        let row_index = line.to_zero_indexed_usize();
176        let starts = self.line_starts();
177
178        // If start-of-line position after last line
179        if row_index.saturating_add(1) >= starts.len() {
180            contents.text_len()
181        } else {
182            starts[row_index + 1]
183        }
184    }
185
186    /// Returns the [`TextRange`] of the `line` with the given index.
187    /// The start points to the first character's [byte offset](TextSize), the end up to, and including
188    /// the newline character ending the line (if any).
189    pub(crate) fn line_range(&self, line: OneIndexed, contents: &str) -> TextRange {
190        let starts = self.line_starts();
191
192        if starts.len() == line.to_zero_indexed_usize() {
193            TextRange::empty(contents.text_len())
194        } else {
195            TextRange::new(
196                self.line_start(line, contents),
197                self.line_start(line.saturating_add(1), contents),
198            )
199        }
200    }
201
202    /// Returns the [byte offsets](TextSize) for every line
203    pub fn line_starts(&self) -> &[TextSize] {
204        &self.inner.line_starts
205    }
206
207    #[allow(clippy::trivially_copy_pass_by_ref)] // to keep same interface as `[T]::binary_search`
208    fn binary_search_line(&self, offset: &TextSize) -> Result<u32, u32> {
209        // `try_into()` always success as long as TextSize is u32
210        match self.line_starts().binary_search(offset) {
211            Ok(index) => Ok(index.try_into().unwrap()),
212            Err(index) => Err(index.try_into().unwrap()),
213        }
214    }
215}
216
217impl Deref for LineIndex {
218    type Target = [TextSize];
219
220    fn deref(&self) -> &Self::Target {
221        self.line_starts()
222    }
223}
224
225impl Debug for LineIndex {
226    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
227        f.debug_list().entries(self.line_starts()).finish()
228    }
229}
230
231#[derive(Debug, Clone, Copy)]
232enum IndexKind {
233    /// Optimized index for an ASCII only document
234    Ascii,
235
236    /// Index for UTF8 documents
237    Utf8,
238}
239
240impl IndexKind {
241    const fn is_ascii(self) -> bool {
242        matches!(self, IndexKind::Ascii)
243    }
244}
245
246/// Type-safe wrapper for a value whose logical range starts at `1`, for
247/// instance the line or column numbers in a file
248///
249/// Internally this is represented as a [`NonZeroU32`], this enables some
250/// memory optimizations
251#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
252#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
253pub struct OneIndexed(NonZeroU32);
254
255#[allow(clippy::cast_possible_truncation)] // manually checked
256const fn try_to_u32(value: usize) -> Result<u32, usize> {
257    if value <= u32::MAX as usize {
258        Ok(value as u32)
259    } else {
260        Err(value)
261    }
262}
263
264impl OneIndexed {
265    // SAFETY: These constants are being initialized with non-zero values
266    /// The smallest value that can be represented by this integer type.
267    pub const MIN: Self = unwrap(Self::new(1));
268    /// The largest value that can be represented by this integer type
269    pub const MAX: Self = unwrap(Self::new(u32::MAX));
270
271    const ONE: NonZeroU32 = unwrap(NonZeroU32::new(1));
272
273    /// Creates a non-zero if the given value is not zero.
274    pub const fn new(value: u32) -> Option<Self> {
275        match NonZeroU32::new(value) {
276            Some(value) => Some(Self(value)),
277            None => None,
278        }
279    }
280
281    /// Construct a new [`OneIndexed`] from a zero-indexed value
282    pub const fn from_zero_indexed(value: u32) -> Self {
283        Self(Self::ONE.saturating_add(value))
284    }
285
286    /// Construct a new [`OneIndexed`] from a zero-indexed usize value
287    pub const fn try_from_zero_indexed(value: usize) -> Result<Self, usize> {
288        match try_to_u32(value) {
289            Ok(value) => Ok(Self(Self::ONE.saturating_add(value))),
290            Err(value) => Err(value),
291        }
292    }
293
294    /// Returns the value as a primitive type.
295    pub const fn get(self) -> u32 {
296        self.0.get()
297    }
298
299    /// Return the usize value for this [`OneIndexed`]
300    pub const fn to_usize(self) -> usize {
301        self.get() as _
302    }
303
304    /// Return the zero-indexed primitive value for this [`OneIndexed`]
305    pub const fn to_zero_indexed(self) -> u32 {
306        self.0.get() - 1
307    }
308
309    /// Return the zero-indexed usize value for this [`OneIndexed`]
310    pub const fn to_zero_indexed_usize(self) -> usize {
311        self.to_zero_indexed() as _
312    }
313
314    /// Saturating integer addition. Computes `self + rhs`, saturating at
315    /// the numeric bounds instead of overflowing.
316    #[must_use]
317    pub const fn saturating_add(self, rhs: u32) -> Self {
318        match NonZeroU32::new(self.0.get().saturating_add(rhs)) {
319            Some(value) => Self(value),
320            None => Self::MAX,
321        }
322    }
323
324    /// Saturating integer subtraction. Computes `self - rhs`, saturating
325    /// at the numeric bounds instead of overflowing.
326    #[must_use]
327    pub const fn saturating_sub(self, rhs: u32) -> Self {
328        match NonZeroU32::new(self.0.get().saturating_sub(rhs)) {
329            Some(value) => Self(value),
330            None => Self::MIN,
331        }
332    }
333}
334
335impl std::fmt::Display for OneIndexed {
336    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
337        std::fmt::Debug::fmt(&self.0.get(), f)
338    }
339}
340
341/// A const `Option::unwrap` without nightly features:
342/// [Tracking issue](https://github.com/rust-lang/rust/issues/67441)
343const fn unwrap<T: Copy>(option: Option<T>) -> T {
344    match option {
345        Some(value) => value,
346        None => panic!("unwrapping None"),
347    }
348}
349
350#[cfg(test)]
351mod tests {
352    use crate::source_location::line_index::LineIndex;
353    use crate::source_location::{OneIndexed, SourceLocation};
354    use crate::text_size::TextSize;
355
356    #[test]
357    fn ascii_index() {
358        let index = LineIndex::from_source_text("");
359        assert_eq!(index.line_starts(), &[TextSize::from(0)]);
360
361        let index = LineIndex::from_source_text("x = 1");
362        assert_eq!(index.line_starts(), &[TextSize::from(0)]);
363
364        let index = LineIndex::from_source_text("x = 1\n");
365        assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(6)]);
366
367        let index = LineIndex::from_source_text("x = 1\ny = 2\nz = x + y\n");
368        assert_eq!(
369            index.line_starts(),
370            &[
371                TextSize::from(0),
372                TextSize::from(6),
373                TextSize::from(12),
374                TextSize::from(22)
375            ]
376        );
377    }
378
379    #[test]
380    fn ascii_source_location() {
381        let contents = "x = 1\ny = 2";
382        let index = LineIndex::from_source_text(contents);
383
384        // First row.
385        let loc = index.source_location(TextSize::from(2), contents);
386        assert_eq!(
387            loc,
388            SourceLocation {
389                row: OneIndexed::from_zero_indexed(0),
390                column: OneIndexed::from_zero_indexed(2)
391            }
392        );
393
394        // Second row.
395        let loc = index.source_location(TextSize::from(6), contents);
396        assert_eq!(
397            loc,
398            SourceLocation {
399                row: OneIndexed::from_zero_indexed(1),
400                column: OneIndexed::from_zero_indexed(0)
401            }
402        );
403
404        let loc = index.source_location(TextSize::from(11), contents);
405        assert_eq!(
406            loc,
407            SourceLocation {
408                row: OneIndexed::from_zero_indexed(1),
409                column: OneIndexed::from_zero_indexed(5)
410            }
411        );
412    }
413
414    #[test]
415    fn ascii_carriage_return() {
416        let contents = "x = 4\ry = 3";
417        let index = LineIndex::from_source_text(contents);
418        assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(6)]);
419
420        assert_eq!(
421            index.source_location(TextSize::from(4), contents),
422            SourceLocation {
423                row: OneIndexed::from_zero_indexed(0),
424                column: OneIndexed::from_zero_indexed(4)
425            }
426        );
427        assert_eq!(
428            index.source_location(TextSize::from(6), contents),
429            SourceLocation {
430                row: OneIndexed::from_zero_indexed(1),
431                column: OneIndexed::from_zero_indexed(0)
432            }
433        );
434        assert_eq!(
435            index.source_location(TextSize::from(7), contents),
436            SourceLocation {
437                row: OneIndexed::from_zero_indexed(1),
438                column: OneIndexed::from_zero_indexed(1)
439            }
440        );
441    }
442
443    #[test]
444    fn ascii_carriage_return_newline() {
445        let contents = "x = 4\r\ny = 3";
446        let index = LineIndex::from_source_text(contents);
447        assert_eq!(index.line_starts(), &[TextSize::from(0), TextSize::from(7)]);
448
449        assert_eq!(
450            index.source_location(TextSize::from(4), contents),
451            SourceLocation {
452                row: OneIndexed::from_zero_indexed(0),
453                column: OneIndexed::from_zero_indexed(4)
454            }
455        );
456        assert_eq!(
457            index.source_location(TextSize::from(7), contents),
458            SourceLocation {
459                row: OneIndexed::from_zero_indexed(1),
460                column: OneIndexed::from_zero_indexed(0)
461            }
462        );
463        assert_eq!(
464            index.source_location(TextSize::from(8), contents),
465            SourceLocation {
466                row: OneIndexed::from_zero_indexed(1),
467                column: OneIndexed::from_zero_indexed(1)
468            }
469        );
470    }
471
472    #[test]
473    fn utf8_index() {
474        let index = LineIndex::from_source_text("x = '🫣'");
475        assert_eq!(index.line_count(), 1);
476        assert_eq!(index.line_starts(), &[TextSize::from(0)]);
477
478        let index = LineIndex::from_source_text("x = '🫣'\n");
479        assert_eq!(index.line_count(), 2);
480        assert_eq!(
481            index.line_starts(),
482            &[TextSize::from(0), TextSize::from(11)]
483        );
484
485        let index = LineIndex::from_source_text("x = '🫣'\ny = 2\nz = x + y\n");
486        assert_eq!(index.line_count(), 4);
487        assert_eq!(
488            index.line_starts(),
489            &[
490                TextSize::from(0),
491                TextSize::from(11),
492                TextSize::from(17),
493                TextSize::from(27)
494            ]
495        );
496
497        let index = LineIndex::from_source_text("# 🫣\nclass Foo:\n    \"\"\".\"\"\"");
498        assert_eq!(index.line_count(), 3);
499        assert_eq!(
500            index.line_starts(),
501            &[TextSize::from(0), TextSize::from(7), TextSize::from(18)]
502        );
503    }
504
505    #[test]
506    fn utf8_carriage_return() {
507        let contents = "x = '🫣'\ry = 3";
508        let index = LineIndex::from_source_text(contents);
509        assert_eq!(index.line_count(), 2);
510        assert_eq!(
511            index.line_starts(),
512            &[TextSize::from(0), TextSize::from(11)]
513        );
514
515        // Second '
516        assert_eq!(
517            index.source_location(TextSize::from(9), contents),
518            SourceLocation {
519                row: OneIndexed::from_zero_indexed(0),
520                column: OneIndexed::from_zero_indexed(6)
521            }
522        );
523        assert_eq!(
524            index.source_location(TextSize::from(11), contents),
525            SourceLocation {
526                row: OneIndexed::from_zero_indexed(1),
527                column: OneIndexed::from_zero_indexed(0)
528            }
529        );
530        assert_eq!(
531            index.source_location(TextSize::from(12), contents),
532            SourceLocation {
533                row: OneIndexed::from_zero_indexed(1),
534                column: OneIndexed::from_zero_indexed(1)
535            }
536        );
537    }
538
539    #[test]
540    fn utf8_carriage_return_newline() {
541        let contents = "x = '🫣'\r\ny = 3";
542        let index = LineIndex::from_source_text(contents);
543        assert_eq!(index.line_count(), 2);
544        assert_eq!(
545            index.line_starts(),
546            &[TextSize::from(0), TextSize::from(12)]
547        );
548
549        // Second '
550        assert_eq!(
551            index.source_location(TextSize::from(9), contents),
552            SourceLocation {
553                row: OneIndexed::from_zero_indexed(0),
554                column: OneIndexed::from_zero_indexed(6)
555            }
556        );
557        assert_eq!(
558            index.source_location(TextSize::from(12), contents),
559            SourceLocation {
560                row: OneIndexed::from_zero_indexed(1),
561                column: OneIndexed::from_zero_indexed(0)
562            }
563        );
564        assert_eq!(
565            index.source_location(TextSize::from(13), contents),
566            SourceLocation {
567                row: OneIndexed::from_zero_indexed(1),
568                column: OneIndexed::from_zero_indexed(1)
569            }
570        );
571    }
572
573    #[test]
574    fn utf8_byte_offset() {
575        let contents = "x = '☃'\ny = 2";
576        let index = LineIndex::from_source_text(contents);
577        assert_eq!(
578            index.line_starts(),
579            &[TextSize::from(0), TextSize::from(10)]
580        );
581
582        // First row.
583        let loc = index.source_location(TextSize::from(0), contents);
584        assert_eq!(
585            loc,
586            SourceLocation {
587                row: OneIndexed::from_zero_indexed(0),
588                column: OneIndexed::from_zero_indexed(0)
589            }
590        );
591
592        let loc = index.source_location(TextSize::from(5), contents);
593        assert_eq!(
594            loc,
595            SourceLocation {
596                row: OneIndexed::from_zero_indexed(0),
597                column: OneIndexed::from_zero_indexed(5)
598            }
599        );
600
601        let loc = index.source_location(TextSize::from(8), contents);
602        assert_eq!(
603            loc,
604            SourceLocation {
605                row: OneIndexed::from_zero_indexed(0),
606                column: OneIndexed::from_zero_indexed(6)
607            }
608        );
609
610        // Second row.
611        let loc = index.source_location(TextSize::from(10), contents);
612        assert_eq!(
613            loc,
614            SourceLocation {
615                row: OneIndexed::from_zero_indexed(1),
616                column: OneIndexed::from_zero_indexed(0)
617            }
618        );
619
620        // One-past-the-end.
621        let loc = index.source_location(TextSize::from(15), contents);
622        assert_eq!(
623            loc,
624            SourceLocation {
625                row: OneIndexed::from_zero_indexed(1),
626                column: OneIndexed::from_zero_indexed(5)
627            }
628        );
629    }
630}