Skip to main content

lintspec_core/
textindex.rs

//! A text index type
//!
//! The [`TextIndex`] type holds the line, column (both zero-indexed) and utf-8/utf-16 offsets for a given position
//! in the text.
5use std::{fmt, ops::Range};
6
7use derive_more::Add;
8use serde::Serialize;
9use wide::{CmpEq as _, CmpLt as _, i8x32};
10use zerocopy::transmute_ref;
11
/// Number of SIMD lanes processed per chunk (32 bytes for `i8x32`)
const SIMD_LANES: usize = i8x32::LANES as usize;

/// A span of source code
pub type TextRange = Range<TextIndex>;
16
/// A position inside of the source code
///
/// Lines and columns start at 0.
#[derive(Default, Hash, Copy, Clone, PartialEq, Eq, Debug, Serialize, Add)]
pub struct TextIndex {
    // byte offset into the utf-8 encoded source
    pub utf8: usize,
    // code-unit offset into the utf-16 encoding of the source
    pub utf16: usize,
    // zero-indexed line number
    pub line: usize,
    // zero-indexed column, incremented once per `char` (see `TextIndex::advance`)
    pub column: usize,
}
27
28impl TextIndex {
29    /// Shorthand for `TextIndex { utf8: 0, utf16: 0, line: 0, char: 0 }`.
30    pub const ZERO: TextIndex = TextIndex {
31        utf8: 0,
32        utf16: 0,
33        line: 0,
34        column: 0,
35    };
36
37    /// Advance the index, accounting for lf/nl/ls/ps characters and combinations.
38    ///
39    /// This is *not* derived from the definition of 'newline' in the language definition,
40    /// nor is it a complete implementation of the Unicode line breaking algorithm.
41    ///
42    /// Implementation inspired by [`slang_solidity`](https://crates.io/crates/slang_solidity).
43    #[inline]
44    pub fn advance(&mut self, c: char, next: Option<&char>) {
45        // fast path for ASCII characters
46        if c.is_ascii() {
47            self.utf8 += 1;
48            self.utf16 += 1;
49            match (c, next) {
50                ('\r', Some(&'\n')) => {
51                    // ignore for now, we will increment the line number when we process the \n
52                }
53                ('\n' | '\r', _) => {
54                    self.line += 1;
55                    self.column = 0;
56                }
57                _ => {
58                    self.column += 1;
59                }
60            }
61        } else {
62            // slow path for Unicode
63            self.utf8 += c.len_utf8();
64            self.utf16 += c.len_utf16();
65            match c {
66                '\u{2028}' | '\u{2029}' => {
67                    self.line += 1;
68                    self.column = 0;
69                }
70                _ => {
71                    self.column += 1;
72                }
73            }
74        }
75    }
76
77    /// Advance the `TextIndex` knowing the char `c` is non-ASCII
78    #[inline]
79    fn advance_unicode(&mut self, c: char) {
80        debug_assert!(!c.is_ascii());
81        self.utf8 += c.len_utf8();
82        self.utf16 += c.len_utf16();
83        match c {
84            '\u{2028}' | '\u{2029}' => {
85                self.line += 1;
86                self.column = 0;
87            }
88            _ => {
89                self.column += 1;
90            }
91        }
92    }
93
94    /// Advance this index according to the `Advance` parameter.
95    #[inline]
96    fn advance_by(&mut self, advance: &Advance) {
97        self.utf8 += advance.bytes;
98        self.utf16 += advance.bytes;
99        self.line += advance.lines;
100        match advance.column {
101            Column::Increment(n) => self.column += n,
102            Column::Set(n) => self.column = n,
103        }
104    }
105}
106
107impl fmt::Display for TextIndex {
108    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
109        write!(f, "{}:{}", self.line + 1, self.column + 1)
110    }
111}
112
113impl PartialOrd for TextIndex {
114    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
115        Some(self.cmp(other))
116    }
117}
118
119impl Ord for TextIndex {
120    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
121        self.utf8.cmp(&other.utf8)
122    }
123}
124
/// The type of operation to perform on the `TextIndex`'s `column` field
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Column {
    // add the given amount to the current column (no line break was crossed)
    Increment(usize),
    // reset the column to the given value (a line break was crossed)
    Set(usize),
}
131
/// An update to perform on `TextIndex` after scanning a chunk of the input text
#[derive(Debug, Clone, PartialEq, Eq)]
struct Advance {
    // number of (ASCII) bytes consumed; applied to both the utf-8 and utf-16 offsets
    bytes: usize,
    // number of line breaks crossed
    lines: usize,
    // how to update the column counter
    column: Column,
}
139
140impl Advance {
141    /// Scan a chunk of text and compute how to advance the `TextIndex`
142    ///
143    /// The return value calculates how much the index can be advanced, until either a non-ASCII character is
144    /// encountered, or the next offset of interest is reached.
145    #[inline]
146    #[must_use]
147    fn scan(slice: &[i8], start: usize, next_offset: usize) -> Self {
148        let bytes = &slice[start..next_offset];
149        let arr: [i8; SIMD_LANES] = bytes.first_chunk().copied().unwrap_or_else(|| {
150            // if we have fewer than the required bytes, we pad with `-1` which corresponds to a non-ASCII character
151            let mut arr = [-1; SIMD_LANES];
152            arr[0..bytes.len()].copy_from_slice(bytes);
153            arr
154        });
155        Self::from(arr)
156    }
157}
158
impl From<[i8; SIMD_LANES]> for Advance {
    /// Scan a chunk of text and compute how to advance the `TextIndex`
    ///
    /// The return value calculates how much the index can be advanced, until either a non-ASCII character is
    /// encountered, or the next offset of interest is reached.
    ///
    /// Simplified example with a 16 bytes chunk:
    ///
    /// ```norust
    /// input:          "ab\r\nefg\rijkl🦀"
    /// non-ASCII mask: 0000_0000_0000_1111
    /// LF mask:        0001_0000_0000_0000
    /// CR mask:        0010_0001_0000_0000
    /// to keep:        ^^^^ ^^^^ ^^^^
    /// ```
    ///
    /// We will process an amount of bytes equal to `nonascii_mask.trailing_zeros()` (a contiguous segment of ASCII
    /// bytes at the start of the chunk). This is the increment that will be applied to the `utf8` and `utf16` fields
    /// of `TextIndex` (the bytes/code units offset).
    ///
    /// First we ignore everything starting at the first non-ASCII byte by shifting the masks:
    ///
    /// ```norust
    /// padding:        vvvv
    /// non-ASCII mask: 0000_0000_0000_0000
    /// LF mask:        0000_0001_0000_0000
    /// CR mask:        0000_0010_0001_0000
    /// interesting:         ^^^^ ^^^^ ^^^^
    /// to keep:                  ^^^^ ^^^^
    /// ```
    ///
    /// The number of ASCII bytes on the last line is `lf_mask.leading_zeros()`. The total number of line returns is
    /// `lf_mask.count_ones()`. We will increment the `line` field of `TextIndex` by this number.
    ///
    /// Then we ignore everything but the last line (what comes after the last LF) by shifting the masks in the other
    /// direction:
    ///
    /// ```norust
    /// padding:                  vvvv vvvv
    /// non-ASCII mask: 0000_0000_0000_0000
    /// LF mask:        0000_0000_0000_0000
    /// CR mask:        0001_0000_0000_0000
    /// interesting:    ^^^^ ^^^^
    /// ```
    ///
    /// Finally, we subtract the number of `\r` bytes from the last line (`cr_mask.count_ones()`, which do not
    /// increment the column count) from the number of bytes on the last line which we calculated before. This number
    /// is the new value of the `column` field of `TextIndex`.
    #[inline]
    fn from(chunk: [i8; SIMD_LANES]) -> Self {
        let bytes = i8x32::new(chunk);
        // ASCII bytes are 0..=127, so any negative `i8` lane marks a non-ASCII byte
        let nonascii_mask = bytes.simd_lt(i8x32::ZERO).to_bitmask();
        #[expect(clippy::cast_possible_wrap)]
        let lf_bytes = i8x32::splat(b'\n' as i8);
        let mut lf_mask = bytes.simd_eq(lf_bytes).to_bitmask();
        #[expect(clippy::cast_possible_wrap)]
        let cr_bytes = i8x32::splat(b'\r' as i8);
        let mut cr_mask = bytes.simd_eq(cr_bytes).to_bitmask();

        // number of contiguous ASCII bytes at the start of the chunk; everything from
        // the first non-ASCII byte onward is ignored
        let n_ascii = nonascii_mask.trailing_zeros() as usize;
        if n_ascii == 0 {
            // there are no ASCII bytes at the start of the chunk
            return Advance {
                bytes: 0,
                column: Column::Increment(0),
                lines: 0,
            };
        }
        let shift = SIMD_LANES - n_ascii; // this is < SIMD_LANES
        lf_mask <<= shift;
        cr_mask <<= shift;

        let mut n_lines = 0;
        let column = if lf_mask > 0 {
            // the chunk contains multiple lines, we ignore everything but the last line
            n_lines = lf_mask.count_ones() as usize;
            let n_last_line = lf_mask.leading_zeros() as usize;
            if n_last_line == 0 {
                // edge case where the last byte is \n
                return Advance {
                    bytes: n_ascii,
                    column: Column::Set(0),
                    lines: n_lines,
                };
            }
            // we ignore the \r in the last line for the columns count
            cr_mask >>= SIMD_LANES - n_last_line; // the shift amount is < SIMD_LANES
            Column::Set(n_last_line - cr_mask.count_ones() as usize)
        } else {
            // single line: every ASCII byte except \r advances the column
            Column::Increment(n_ascii - cr_mask.count_ones() as usize)
        };
        Advance {
            bytes: n_ascii,
            column,
            lines: n_lines,
        }
    }
}
258
/// Compute the [`TextIndex`] list corresponding to the byte offsets in the given source.
///
/// The list of offsets _MUST_ be sorted, but it can contain duplicates. The source string slice _MUST NOT_ be empty.
///
/// This routine iterates through the characters and advances a running [`TextIndex`], storing a copy in the output
/// if it matches a desired offset.
///
/// SIMD is used to accelerate processing of ASCII-only sections in the source.
pub fn compute_indices(source: &str, offsets: &[usize]) -> Vec<TextIndex> {
    assert!(!source.is_empty(), "source cannot be empty");
    let mut text_indices = Vec::with_capacity(offsets.len()); // upper bound for the size
    let mut current = TextIndex::ZERO;

    let mut ofs_iter = offsets.iter();
    let Some(mut next_offset) = ofs_iter.next() else {
        // no offsets requested, nothing to compute
        return text_indices;
    };
    // need to cast the bytes to `i8` to work with SIMD instructions from `wide`.
    let bytes: &[i8] = transmute_ref!(source.as_bytes());
    // NOTE(review): offsets appear to be assumed <= source.len(); an out-of-range offset
    // that is not the last one to be resolved would panic when slicing in
    // `Advance::scan` -- confirm with callers
    'outer: loop {
        // process ASCII chunks with SIMD
        loop {
            // scan at most one SIMD chunk, stopping early at `next_offset` or at the
            // first non-ASCII byte (the scan advances 0 bytes in the latter case)
            let advance = Advance::scan(bytes, current.utf8, *next_offset);
            current.advance_by(&advance);
            if &current.utf8 == next_offset {
                // we reached a target position, store it
                text_indices.push(current);
                // skip duplicates and advance to next offset
                next_offset = match ofs_iter.find(|o| o != &next_offset) {
                    Some(o) => o,
                    None => break 'outer, // all interesting offsets have been found
                };
            }
            if bytes[current.utf8] < 0 {
                // a non-ASCII character was found, let's go to char-by-char processing
                break;
            }
        }
        // fall back to char-by-char processing for unicode
        let remaining_source = &source[current.utf8..];
        let mut char_iter = remaining_source.chars().peekable();
        while let Some(c) = char_iter.next() {
            // every char processed here is non-ASCII: the first one triggered the break
            // above, and the loop exits as soon as the next char is ASCII
            debug_assert!(
                next_offset >= &current.utf8,
                "next offset {next_offset} is smaller than current {}",
                current.utf8
            );
            current.advance_unicode(c);
            if &current.utf8 >= next_offset {
                // we reached a target position, store it
                // for offsets that fall in the middle of a unicode character, we store the next valid position
                text_indices.push(current);
                // skip duplicates and advance to next offset
                next_offset = match ofs_iter.find(|o| o > &&current.utf8) {
                    Some(o) => o,
                    None => break 'outer, // all interesting offsets have been found
                };
            }
            if char_iter.peek().is_some_and(char::is_ascii) {
                // we're done processing the non-ASCII characters, let's go back to SIMD-optimized processing
                break;
            }
        }
        if current.utf8 >= bytes.len() - 1 {
            break; // done with the input
        }
    }
    text_indices
}
328
#[cfg(test)]
#[expect(clippy::cast_possible_wrap)]
mod tests {
    use similar_asserts::assert_eq;

    use super::*;

    /// A full chunk of plain ASCII advances bytes and column by the full lane count.
    #[test]
    fn test_advance_simple() {
        let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
            .iter()
            .map(|b| *b as i8)
            .collect();
        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
        let advance = Advance::from(chunk);
        assert_eq!(
            advance,
            Advance {
                bytes: 32,
                lines: 0,
                column: Column::Increment(32)
            }
        );
    }

    /// A single `\n` mid-chunk crosses one line and resets the column to the
    /// length of the trailing segment.
    #[test]
    fn test_advance_newline() {
        let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabc\nabcd"
            .iter()
            .map(|b| *b as i8)
            .collect();
        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
        let advance = Advance::from(chunk);
        assert_eq!(
            advance,
            Advance {
                bytes: 32,
                lines: 1,
                column: Column::Set(4)
            }
        );
    }

    /// `\r\n` counts as one line break and a trailing `\r` does not advance the column.
    #[test]
    fn test_advance_multiple_newlines() {
        let chunk: Vec<_> = b"abcdabcdabc\nabcdabcdabcdab\r\nabc\r"
            .iter()
            .map(|b| *b as i8)
            .collect();
        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
        let advance = Advance::from(chunk);
        assert_eq!(
            advance,
            Advance {
                bytes: 32,
                lines: 2,
                column: Column::Set(3)
            }
        );
    }

    /// The scan stops at the first non-ASCII byte (the 4-byte crab emoji).
    #[test]
    fn test_advance_unicode() {
        let chunk: Vec<_> = "abcdabcdabcdabcdabcdabcdabcd🦀"
            .bytes()
            .map(|b| b as i8)
            .collect();
        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
        let advance = Advance::from(chunk);
        assert_eq!(
            advance,
            Advance {
                bytes: 28,
                lines: 0,
                column: Column::Increment(28)
            }
        );
    }

    /// Line breaks before a non-ASCII byte are still counted.
    #[test]
    fn test_advance_unicode_newlines() {
        let chunk: Vec<_> = "abcdabcdabc\nabcdabcdabc\nabcd🦀"
            .bytes()
            .map(|b| b as i8)
            .collect();
        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
        let advance = Advance::from(chunk);
        assert_eq!(
            advance,
            Advance {
                bytes: 28,
                lines: 2,
                column: Column::Set(4)
            }
        );
    }

    /// `scan` stops exactly at the requested next offset within an ASCII chunk.
    #[test]
    fn test_advance_next_offset() {
        let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
            .iter()
            .map(|b| *b as i8)
            .collect();
        let advance = Advance::scan(chunk.as_slice(), 0, 28);
        assert_eq!(
            advance,
            Advance {
                bytes: 28,
                lines: 0,
                column: Column::Set(0)
            }
        );
    }

    /// `scan` with a next offset right after a `\n` yields a column reset to 0.
    #[test]
    fn test_advance_next_offset_newline() {
        let chunk: Vec<_> = b"abcdabcdabcdabc\nabcdabcdabc\nabcd"
            .iter()
            .map(|b| *b as i8)
            .collect();
        let advance = Advance::scan(chunk.as_slice(), 0, 28);
        assert_eq!(
            advance,
            Advance {
                bytes: 28,
                lines: 2,
                column: Column::Set(0)
            }
        );
    }

    /// Pure-ASCII single-line source: utf8, utf16 and column all equal the offset.
    #[test]
    fn test_compute_indices_simple() {
        let source = "hello world";
        let offsets = vec![0, 5, 6, 10]; // h, space, w, d
        let result = compute_indices(source, &offsets);

        assert_eq!(result.len(), 4);
        assert_eq!(result[0], TextIndex::ZERO);
        assert_eq!(
            result[1],
            TextIndex {
                utf8: 5,
                utf16: 5,
                line: 0,
                column: 5
            }
        );
        assert_eq!(
            result[2],
            TextIndex {
                utf8: 6,
                utf16: 6,
                line: 0,
                column: 6
            }
        );
        assert_eq!(
            result[3],
            TextIndex {
                utf8: 10,
                utf16: 10,
                line: 0,
                column: 10
            }
        );
    }

    /// Line and column bookkeeping across `\n`-separated lines.
    #[test]
    fn test_compute_indices_with_newlines() {
        let source = "hello\nworld\ntest";
        let offsets = vec![0, 5, 6, 12, 15]; // h, nl, w, t, t
        let result = compute_indices(source, &offsets);

        assert_eq!(result.len(), 5);
        assert_eq!(
            result[0],
            TextIndex {
                utf8: 0,
                utf16: 0,
                line: 0,
                column: 0
            }
        );
        assert_eq!(
            result[1],
            TextIndex {
                utf8: 5,
                utf16: 5,
                line: 0,
                column: 5
            }
        );
        assert_eq!(
            result[2],
            TextIndex {
                utf8: 6,
                utf16: 6,
                line: 1,
                column: 0
            }
        );
        assert_eq!(
            result[3],
            TextIndex {
                utf8: 12,
                utf16: 12,
                line: 2,
                column: 0
            }
        );
        assert_eq!(
            result[4],
            TextIndex {
                utf8: 15,
                utf16: 15,
                line: 2,
                column: 3
            }
        );
    }

    /// The crab emoji is 4 utf-8 bytes but 2 utf-16 code units and 1 column.
    #[test]
    fn test_compute_indices_with_unicode() {
        let source = "hel🦀lo";
        let offsets = vec![0, 3, 7]; // h, crab, l
        let result = compute_indices(source, &offsets);

        assert_eq!(
            result[0],
            TextIndex {
                utf8: 0,
                utf16: 0,
                line: 0,
                column: 0
            }
        );
        assert_eq!(
            result[1],
            TextIndex {
                utf8: 3,
                utf16: 3,
                line: 0,
                column: 3
            }
        );
        assert_eq!(
            result[2],
            TextIndex {
                utf8: 7,
                utf16: 5,
                line: 0,
                column: 4
            }
        );
    }

    /// `\r\n` counts as one line break; the `\r` itself does not advance the column.
    #[test]
    fn test_compute_indices_with_carriage_return() {
        let source = "padding_hello\r\nworld_padding_padding";
        let offsets = vec![8, 13, 14, 16, 36]; // h, \r, \n, o, g (last)
        let result = compute_indices(source, &offsets);

        assert_eq!(
            result[0],
            TextIndex {
                utf8: 8,
                utf16: 8,
                line: 0,
                column: 8
            }
        );
        assert_eq!(
            result[1],
            TextIndex {
                utf8: 13,
                utf16: 13,
                line: 0,
                column: 13
            }
        );
        assert_eq!(
            result[2],
            TextIndex {
                utf8: 14,
                utf16: 14,
                line: 0,
                column: 13 // \r doesn't advance
            }
        );
        assert_eq!(
            result[3],
            TextIndex {
                utf8: 16,
                utf16: 16,
                line: 1,
                column: 1
            }
        );
        assert_eq!(
            result[4],
            TextIndex {
                utf8: 36,
                utf16: 36,
                line: 1,
                column: 21
            }
        );
    }

    /// Duplicate offsets produce a single entry each.
    #[test]
    fn test_compute_indices_duplicate_offsets() {
        let source = "hello";
        let offsets = vec![0, 0, 2, 2, 4];
        let result = compute_indices(source, &offsets);

        assert_eq!(result.len(), 3); // duplicates should be handled
        assert_eq!(
            result[0],
            TextIndex {
                utf8: 0,
                utf16: 0,
                line: 0,
                column: 0
            }
        );
        assert_eq!(
            result[1],
            TextIndex {
                utf8: 2,
                utf16: 2,
                line: 0,
                column: 2
            }
        );
        assert_eq!(
            result[2],
            TextIndex {
                utf8: 4,
                utf16: 4,
                line: 0,
                column: 4
            }
        );
    }

    /// LS/PS separators break lines and occupy 3 utf-8 bytes / 1 utf-16 code unit.
    #[test]
    fn test_compute_indices_unicode_line_separators() {
        let source = "hello\u{2028}world\u{2029}test";
        // Unicode line separator (\u{2028}) and paragraph separator (\u{2029}) are 3 bytes in UTF-8
        // and 1 code unit in UTF-16
        let offsets = vec![0, 5, 8, 16]; // h, ls, w, t
        let result = compute_indices(source, &offsets);

        assert_eq!(result[0], TextIndex::ZERO);
        assert_eq!(
            result[1],
            TextIndex {
                utf8: 5,
                utf16: 5,
                line: 0,
                column: 5
            }
        );
        assert_eq!(
            result[2],
            TextIndex {
                utf8: 8,
                utf16: 6,
                line: 1,
                column: 0
            }
        );
        assert_eq!(
            result[3],
            TextIndex {
                utf8: 16,
                utf16: 12,
                line: 2,
                column: 0
            }
        );
    }

    /// An empty source violates the documented precondition and must panic.
    #[test]
    #[should_panic(expected = "source cannot be empty")]
    fn test_compute_indices_empty_source() {
        let source = "";
        let offsets = vec![0];
        compute_indices(source, &offsets);
    }
}