lintspec_core/
textindex.rs

1//! A text index type
2//!
3//! The [`TextIndex`] type holds the line, column (zero-indexed, in all 3 LSP encodings) and utf-8 byte offset for a
4//! given position in the text.
5use std::{fmt, ops::Range};
6
7use derive_more::Add;
8use serde::Serialize;
9use wide::{CmpEq as _, CmpLt as _, i8x32};
10use zerocopy::transmute_ref;
11
12const SIMD_LANES: usize = i8x32::LANES as usize;
13
14/// A span of source code
15pub type TextRange = Range<TextIndex>;
16
17/// A position inside of the source code
18///
19/// Lines and columns start at 0.
20#[derive(Default, Hash, Copy, Clone, PartialEq, Eq, Debug, Serialize, Add)]
21pub struct TextIndex {
22    /// Byte offset from the start of the document
23    pub utf8: usize,
24    /// Line number (0-based)
25    pub line: u32,
26    /// Column offset in bytes (0-based)
27    pub col_utf8: u32,
28    /// Column offset in UTF-16 code units (0-based)
29    pub col_utf16: u32,
30    /// Column offset in UTF-32 code points (0-based)
31    ///
32    /// This is the same as Rust's [`char`] count.
33    pub col_utf32: u32,
34}
35
36impl TextIndex {
37    /// Shorthand for a `TextIndex` with all fields set to 0.
38    pub const ZERO: TextIndex = TextIndex {
39        utf8: 0,
40        line: 0,
41        col_utf8: 0,
42        col_utf16: 0,
43        col_utf32: 0,
44    };
45
46    /// Advance the index, accounting for lf/nl/ls/ps characters and combinations.
47    ///
48    /// This is *not* derived from the definition of 'newline' in the language definition,
49    /// nor is it a complete implementation of the Unicode line breaking algorithm.
50    ///
51    /// Implementation inspired by [`slang_solidity`](https://crates.io/crates/slang_solidity).
52    #[inline]
53    pub fn advance(&mut self, c: char, next: Option<&char>) {
54        // fast path for ASCII characters
55        if c.is_ascii() {
56            self.utf8 += 1;
57            match (c, next) {
58                ('\r', Some(&'\n')) => {
59                    // ignore for now, we will increment the line number when we process the \n
60                }
61                ('\n' | '\r', _) => {
62                    self.line += 1;
63                    self.col_utf8 = 0;
64                    self.col_utf16 = 0;
65                    self.col_utf32 = 0;
66                }
67                _ => {
68                    self.col_utf8 += 1;
69                    self.col_utf16 += 1;
70                    self.col_utf32 += 1;
71                }
72            }
73        } else {
74            // slow path for Unicode
75            let bytes = c.len_utf8();
76            self.utf8 += bytes;
77            match c {
78                '\u{2028}' | '\u{2029}' => {
79                    self.line += 1;
80                    self.col_utf8 = 0;
81                    self.col_utf16 = 0;
82                    self.col_utf32 = 0;
83                }
84                #[expect(clippy::cast_possible_truncation)]
85                _ => {
86                    self.col_utf8 += bytes as u32;
87                    self.col_utf16 += c.len_utf16() as u32;
88                    self.col_utf32 += 1;
89                }
90            }
91        }
92    }
93
94    /// Advance the `TextIndex` knowing the char `c` is non-ASCII
95    #[inline]
96    fn advance_unicode(&mut self, c: char) {
97        debug_assert!(!c.is_ascii());
98        let bytes = c.len_utf8();
99        self.utf8 += bytes;
100        match c {
101            '\u{2028}' | '\u{2029}' => {
102                self.line += 1;
103                self.col_utf8 = 0;
104                self.col_utf16 = 0;
105                self.col_utf32 = 0;
106            }
107            #[expect(clippy::cast_possible_truncation)]
108            _ => {
109                self.col_utf8 += bytes as u32;
110                self.col_utf16 += c.len_utf16() as u32;
111                self.col_utf32 += 1;
112            }
113        }
114    }
115
116    /// Advance this index according to the `Advance` parameter.
117    #[inline]
118    fn advance_by(&mut self, advance: &Advance) {
119        self.utf8 += advance.bytes as usize;
120        self.line += advance.lines;
121        // ASCII-only path: 1 byte = 1 UTF-16 code unit = 1 code point
122        match advance.column {
123            Column::Increment(n) => {
124                self.col_utf8 += n;
125                self.col_utf16 += n;
126                self.col_utf32 += n;
127            }
128            Column::Set(n) => {
129                self.col_utf8 = n;
130                self.col_utf16 = n;
131                self.col_utf32 = n;
132            }
133        }
134    }
135}
136
137impl fmt::Display for TextIndex {
138    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
139        write!(f, "{}:{}", self.line + 1, self.col_utf32 + 1)
140    }
141}
142
143impl PartialOrd for TextIndex {
144    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
145        Some(self.cmp(other))
146    }
147}
148
149impl Ord for TextIndex {
150    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
151        self.utf8.cmp(&other.utf8)
152    }
153}
154
155/// The type of operation to perform on the `TextIndex`'s `column` field
156#[derive(Debug, Clone, Copy, PartialEq, Eq)]
157enum Column {
158    Increment(u32),
159    Set(u32),
160}
161
162/// An update to perform on `TextIndex` after scanning a chunk of the input text
163#[derive(Debug, Clone, PartialEq, Eq)]
164struct Advance {
165    bytes: u32,
166    lines: u32,
167    column: Column,
168}
169
170impl Advance {
171    /// Scan a chunk of text and compute how to advance the `TextIndex`
172    ///
173    /// The return value calculates how much the index can be advanced, until either a non-ASCII character is
174    /// encountered, or the next offset of interest is reached.
175    #[inline]
176    #[must_use]
177    fn scan(slice: &[i8], start: usize, next_offset: usize) -> Self {
178        let bytes = &slice[start..next_offset];
179        let arr: [i8; SIMD_LANES] = bytes.first_chunk().copied().unwrap_or_else(|| {
180            // if we have fewer than the required bytes, we pad with `-1` which corresponds to a non-ASCII character
181            let mut arr = [-1; SIMD_LANES];
182            arr[0..bytes.len()].copy_from_slice(bytes);
183            arr
184        });
185        Self::from(arr)
186    }
187}
188
189impl From<[i8; SIMD_LANES]> for Advance {
190    /// Scan a chunk of text and compute how to advance the `TextIndex`
191    ///
192    /// The return value calculates how much the index can be advanced, until either a non-ASCII character is
193    /// encountered, or the next offset of interest is reached.
194    ///
195    /// Simplified example with a 16 bytes chunk:
196    ///
197    /// ```norust
198    /// input:          "ab\r\nefg\rijkl🦀"
199    /// non-ASCII mask: 0000_0000_0000_1111
200    /// LF mask:        0001_0000_0000_0000
201    /// CR mask:        0010_0001_0000_0000
202    /// to keep:        ^^^^ ^^^^ ^^^^
203    /// ```
204    ///
205    /// We will process an amount of bytes equal to `nonascii_mask.trailing_zeros()` (a contiguous segment of ASCII
206    /// bytes at the start of the chunk). This is the increment that will be applied to the `utf8` field of `TextIndex`
207    /// (the byte offset). Since this path is ASCII-only, all three column encodings advance equally.
208    ///
209    /// First we ignore everything starting at the first non-ASCII byte by shifting the masks:
210    ///
211    /// ```norust
212    /// padding:        vvvv
213    /// non-ASCII mask: 0000_0000_0000_0000
214    /// LF mask:        0000_0001_0000_0000
215    /// CR mask:        0000_0010_0001_0000
216    /// interesting:         ^^^^ ^^^^ ^^^^
217    /// to keep:                  ^^^^ ^^^^
218    /// ```
219    ///
220    /// The number of ASCII bytes on the last line is `lf_mask.leading_zeros()`. The total number of line returns is
221    /// `lf_mask.count_ones()`. We will increment the `line` field of `TextIndex` by this number.
222    ///
223    /// Then we ignore everything but the last line (what comes after the last LF) by shifting the masks in the other
224    /// direction:
225    ///
226    /// ```norust
227    /// padding:                  vvvv vvvv
228    /// non-ASCII mask: 0000_0000_0000_0000
229    /// LF mask:        0000_0000_0000_0000
230    /// CR mask:        0001_0000_0000_0000
231    /// interesting:    ^^^^ ^^^^
232    /// ```
233    ///
234    /// Finally, we subtract the number of `\r` bytes from the last line (`cr_mask.count_ones()`, which do not
235    /// increment the column count) from the number of bytes on the last line which we calculated before. This number
236    /// is the new value of the `column` field of `TextIndex`.
237    #[inline]
238    #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
239    fn from(chunk: [i8; SIMD_LANES]) -> Self {
240        let bytes = i8x32::new(chunk);
241        let nonascii_mask = bytes.simd_lt(i8x32::ZERO).to_bitmask();
242        let lf_bytes = i8x32::splat(b'\n' as i8);
243        let mut lf_mask = bytes.simd_eq(lf_bytes).to_bitmask();
244        let cr_bytes = i8x32::splat(b'\r' as i8);
245        let mut cr_mask = bytes.simd_eq(cr_bytes).to_bitmask();
246
247        // ignore non-ASCII characters at the end
248        let n_ascii = nonascii_mask.trailing_zeros();
249        if n_ascii == 0 {
250            // there are not ASCII bytes at the start of the chunk
251            return Advance {
252                bytes: 0,
253                column: Column::Increment(0),
254                lines: 0,
255            };
256        }
257        let shift = SIMD_LANES as u32 - n_ascii; // this is < SIMD_LANES
258        lf_mask <<= shift;
259        cr_mask <<= shift;
260
261        let mut n_lines = 0;
262        let column = if lf_mask > 0 {
263            // the chunk contains multiple lines, we ignore everything but the last line
264            n_lines = lf_mask.count_ones();
265            let n_last_line = lf_mask.leading_zeros();
266            if n_last_line == 0 {
267                // edge case where the last byte is \n
268                return Advance {
269                    bytes: n_ascii,
270                    column: Column::Set(0),
271                    lines: n_lines,
272                };
273            }
274            // we ignore the \r in the last line for the columns count
275            cr_mask >>= SIMD_LANES as u32 - n_last_line; // the shift amount is < SIMD_LANES
276            Column::Set(n_last_line - cr_mask.count_ones())
277        } else {
278            Column::Increment(n_ascii - cr_mask.count_ones())
279        };
280        Advance {
281            bytes: n_ascii,
282            column,
283            lines: n_lines,
284        }
285    }
286}
287
288/// Compute the [`TextIndex`] list corresponding to the byte offsets in the given source.
289///
290/// The list of offsets _MUST_ be sorted, but it can contain duplicates. The source string slice _MUST NOT_ be empty.
291///
292/// This routine iterates through the characters and advances a running [`TextIndex`], storing a copy in the output
293/// if it matches a desired offset.
294///
295/// SIMD is used to accelerate processing of ASCII-only sections in the source.
296pub fn compute_indices(source: &str, offsets: &[usize]) -> Vec<TextIndex> {
297    assert!(!source.is_empty(), "source cannot be empty");
298    let mut text_indices = Vec::with_capacity(offsets.len()); // upper bound for the size
299    let mut current = TextIndex::ZERO;
300
301    let mut ofs_iter = offsets.iter();
302    let Some(mut next_offset) = ofs_iter.next() else {
303        return text_indices;
304    };
305    // need to cast the bytes to `i8` to work with SIMD instructions from `wide`.
306    let bytes: &[i8] = transmute_ref!(source.as_bytes());
307    'outer: loop {
308        // process ASCII chunks with SIMD
309        loop {
310            let advance = Advance::scan(bytes, current.utf8, *next_offset);
311            current.advance_by(&advance);
312            if &current.utf8 == next_offset {
313                // we reached a target position, store it
314                text_indices.push(current);
315                // skip duplicates and advance to next offset
316                next_offset = match ofs_iter.find(|o| o != &next_offset) {
317                    Some(o) => o,
318                    None => break 'outer, // all interesting offsets have been found
319                };
320            }
321            if bytes[current.utf8] < 0 {
322                // a non-ASCII character was found, let's go to char-by-char processing
323                break;
324            }
325        }
326        // fall back to char-by-char processing for unicode
327        let remaining_source = &source[current.utf8..];
328        let mut char_iter = remaining_source.chars().peekable();
329        while let Some(c) = char_iter.next() {
330            debug_assert!(
331                next_offset >= &current.utf8,
332                "next offset {next_offset} is smaller than current {}",
333                current.utf8
334            );
335            current.advance_unicode(c);
336            if &current.utf8 >= next_offset {
337                // we reached a target position, store it
338                // for offsets that fall in the middle of a unicode character, we store the next valid position
339                text_indices.push(current);
340                // skip duplicates and advance to next offset
341                next_offset = match ofs_iter.find(|o| o > &&current.utf8) {
342                    Some(o) => o,
343                    None => break 'outer, // all interesting offsets have been found
344                };
345            }
346            if char_iter.peek().is_some_and(char::is_ascii) {
347                // we're done processing the non-ASCII characters, let's go back to SIMD-optimized processing
348                break;
349            }
350        }
351        if current.utf8 >= bytes.len() - 1 {
352            break; // done with the input
353        }
354    }
355    text_indices
356}
357
358#[cfg(test)]
359#[expect(clippy::cast_possible_wrap)]
360mod tests {
361    use similar_asserts::assert_eq;
362
363    use super::*;
364
365    #[test]
366    fn test_advance_simple() {
367        let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
368            .iter()
369            .map(|b| *b as i8)
370            .collect();
371        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
372        let advance = Advance::from(chunk);
373        assert_eq!(
374            advance,
375            Advance {
376                bytes: 32,
377                lines: 0,
378                column: Column::Increment(32)
379            }
380        );
381    }
382
383    #[test]
384    fn test_advance_newline() {
385        let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabc\nabcd"
386            .iter()
387            .map(|b| *b as i8)
388            .collect();
389        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
390        let advance = Advance::from(chunk);
391        assert_eq!(
392            advance,
393            Advance {
394                bytes: 32,
395                lines: 1,
396                column: Column::Set(4)
397            }
398        );
399    }
400
401    #[test]
402    fn test_advance_multiple_newlines() {
403        let chunk: Vec<_> = b"abcdabcdabc\nabcdabcdabcdab\r\nabc\r"
404            .iter()
405            .map(|b| *b as i8)
406            .collect();
407        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
408        let advance = Advance::from(chunk);
409        assert_eq!(
410            advance,
411            Advance {
412                bytes: 32,
413                lines: 2,
414                column: Column::Set(3)
415            }
416        );
417    }
418
419    #[test]
420    fn test_advance_unicode() {
421        let chunk: Vec<_> = "abcdabcdabcdabcdabcdabcdabcd🦀"
422            .bytes()
423            .map(|b| b as i8)
424            .collect();
425        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
426        let advance = Advance::from(chunk);
427        assert_eq!(
428            advance,
429            Advance {
430                bytes: 28,
431                lines: 0,
432                column: Column::Increment(28)
433            }
434        );
435    }
436
437    #[test]
438    fn test_advance_unicode_newlines() {
439        let chunk: Vec<_> = "abcdabcdabc\nabcdabcdabc\nabcd🦀"
440            .bytes()
441            .map(|b| b as i8)
442            .collect();
443        let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
444        let advance = Advance::from(chunk);
445        assert_eq!(
446            advance,
447            Advance {
448                bytes: 28,
449                lines: 2,
450                column: Column::Set(4)
451            }
452        );
453    }
454
455    #[test]
456    fn test_advance_next_offset() {
457        let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
458            .iter()
459            .map(|b| *b as i8)
460            .collect();
461        let advance = Advance::scan(chunk.as_slice(), 0, 28);
462        assert_eq!(
463            advance,
464            Advance {
465                bytes: 28,
466                lines: 0,
467                column: Column::Increment(28)
468            }
469        );
470    }
471
472    #[test]
473    fn test_advance_next_offset_newline() {
474        let chunk: Vec<_> = b"abcdabcdabcdabc\nabcdabcdabc\nabcd"
475            .iter()
476            .map(|b| *b as i8)
477            .collect();
478        let advance = Advance::scan(chunk.as_slice(), 0, 28);
479        assert_eq!(
480            advance,
481            Advance {
482                bytes: 28,
483                lines: 2,
484                column: Column::Set(0)
485            }
486        );
487    }
488
489    #[test]
490    fn test_compute_indices_simple() {
491        let source = "hello world";
492        let offsets = vec![0, 5, 6, 10]; // h, space, w, d
493        let result = compute_indices(source, &offsets);
494
495        assert_eq!(result.len(), 4);
496        assert_eq!(result[0], TextIndex::ZERO);
497        assert_eq!(
498            result[1],
499            TextIndex {
500                utf8: 5,
501                line: 0,
502                col_utf8: 5,
503                col_utf16: 5,
504                col_utf32: 5
505            }
506        );
507        assert_eq!(
508            result[2],
509            TextIndex {
510                utf8: 6,
511                line: 0,
512                col_utf8: 6,
513                col_utf16: 6,
514                col_utf32: 6
515            }
516        );
517        assert_eq!(
518            result[3],
519            TextIndex {
520                utf8: 10,
521                line: 0,
522                col_utf8: 10,
523                col_utf16: 10,
524                col_utf32: 10
525            }
526        );
527    }
528
529    #[test]
530    fn test_compute_indices_with_newlines() {
531        let source = "hello\nworld\ntest";
532        let offsets = vec![0, 5, 6, 12, 15]; // h, nl, w, t, t
533        let result = compute_indices(source, &offsets);
534
535        assert_eq!(result.len(), 5);
536        assert_eq!(result[0], TextIndex::ZERO);
537        assert_eq!(
538            result[1],
539            TextIndex {
540                utf8: 5,
541                line: 0,
542                col_utf8: 5,
543                col_utf16: 5,
544                col_utf32: 5
545            }
546        );
547        assert_eq!(
548            result[2],
549            TextIndex {
550                utf8: 6,
551                line: 1,
552                col_utf8: 0,
553                col_utf16: 0,
554                col_utf32: 0
555            }
556        );
557        assert_eq!(
558            result[3],
559            TextIndex {
560                utf8: 12,
561                line: 2,
562                col_utf8: 0,
563                col_utf16: 0,
564                col_utf32: 0
565            }
566        );
567        assert_eq!(
568            result[4],
569            TextIndex {
570                utf8: 15,
571                line: 2,
572                col_utf8: 3,
573                col_utf16: 3,
574                col_utf32: 3
575            }
576        );
577    }
578
579    #[test]
580    fn test_compute_indices_with_unicode() {
581        let source = "hel🦀lo";
582        let offsets = vec![0, 3, 7]; // h, crab, l
583        let result = compute_indices(source, &offsets);
584
585        assert_eq!(result[0], TextIndex::ZERO);
586        assert_eq!(
587            result[1],
588            TextIndex {
589                utf8: 3,
590                line: 0,
591                col_utf8: 3,
592                col_utf16: 3,
593                col_utf32: 3
594            }
595        );
596        // 🦀 = 4 bytes UTF-8, 2 UTF-16 code units, 1 code point
597        assert_eq!(
598            result[2],
599            TextIndex {
600                utf8: 7,
601                line: 0,
602                col_utf8: 7,
603                col_utf16: 5,
604                col_utf32: 4
605            }
606        );
607    }
608
609    #[test]
610    fn test_compute_indices_with_carriage_return() {
611        let source = "padding_hello\r\nworld_padding_padding";
612        let offsets = vec![8, 13, 14, 16, 36]; // h, \r, \n, o, g (last)
613        let result = compute_indices(source, &offsets);
614
615        assert_eq!(
616            result[0],
617            TextIndex {
618                utf8: 8,
619                line: 0,
620                col_utf8: 8,
621                col_utf16: 8,
622                col_utf32: 8
623            }
624        );
625        assert_eq!(
626            result[1],
627            TextIndex {
628                utf8: 13,
629                line: 0,
630                col_utf8: 13,
631                col_utf16: 13,
632                col_utf32: 13
633            }
634        );
635        assert_eq!(
636            result[2],
637            TextIndex {
638                utf8: 14,
639                line: 0,
640                col_utf8: 13, // \r doesn't advance
641                col_utf16: 13,
642                col_utf32: 13
643            }
644        );
645        assert_eq!(
646            result[3],
647            TextIndex {
648                utf8: 16,
649                line: 1,
650                col_utf8: 1,
651                col_utf16: 1,
652                col_utf32: 1
653            }
654        );
655        assert_eq!(
656            result[4],
657            TextIndex {
658                utf8: 36,
659                line: 1,
660                col_utf8: 21,
661                col_utf16: 21,
662                col_utf32: 21
663            }
664        );
665    }
666
667    #[test]
668    fn test_compute_indices_duplicate_offsets() {
669        let source = "hello";
670        let offsets = vec![0, 0, 2, 2, 4];
671        let result = compute_indices(source, &offsets);
672
673        assert_eq!(result.len(), 3); // duplicates should be handled
674        assert_eq!(result[0], TextIndex::ZERO);
675        assert_eq!(
676            result[1],
677            TextIndex {
678                utf8: 2,
679                line: 0,
680                col_utf8: 2,
681                col_utf16: 2,
682                col_utf32: 2
683            }
684        );
685        assert_eq!(
686            result[2],
687            TextIndex {
688                utf8: 4,
689                line: 0,
690                col_utf8: 4,
691                col_utf16: 4,
692                col_utf32: 4
693            }
694        );
695    }
696
697    #[test]
698    fn test_compute_indices_unicode_line_separators() {
699        let source = "hello\u{2028}world\u{2029}test";
700        // Unicode line separator (\u{2028}) and paragraph separator (\u{2029}) are 3 bytes in UTF-8
701        // and 1 code unit in UTF-16
702        let offsets = vec![0, 5, 8, 16]; // h, ls, w, t
703        let result = compute_indices(source, &offsets);
704
705        assert_eq!(result[0], TextIndex::ZERO);
706        assert_eq!(
707            result[1],
708            TextIndex {
709                utf8: 5,
710                line: 0,
711                col_utf8: 5,
712                col_utf16: 5,
713                col_utf32: 5
714            }
715        );
716        assert_eq!(
717            result[2],
718            TextIndex {
719                utf8: 8,
720                line: 1,
721                col_utf8: 0,
722                col_utf16: 0,
723                col_utf32: 0
724            }
725        );
726        assert_eq!(
727            result[3],
728            TextIndex {
729                utf8: 16,
730                line: 2,
731                col_utf8: 0,
732                col_utf16: 0,
733                col_utf32: 0
734            }
735        );
736    }
737
738    #[test]
739    #[should_panic(expected = "source cannot be empty")]
740    fn test_compute_indices_empty_source() {
741        let source = "";
742        let offsets = vec![0];
743        compute_indices(source, &offsets);
744    }
745}
lintspec_core/textindex.rs

lintspec_core/
textindex.rs