1use std::{fmt, ops::Range};
6
7use derive_more::Add;
8use serde::Serialize;
9use wide::{CmpEq as _, CmpLt as _, i8x32};
10use zerocopy::transmute_ref;
11
12const SIMD_LANES: usize = i8x32::LANES as usize;
13
14pub type TextRange = Range<TextIndex>;
16
17#[derive(Default, Hash, Copy, Clone, PartialEq, Eq, Debug, Serialize, Add)]
21pub struct TextIndex {
22 pub utf8: usize,
24 pub line: u32,
26 pub col_utf8: u32,
28 pub col_utf16: u32,
30 pub col_utf32: u32,
34}
35
36impl TextIndex {
37 pub const ZERO: TextIndex = TextIndex {
39 utf8: 0,
40 line: 0,
41 col_utf8: 0,
42 col_utf16: 0,
43 col_utf32: 0,
44 };
45
46 #[inline]
53 pub fn advance(&mut self, c: char, next: Option<&char>) {
54 if c.is_ascii() {
56 self.utf8 += 1;
57 match (c, next) {
58 ('\r', Some(&'\n')) => {
59 }
61 ('\n' | '\r', _) => {
62 self.line += 1;
63 self.col_utf8 = 0;
64 self.col_utf16 = 0;
65 self.col_utf32 = 0;
66 }
67 _ => {
68 self.col_utf8 += 1;
69 self.col_utf16 += 1;
70 self.col_utf32 += 1;
71 }
72 }
73 } else {
74 let bytes = c.len_utf8();
76 self.utf8 += bytes;
77 match c {
78 '\u{2028}' | '\u{2029}' => {
79 self.line += 1;
80 self.col_utf8 = 0;
81 self.col_utf16 = 0;
82 self.col_utf32 = 0;
83 }
84 #[expect(clippy::cast_possible_truncation)]
85 _ => {
86 self.col_utf8 += bytes as u32;
87 self.col_utf16 += c.len_utf16() as u32;
88 self.col_utf32 += 1;
89 }
90 }
91 }
92 }
93
94 #[inline]
96 fn advance_unicode(&mut self, c: char) {
97 debug_assert!(!c.is_ascii());
98 let bytes = c.len_utf8();
99 self.utf8 += bytes;
100 match c {
101 '\u{2028}' | '\u{2029}' => {
102 self.line += 1;
103 self.col_utf8 = 0;
104 self.col_utf16 = 0;
105 self.col_utf32 = 0;
106 }
107 #[expect(clippy::cast_possible_truncation)]
108 _ => {
109 self.col_utf8 += bytes as u32;
110 self.col_utf16 += c.len_utf16() as u32;
111 self.col_utf32 += 1;
112 }
113 }
114 }
115
116 #[inline]
118 fn advance_by(&mut self, advance: &Advance) {
119 self.utf8 += advance.bytes as usize;
120 self.line += advance.lines;
121 match advance.column {
123 Column::Increment(n) => {
124 self.col_utf8 += n;
125 self.col_utf16 += n;
126 self.col_utf32 += n;
127 }
128 Column::Set(n) => {
129 self.col_utf8 = n;
130 self.col_utf16 = n;
131 self.col_utf32 = n;
132 }
133 }
134 }
135}
136
137impl fmt::Display for TextIndex {
138 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
139 write!(f, "{}:{}", self.line + 1, self.col_utf32 + 1)
140 }
141}
142
143impl PartialOrd for TextIndex {
144 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
145 Some(self.cmp(other))
146 }
147}
148
149impl Ord for TextIndex {
150 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
151 self.utf8.cmp(&other.utf8)
152 }
153}
154
155#[derive(Debug, Clone, Copy, PartialEq, Eq)]
157enum Column {
158 Increment(u32),
159 Set(u32),
160}
161
162#[derive(Debug, Clone, PartialEq, Eq)]
164struct Advance {
165 bytes: u32,
166 lines: u32,
167 column: Column,
168}
169
170impl Advance {
171 #[inline]
176 #[must_use]
177 fn scan(slice: &[i8], start: usize, next_offset: usize) -> Self {
178 let bytes = &slice[start..next_offset];
179 let arr: [i8; SIMD_LANES] = bytes.first_chunk().copied().unwrap_or_else(|| {
180 let mut arr = [-1; SIMD_LANES];
182 arr[0..bytes.len()].copy_from_slice(bytes);
183 arr
184 });
185 Self::from(arr)
186 }
187}
188
189impl From<[i8; SIMD_LANES]> for Advance {
190 #[inline]
238 #[expect(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
239 fn from(chunk: [i8; SIMD_LANES]) -> Self {
240 let bytes = i8x32::new(chunk);
241 let nonascii_mask = bytes.simd_lt(i8x32::ZERO).to_bitmask();
242 let lf_bytes = i8x32::splat(b'\n' as i8);
243 let mut lf_mask = bytes.simd_eq(lf_bytes).to_bitmask();
244 let cr_bytes = i8x32::splat(b'\r' as i8);
245 let mut cr_mask = bytes.simd_eq(cr_bytes).to_bitmask();
246
247 let n_ascii = nonascii_mask.trailing_zeros();
249 if n_ascii == 0 {
250 return Advance {
252 bytes: 0,
253 column: Column::Increment(0),
254 lines: 0,
255 };
256 }
257 let shift = SIMD_LANES as u32 - n_ascii; lf_mask <<= shift;
259 cr_mask <<= shift;
260
261 let mut n_lines = 0;
262 let column = if lf_mask > 0 {
263 n_lines = lf_mask.count_ones();
265 let n_last_line = lf_mask.leading_zeros();
266 if n_last_line == 0 {
267 return Advance {
269 bytes: n_ascii,
270 column: Column::Set(0),
271 lines: n_lines,
272 };
273 }
274 cr_mask >>= SIMD_LANES as u32 - n_last_line; Column::Set(n_last_line - cr_mask.count_ones())
277 } else {
278 Column::Increment(n_ascii - cr_mask.count_ones())
279 };
280 Advance {
281 bytes: n_ascii,
282 column,
283 lines: n_lines,
284 }
285 }
286}
287
288pub fn compute_indices(source: &str, offsets: &[usize]) -> Vec<TextIndex> {
297 assert!(!source.is_empty(), "source cannot be empty");
298 let mut text_indices = Vec::with_capacity(offsets.len()); let mut current = TextIndex::ZERO;
300
301 let mut ofs_iter = offsets.iter();
302 let Some(mut next_offset) = ofs_iter.next() else {
303 return text_indices;
304 };
305 let bytes: &[i8] = transmute_ref!(source.as_bytes());
307 'outer: loop {
308 loop {
310 let advance = Advance::scan(bytes, current.utf8, *next_offset);
311 current.advance_by(&advance);
312 if ¤t.utf8 == next_offset {
313 text_indices.push(current);
315 next_offset = match ofs_iter.find(|o| o != &next_offset) {
317 Some(o) => o,
318 None => break 'outer, };
320 }
321 if bytes[current.utf8] < 0 {
322 break;
324 }
325 }
326 let remaining_source = &source[current.utf8..];
328 let mut char_iter = remaining_source.chars().peekable();
329 while let Some(c) = char_iter.next() {
330 debug_assert!(
331 next_offset >= ¤t.utf8,
332 "next offset {next_offset} is smaller than current {}",
333 current.utf8
334 );
335 current.advance_unicode(c);
336 if ¤t.utf8 >= next_offset {
337 text_indices.push(current);
340 next_offset = match ofs_iter.find(|o| o > &¤t.utf8) {
342 Some(o) => o,
343 None => break 'outer, };
345 }
346 if char_iter.peek().is_some_and(char::is_ascii) {
347 break;
349 }
350 }
351 if current.utf8 >= bytes.len() - 1 {
352 break; }
354 }
355 text_indices
356}
357
358#[cfg(test)]
359#[expect(clippy::cast_possible_wrap)]
360mod tests {
361 use similar_asserts::assert_eq;
362
363 use super::*;
364
365 #[test]
366 fn test_advance_simple() {
367 let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
368 .iter()
369 .map(|b| *b as i8)
370 .collect();
371 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
372 let advance = Advance::from(chunk);
373 assert_eq!(
374 advance,
375 Advance {
376 bytes: 32,
377 lines: 0,
378 column: Column::Increment(32)
379 }
380 );
381 }
382
383 #[test]
384 fn test_advance_newline() {
385 let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabc\nabcd"
386 .iter()
387 .map(|b| *b as i8)
388 .collect();
389 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
390 let advance = Advance::from(chunk);
391 assert_eq!(
392 advance,
393 Advance {
394 bytes: 32,
395 lines: 1,
396 column: Column::Set(4)
397 }
398 );
399 }
400
401 #[test]
402 fn test_advance_multiple_newlines() {
403 let chunk: Vec<_> = b"abcdabcdabc\nabcdabcdabcdab\r\nabc\r"
404 .iter()
405 .map(|b| *b as i8)
406 .collect();
407 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
408 let advance = Advance::from(chunk);
409 assert_eq!(
410 advance,
411 Advance {
412 bytes: 32,
413 lines: 2,
414 column: Column::Set(3)
415 }
416 );
417 }
418
419 #[test]
420 fn test_advance_unicode() {
421 let chunk: Vec<_> = "abcdabcdabcdabcdabcdabcdabcd🦀"
422 .bytes()
423 .map(|b| b as i8)
424 .collect();
425 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
426 let advance = Advance::from(chunk);
427 assert_eq!(
428 advance,
429 Advance {
430 bytes: 28,
431 lines: 0,
432 column: Column::Increment(28)
433 }
434 );
435 }
436
437 #[test]
438 fn test_advance_unicode_newlines() {
439 let chunk: Vec<_> = "abcdabcdabc\nabcdabcdabc\nabcd🦀"
440 .bytes()
441 .map(|b| b as i8)
442 .collect();
443 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
444 let advance = Advance::from(chunk);
445 assert_eq!(
446 advance,
447 Advance {
448 bytes: 28,
449 lines: 2,
450 column: Column::Set(4)
451 }
452 );
453 }
454
455 #[test]
456 fn test_advance_next_offset() {
457 let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
458 .iter()
459 .map(|b| *b as i8)
460 .collect();
461 let advance = Advance::scan(chunk.as_slice(), 0, 28);
462 assert_eq!(
463 advance,
464 Advance {
465 bytes: 28,
466 lines: 0,
467 column: Column::Increment(28)
468 }
469 );
470 }
471
472 #[test]
473 fn test_advance_next_offset_newline() {
474 let chunk: Vec<_> = b"abcdabcdabcdabc\nabcdabcdabc\nabcd"
475 .iter()
476 .map(|b| *b as i8)
477 .collect();
478 let advance = Advance::scan(chunk.as_slice(), 0, 28);
479 assert_eq!(
480 advance,
481 Advance {
482 bytes: 28,
483 lines: 2,
484 column: Column::Set(0)
485 }
486 );
487 }
488
489 #[test]
490 fn test_compute_indices_simple() {
491 let source = "hello world";
492 let offsets = vec![0, 5, 6, 10]; let result = compute_indices(source, &offsets);
494
495 assert_eq!(result.len(), 4);
496 assert_eq!(result[0], TextIndex::ZERO);
497 assert_eq!(
498 result[1],
499 TextIndex {
500 utf8: 5,
501 line: 0,
502 col_utf8: 5,
503 col_utf16: 5,
504 col_utf32: 5
505 }
506 );
507 assert_eq!(
508 result[2],
509 TextIndex {
510 utf8: 6,
511 line: 0,
512 col_utf8: 6,
513 col_utf16: 6,
514 col_utf32: 6
515 }
516 );
517 assert_eq!(
518 result[3],
519 TextIndex {
520 utf8: 10,
521 line: 0,
522 col_utf8: 10,
523 col_utf16: 10,
524 col_utf32: 10
525 }
526 );
527 }
528
529 #[test]
530 fn test_compute_indices_with_newlines() {
531 let source = "hello\nworld\ntest";
532 let offsets = vec![0, 5, 6, 12, 15]; let result = compute_indices(source, &offsets);
534
535 assert_eq!(result.len(), 5);
536 assert_eq!(result[0], TextIndex::ZERO);
537 assert_eq!(
538 result[1],
539 TextIndex {
540 utf8: 5,
541 line: 0,
542 col_utf8: 5,
543 col_utf16: 5,
544 col_utf32: 5
545 }
546 );
547 assert_eq!(
548 result[2],
549 TextIndex {
550 utf8: 6,
551 line: 1,
552 col_utf8: 0,
553 col_utf16: 0,
554 col_utf32: 0
555 }
556 );
557 assert_eq!(
558 result[3],
559 TextIndex {
560 utf8: 12,
561 line: 2,
562 col_utf8: 0,
563 col_utf16: 0,
564 col_utf32: 0
565 }
566 );
567 assert_eq!(
568 result[4],
569 TextIndex {
570 utf8: 15,
571 line: 2,
572 col_utf8: 3,
573 col_utf16: 3,
574 col_utf32: 3
575 }
576 );
577 }
578
579 #[test]
580 fn test_compute_indices_with_unicode() {
581 let source = "hel🦀lo";
582 let offsets = vec![0, 3, 7]; let result = compute_indices(source, &offsets);
584
585 assert_eq!(result[0], TextIndex::ZERO);
586 assert_eq!(
587 result[1],
588 TextIndex {
589 utf8: 3,
590 line: 0,
591 col_utf8: 3,
592 col_utf16: 3,
593 col_utf32: 3
594 }
595 );
596 assert_eq!(
598 result[2],
599 TextIndex {
600 utf8: 7,
601 line: 0,
602 col_utf8: 7,
603 col_utf16: 5,
604 col_utf32: 4
605 }
606 );
607 }
608
609 #[test]
610 fn test_compute_indices_with_carriage_return() {
611 let source = "padding_hello\r\nworld_padding_padding";
612 let offsets = vec![8, 13, 14, 16, 36]; let result = compute_indices(source, &offsets);
614
615 assert_eq!(
616 result[0],
617 TextIndex {
618 utf8: 8,
619 line: 0,
620 col_utf8: 8,
621 col_utf16: 8,
622 col_utf32: 8
623 }
624 );
625 assert_eq!(
626 result[1],
627 TextIndex {
628 utf8: 13,
629 line: 0,
630 col_utf8: 13,
631 col_utf16: 13,
632 col_utf32: 13
633 }
634 );
635 assert_eq!(
636 result[2],
637 TextIndex {
638 utf8: 14,
639 line: 0,
640 col_utf8: 13, col_utf16: 13,
642 col_utf32: 13
643 }
644 );
645 assert_eq!(
646 result[3],
647 TextIndex {
648 utf8: 16,
649 line: 1,
650 col_utf8: 1,
651 col_utf16: 1,
652 col_utf32: 1
653 }
654 );
655 assert_eq!(
656 result[4],
657 TextIndex {
658 utf8: 36,
659 line: 1,
660 col_utf8: 21,
661 col_utf16: 21,
662 col_utf32: 21
663 }
664 );
665 }
666
667 #[test]
668 fn test_compute_indices_duplicate_offsets() {
669 let source = "hello";
670 let offsets = vec![0, 0, 2, 2, 4];
671 let result = compute_indices(source, &offsets);
672
673 assert_eq!(result.len(), 3); assert_eq!(result[0], TextIndex::ZERO);
675 assert_eq!(
676 result[1],
677 TextIndex {
678 utf8: 2,
679 line: 0,
680 col_utf8: 2,
681 col_utf16: 2,
682 col_utf32: 2
683 }
684 );
685 assert_eq!(
686 result[2],
687 TextIndex {
688 utf8: 4,
689 line: 0,
690 col_utf8: 4,
691 col_utf16: 4,
692 col_utf32: 4
693 }
694 );
695 }
696
697 #[test]
698 fn test_compute_indices_unicode_line_separators() {
699 let source = "hello\u{2028}world\u{2029}test";
700 let offsets = vec![0, 5, 8, 16]; let result = compute_indices(source, &offsets);
704
705 assert_eq!(result[0], TextIndex::ZERO);
706 assert_eq!(
707 result[1],
708 TextIndex {
709 utf8: 5,
710 line: 0,
711 col_utf8: 5,
712 col_utf16: 5,
713 col_utf32: 5
714 }
715 );
716 assert_eq!(
717 result[2],
718 TextIndex {
719 utf8: 8,
720 line: 1,
721 col_utf8: 0,
722 col_utf16: 0,
723 col_utf32: 0
724 }
725 );
726 assert_eq!(
727 result[3],
728 TextIndex {
729 utf8: 16,
730 line: 2,
731 col_utf8: 0,
732 col_utf16: 0,
733 col_utf32: 0
734 }
735 );
736 }
737
738 #[test]
739 #[should_panic(expected = "source cannot be empty")]
740 fn test_compute_indices_empty_source() {
741 let source = "";
742 let offsets = vec![0];
743 compute_indices(source, &offsets);
744 }
745}