1use std::{fmt, ops::Range};
6
7use derive_more::Add;
8use serde::Serialize;
9use wide::{CmpEq as _, CmpLt as _, i8x32};
10use zerocopy::transmute_ref;
11
12const SIMD_LANES: usize = i8x32::LANES as usize;
13
14pub type TextRange = Range<TextIndex>;
16
17#[derive(Default, Hash, Copy, Clone, PartialEq, Eq, Debug, Serialize, Add)]
21pub struct TextIndex {
22 pub utf8: usize,
23 pub utf16: usize,
24 pub line: usize,
25 pub column: usize,
26}
27
28impl TextIndex {
29 pub const ZERO: TextIndex = TextIndex {
31 utf8: 0,
32 utf16: 0,
33 line: 0,
34 column: 0,
35 };
36
37 #[inline]
44 pub fn advance(&mut self, c: char, next: Option<&char>) {
45 if c.is_ascii() {
47 self.utf8 += 1;
48 self.utf16 += 1;
49 match (c, next) {
50 ('\r', Some(&'\n')) => {
51 }
53 ('\n' | '\r', _) => {
54 self.line += 1;
55 self.column = 0;
56 }
57 _ => {
58 self.column += 1;
59 }
60 }
61 } else {
62 self.utf8 += c.len_utf8();
64 self.utf16 += c.len_utf16();
65 match c {
66 '\u{2028}' | '\u{2029}' => {
67 self.line += 1;
68 self.column = 0;
69 }
70 _ => {
71 self.column += 1;
72 }
73 }
74 }
75 }
76
77 #[inline]
79 fn advance_unicode(&mut self, c: char) {
80 debug_assert!(!c.is_ascii());
81 self.utf8 += c.len_utf8();
82 self.utf16 += c.len_utf16();
83 match c {
84 '\u{2028}' | '\u{2029}' => {
85 self.line += 1;
86 self.column = 0;
87 }
88 _ => {
89 self.column += 1;
90 }
91 }
92 }
93
94 #[inline]
96 fn advance_by(&mut self, advance: &Advance) {
97 self.utf8 += advance.bytes;
98 self.utf16 += advance.bytes;
99 self.line += advance.lines;
100 match advance.column {
101 Column::Increment(n) => self.column += n,
102 Column::Set(n) => self.column = n,
103 }
104 }
105}
106
107impl fmt::Display for TextIndex {
108 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
109 write!(f, "{}:{}", self.line + 1, self.column + 1)
110 }
111}
112
113impl PartialOrd for TextIndex {
114 fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
115 Some(self.cmp(other))
116 }
117}
118
119impl Ord for TextIndex {
120 fn cmp(&self, other: &Self) -> std::cmp::Ordering {
121 self.utf8.cmp(&other.utf8)
122 }
123}
124
125#[derive(Debug, Clone, Copy, PartialEq, Eq)]
127enum Column {
128 Increment(usize),
129 Set(usize),
130}
131
132#[derive(Debug, Clone, PartialEq, Eq)]
134struct Advance {
135 bytes: usize,
136 lines: usize,
137 column: Column,
138}
139
140impl Advance {
141 #[inline]
146 #[must_use]
147 fn scan(slice: &[i8], start: usize, next_offset: usize) -> Self {
148 let bytes = &slice[start..next_offset];
149 let arr: [i8; SIMD_LANES] = bytes.first_chunk().copied().unwrap_or_else(|| {
150 let mut arr = [-1; SIMD_LANES];
152 arr[0..bytes.len()].copy_from_slice(bytes);
153 arr
154 });
155 Self::from(arr)
156 }
157}
158
159impl From<[i8; SIMD_LANES]> for Advance {
160 #[inline]
208 fn from(chunk: [i8; SIMD_LANES]) -> Self {
209 let bytes = i8x32::new(chunk);
210 let nonascii_mask = bytes.simd_lt(i8x32::ZERO).to_bitmask();
211 #[expect(clippy::cast_possible_wrap)]
212 let lf_bytes = i8x32::splat(b'\n' as i8);
213 let mut lf_mask = bytes.simd_eq(lf_bytes).to_bitmask();
214 #[expect(clippy::cast_possible_wrap)]
215 let cr_bytes = i8x32::splat(b'\r' as i8);
216 let mut cr_mask = bytes.simd_eq(cr_bytes).to_bitmask();
217
218 let n_ascii = nonascii_mask.trailing_zeros() as usize;
220 if n_ascii == 0 {
221 return Advance {
223 bytes: 0,
224 column: Column::Increment(0),
225 lines: 0,
226 };
227 }
228 let shift = SIMD_LANES - n_ascii; lf_mask <<= shift;
230 cr_mask <<= shift;
231
232 let mut n_lines = 0;
233 let column = if lf_mask > 0 {
234 n_lines = lf_mask.count_ones() as usize;
236 let n_last_line = lf_mask.leading_zeros() as usize;
237 if n_last_line == 0 {
238 return Advance {
240 bytes: n_ascii,
241 column: Column::Set(0),
242 lines: n_lines,
243 };
244 }
245 cr_mask >>= SIMD_LANES - n_last_line; Column::Set(n_last_line - cr_mask.count_ones() as usize)
248 } else {
249 Column::Increment(n_ascii - cr_mask.count_ones() as usize)
250 };
251 Advance {
252 bytes: n_ascii,
253 column,
254 lines: n_lines,
255 }
256 }
257}
258
259pub fn compute_indices(source: &str, offsets: &[usize]) -> Vec<TextIndex> {
268 assert!(!source.is_empty(), "source cannot be empty");
269 let mut text_indices = Vec::with_capacity(offsets.len()); let mut current = TextIndex::ZERO;
271
272 let mut ofs_iter = offsets.iter();
273 let Some(mut next_offset) = ofs_iter.next() else {
274 return text_indices;
275 };
276 let bytes: &[i8] = transmute_ref!(source.as_bytes());
278 'outer: loop {
279 loop {
281 let advance = Advance::scan(bytes, current.utf8, *next_offset);
282 current.advance_by(&advance);
283 if ¤t.utf8 == next_offset {
284 text_indices.push(current);
286 next_offset = match ofs_iter.find(|o| o != &next_offset) {
288 Some(o) => o,
289 None => break 'outer, };
291 }
292 if bytes[current.utf8] < 0 {
293 break;
295 }
296 }
297 let remaining_source = &source[current.utf8..];
299 let mut char_iter = remaining_source.chars().peekable();
300 while let Some(c) = char_iter.next() {
301 debug_assert!(
302 next_offset >= ¤t.utf8,
303 "next offset {next_offset} is smaller than current {}",
304 current.utf8
305 );
306 current.advance_unicode(c);
307 if ¤t.utf8 >= next_offset {
308 text_indices.push(current);
311 next_offset = match ofs_iter.find(|o| o > &¤t.utf8) {
313 Some(o) => o,
314 None => break 'outer, };
316 }
317 if char_iter.peek().is_some_and(char::is_ascii) {
318 break;
320 }
321 }
322 if current.utf8 >= bytes.len() - 1 {
323 break; }
325 }
326 text_indices
327}
328
329#[cfg(test)]
330#[expect(clippy::cast_possible_wrap)]
331mod tests {
332 use similar_asserts::assert_eq;
333
334 use super::*;
335
336 #[test]
337 fn test_advance_simple() {
338 let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
339 .iter()
340 .map(|b| *b as i8)
341 .collect();
342 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
343 let advance = Advance::from(chunk);
344 assert_eq!(
345 advance,
346 Advance {
347 bytes: 32,
348 lines: 0,
349 column: Column::Increment(32)
350 }
351 );
352 }
353
354 #[test]
355 fn test_advance_newline() {
356 let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabc\nabcd"
357 .iter()
358 .map(|b| *b as i8)
359 .collect();
360 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
361 let advance = Advance::from(chunk);
362 assert_eq!(
363 advance,
364 Advance {
365 bytes: 32,
366 lines: 1,
367 column: Column::Set(4)
368 }
369 );
370 }
371
372 #[test]
373 fn test_advance_multiple_newlines() {
374 let chunk: Vec<_> = b"abcdabcdabc\nabcdabcdabcdab\r\nabc\r"
375 .iter()
376 .map(|b| *b as i8)
377 .collect();
378 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
379 let advance = Advance::from(chunk);
380 assert_eq!(
381 advance,
382 Advance {
383 bytes: 32,
384 lines: 2,
385 column: Column::Set(3)
386 }
387 );
388 }
389
390 #[test]
391 fn test_advance_unicode() {
392 let chunk: Vec<_> = "abcdabcdabcdabcdabcdabcdabcd🦀"
393 .bytes()
394 .map(|b| b as i8)
395 .collect();
396 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
397 let advance = Advance::from(chunk);
398 assert_eq!(
399 advance,
400 Advance {
401 bytes: 28,
402 lines: 0,
403 column: Column::Increment(28)
404 }
405 );
406 }
407
408 #[test]
409 fn test_advance_unicode_newlines() {
410 let chunk: Vec<_> = "abcdabcdabc\nabcdabcdabc\nabcd🦀"
411 .bytes()
412 .map(|b| b as i8)
413 .collect();
414 let chunk: [i8; 32] = chunk.as_slice().try_into().unwrap();
415 let advance = Advance::from(chunk);
416 assert_eq!(
417 advance,
418 Advance {
419 bytes: 28,
420 lines: 2,
421 column: Column::Set(4)
422 }
423 );
424 }
425
426 #[test]
427 fn test_advance_next_offset() {
428 let chunk: Vec<_> = b"abcdabcdabcdabcdabcdabcdabcdabcd"
429 .iter()
430 .map(|b| *b as i8)
431 .collect();
432 let advance = Advance::scan(chunk.as_slice(), 0, 28);
433 assert_eq!(
434 advance,
435 Advance {
436 bytes: 28,
437 lines: 0,
438 column: Column::Increment(28)
439 }
440 );
441 }
442
443 #[test]
444 fn test_advance_next_offset_newline() {
445 let chunk: Vec<_> = b"abcdabcdabcdabc\nabcdabcdabc\nabcd"
446 .iter()
447 .map(|b| *b as i8)
448 .collect();
449 let advance = Advance::scan(chunk.as_slice(), 0, 28);
450 assert_eq!(
451 advance,
452 Advance {
453 bytes: 28,
454 lines: 2,
455 column: Column::Set(0)
456 }
457 );
458 }
459
460 #[test]
461 fn test_compute_indices_simple() {
462 let source = "hello world";
463 let offsets = vec![0, 5, 6, 10]; let result = compute_indices(source, &offsets);
465
466 assert_eq!(result.len(), 4);
467 assert_eq!(result[0], TextIndex::ZERO);
468 assert_eq!(
469 result[1],
470 TextIndex {
471 utf8: 5,
472 utf16: 5,
473 line: 0,
474 column: 5
475 }
476 );
477 assert_eq!(
478 result[2],
479 TextIndex {
480 utf8: 6,
481 utf16: 6,
482 line: 0,
483 column: 6
484 }
485 );
486 assert_eq!(
487 result[3],
488 TextIndex {
489 utf8: 10,
490 utf16: 10,
491 line: 0,
492 column: 10
493 }
494 );
495 }
496
497 #[test]
498 fn test_compute_indices_with_newlines() {
499 let source = "hello\nworld\ntest";
500 let offsets = vec![0, 5, 6, 12, 15]; let result = compute_indices(source, &offsets);
502
503 assert_eq!(result.len(), 5);
504 assert_eq!(
505 result[0],
506 TextIndex {
507 utf8: 0,
508 utf16: 0,
509 line: 0,
510 column: 0
511 }
512 );
513 assert_eq!(
514 result[1],
515 TextIndex {
516 utf8: 5,
517 utf16: 5,
518 line: 0,
519 column: 5
520 }
521 );
522 assert_eq!(
523 result[2],
524 TextIndex {
525 utf8: 6,
526 utf16: 6,
527 line: 1,
528 column: 0
529 }
530 );
531 assert_eq!(
532 result[3],
533 TextIndex {
534 utf8: 12,
535 utf16: 12,
536 line: 2,
537 column: 0
538 }
539 );
540 assert_eq!(
541 result[4],
542 TextIndex {
543 utf8: 15,
544 utf16: 15,
545 line: 2,
546 column: 3
547 }
548 );
549 }
550
551 #[test]
552 fn test_compute_indices_with_unicode() {
553 let source = "hel🦀lo";
554 let offsets = vec![0, 3, 7]; let result = compute_indices(source, &offsets);
556
557 assert_eq!(
558 result[0],
559 TextIndex {
560 utf8: 0,
561 utf16: 0,
562 line: 0,
563 column: 0
564 }
565 );
566 assert_eq!(
567 result[1],
568 TextIndex {
569 utf8: 3,
570 utf16: 3,
571 line: 0,
572 column: 3
573 }
574 );
575 assert_eq!(
576 result[2],
577 TextIndex {
578 utf8: 7,
579 utf16: 5,
580 line: 0,
581 column: 4
582 }
583 );
584 }
585
586 #[test]
587 fn test_compute_indices_with_carriage_return() {
588 let source = "padding_hello\r\nworld_padding_padding";
589 let offsets = vec![8, 13, 14, 16, 36]; let result = compute_indices(source, &offsets);
591
592 assert_eq!(
593 result[0],
594 TextIndex {
595 utf8: 8,
596 utf16: 8,
597 line: 0,
598 column: 8
599 }
600 );
601 assert_eq!(
602 result[1],
603 TextIndex {
604 utf8: 13,
605 utf16: 13,
606 line: 0,
607 column: 13
608 }
609 );
610 assert_eq!(
611 result[2],
612 TextIndex {
613 utf8: 14,
614 utf16: 14,
615 line: 0,
616 column: 13 }
618 );
619 assert_eq!(
620 result[3],
621 TextIndex {
622 utf8: 16,
623 utf16: 16,
624 line: 1,
625 column: 1
626 }
627 );
628 assert_eq!(
629 result[4],
630 TextIndex {
631 utf8: 36,
632 utf16: 36,
633 line: 1,
634 column: 21
635 }
636 );
637 }
638
639 #[test]
640 fn test_compute_indices_duplicate_offsets() {
641 let source = "hello";
642 let offsets = vec![0, 0, 2, 2, 4];
643 let result = compute_indices(source, &offsets);
644
645 assert_eq!(result.len(), 3); assert_eq!(
647 result[0],
648 TextIndex {
649 utf8: 0,
650 utf16: 0,
651 line: 0,
652 column: 0
653 }
654 );
655 assert_eq!(
656 result[1],
657 TextIndex {
658 utf8: 2,
659 utf16: 2,
660 line: 0,
661 column: 2
662 }
663 );
664 assert_eq!(
665 result[2],
666 TextIndex {
667 utf8: 4,
668 utf16: 4,
669 line: 0,
670 column: 4
671 }
672 );
673 }
674
675 #[test]
676 fn test_compute_indices_unicode_line_separators() {
677 let source = "hello\u{2028}world\u{2029}test";
678 let offsets = vec![0, 5, 8, 16]; let result = compute_indices(source, &offsets);
682
683 assert_eq!(result[0], TextIndex::ZERO);
684 assert_eq!(
685 result[1],
686 TextIndex {
687 utf8: 5,
688 utf16: 5,
689 line: 0,
690 column: 5
691 }
692 );
693 assert_eq!(
694 result[2],
695 TextIndex {
696 utf8: 8,
697 utf16: 6,
698 line: 1,
699 column: 0
700 }
701 );
702 assert_eq!(
703 result[3],
704 TextIndex {
705 utf8: 16,
706 utf16: 12,
707 line: 2,
708 column: 0
709 }
710 );
711 }
712
713 #[test]
714 #[should_panic(expected = "source cannot be empty")]
715 fn test_compute_indices_empty_source() {
716 let source = "";
717 let offsets = vec![0];
718 compute_indices(source, &offsets);
719 }
720}