1use lsp_types::{Position, PositionEncodingKind, Range, TextDocumentContentChangeEvent};
2
3#[derive(Debug)]
4pub struct FullTextDocument {
5 language_id: String,
6 version: i32,
7 content: String,
8 encoding: PositionEncodingKind,
9
10 line_offsets: Vec<u32>,
14}
15
16fn computed_line_offsets(text: &str, is_at_line_start: bool, text_offset: Option<u32>) -> Vec<u32> {
17 let text_offset = text_offset.unwrap_or(0);
18 let mut line_offsets = if is_at_line_start {
19 vec![text_offset]
20 } else {
21 vec![]
22 };
23
24 let mut chars = text.char_indices().peekable();
25 while let Some((idx, char)) = chars.next() {
26 let idx: u32 = idx
27 .try_into()
28 .expect("The length of the text involved in the calculation is too long");
29 if char == '\r' && chars.peek() == Some(&(idx as usize + 1, '\n')) {
30 chars.next();
31 line_offsets.push(text_offset + idx + 2);
32 } else if char == '\n' || char == '\r' {
33 line_offsets.push(text_offset + idx + 1);
34 }
35 }
36
37 line_offsets
38}
39
40fn line_offset_utf16(line: &str, offset: u32) -> u32 {
51 let mut c = 0;
52 for (idx, char) in line.char_indices() {
53 if idx + char.len_utf8() > offset as usize || idx == offset as usize {
54 break;
55 }
56 c += char.len_utf16() as u32;
57 }
58 c
59}
60
61fn line_offset_utf8(line: &str, offset: u32) -> u32 {
62 offset.min(
63 line
64 .len()
65 .try_into()
66 .expect("The length of the text involved in the calculation is too long"),
67 )
68}
69
70fn line_offset_utf32(line: &str, offset: u32) -> u32 {
71 let mut c = 0;
72 for (idx, ch) in line.char_indices() {
73 if idx + ch.len_utf8() > offset as usize || idx == offset as usize {
74 break;
75 }
76 c += 1;
77 }
78 c
79}
80
81fn units_to_byte_offset_utf16(line: &str, character: u32) -> u32 {
82 let mut c = 0;
83 for (idx, ch) in line.char_indices() {
84 if c == character {
85 return idx as u32;
86 }
87 let next = c + ch.len_utf16() as u32;
88 if next > character {
89 return idx as u32;
90 }
91 c = next;
92 }
93 line.len() as u32
94}
95
96fn units_to_byte_offset_utf8(line: &str, character: u32) -> u32 {
97 character.min(
98 line
99 .len()
100 .try_into()
101 .expect("The length of the text involved in the calculation is too long"),
102 )
103}
104
105fn units_to_byte_offset_utf32(line: &str, character: u32) -> u32 {
106 let mut c = 0;
107 for (idx, _ch) in line.char_indices() {
108 if c == character {
109 return idx as u32;
110 }
111 c += 1;
112 if c > character {
113 return idx as u32;
114 }
115 }
116 line.len() as u32
117}
118
119fn bytes_to_units(line: &str, offset: u32, encoding: &PositionEncodingKind) -> u32 {
120 if encoding == &PositionEncodingKind::UTF8 {
121 line_offset_utf8(line, offset)
122 } else if encoding == &PositionEncodingKind::UTF16 {
123 line_offset_utf16(line, offset)
124 } else {
125 line_offset_utf32(line, offset)
126 }
127}
128
129fn units_to_bytes(line: &str, character: u32, encoding: &PositionEncodingKind) -> u32 {
130 if encoding == &PositionEncodingKind::UTF8 {
131 units_to_byte_offset_utf8(line, character)
132 } else if encoding == &PositionEncodingKind::UTF16 {
133 units_to_byte_offset_utf16(line, character)
134 } else {
135 units_to_byte_offset_utf32(line, character)
136 }
137}
138
139impl FullTextDocument {
140 pub fn new(language_id: String, version: i32, content: String) -> Self {
141 Self::new_with_encoding(language_id, version, content, PositionEncodingKind::UTF16)
142 }
143
144 pub fn new_with_encoding(
212 language_id: String,
213 version: i32,
214 content: String,
215 encoding: PositionEncodingKind,
216 ) -> Self {
217 let line_offsets = computed_line_offsets(&content, true, None);
218 Self {
219 language_id,
220 version,
221 content,
222 encoding,
223 line_offsets,
224 }
225 }
226
227 pub fn update(&mut self, changes: &[TextDocumentContentChangeEvent], version: i32) {
228 for change in changes {
229 let TextDocumentContentChangeEvent { range, text, .. } = change;
230 match range {
231 Some(range) => {
232 let Range { start, end } = range;
234 let (start, start_offset) = self.find_canonical_position(start);
235 let (end, end_offset) = self.find_canonical_position(end);
236 assert!(
237 start_offset <= end_offset,
238 "Start offset must be less than end offset. {}:{} (offset {}) is not <= {}:{} (offset {})",
239 start.line, start.character, start_offset,
240 end.line, end.character, end_offset
241 );
242 self.content
243 .replace_range((start_offset as usize)..(end_offset as usize), text);
244
245 let (start_line, end_line) = (start.line, end.line);
246 assert!(start_line <= end_line);
247 let added_line_offsets = computed_line_offsets(text, false, Some(start_offset));
248 let num_added_line_offsets = added_line_offsets.len();
249
250 let splice_start = start_line as usize + 1;
251 let splice_end = std::cmp::min(end_line, self.line_count() - 1) as usize;
252 self.line_offsets
253 .splice(splice_start..=splice_end, added_line_offsets);
254
255 let diff =
256 (text.len() as i32).saturating_sub_unsigned(end_offset - start_offset);
257 if diff != 0 {
258 for i in
259 (splice_start + num_added_line_offsets)..(self.line_count() as usize)
260 {
261 self.line_offsets[i] = self.line_offsets[i].saturating_add_signed(diff);
262 }
263 }
264 }
265 None => {
266 self.line_offsets = computed_line_offsets(text, true, None);
269
270 self.content = text.to_owned();
272 }
273 }
274 }
275
276 self.version = version;
277 }
278
279 fn find_canonical_position(&self, position: &Position) -> (Position, u32) {
284 let offset = self.offset_at(*position);
285 if offset == 0 {
286 (
287 Position {
288 line: 0,
289 character: 0,
290 },
291 0,
292 )
293 } else if self.content.as_bytes().get(offset as usize - 1) == Some(&b'\n') {
294 if self.line_offsets[position.line as usize] == offset {
295 (*position, offset)
296 } else if self.line_offsets[position.line as usize + 1] == offset {
297 (
298 Position {
299 line: position.line + 1,
300 character: 0,
301 },
302 offset,
303 )
304 } else {
305 panic!(
306 "Could not determine canonical value for {position:?} in {:?}",
307 self.content
308 )
309 }
310 } else {
311 (*position, offset)
312 }
313 }
314
315 pub fn language_id(&self) -> &str {
317 &self.language_id
318 }
319
320 pub fn encoding(&self) -> PositionEncodingKind {
323 self.encoding.clone()
324 }
325
326 pub fn version(&self) -> i32 {
328 self.version
329 }
330
331 pub fn get_content(&self, range: Option<Range>) -> &str {
353 match range {
354 Some(Range { start, end }) => {
355 let start = self.offset_at(start);
356 let end = self.offset_at(end).min(self.content_len());
357 self.content.get(start as usize..end as usize).unwrap()
358 }
359 None => &self.content,
360 }
361 }
362
363 fn get_line_and_offset(&self, line: u32) -> Option<(&str, u32)> {
364 self.line_offsets.get(line as usize).map(|&line_offset| {
365 let len: u32 = self.content_len();
366 let eol_offset = self.line_offsets.get((line + 1) as usize).unwrap_or(&len);
367 let line = &self.content[line_offset as usize..*eol_offset as usize];
368 (line, line_offset)
369 })
370 }
371
372 fn get_line(&self, line: u32) -> Option<&str> {
373 self.get_line_and_offset(line).map(|(line, _)| line)
374 }
375
376 pub fn line_count(&self) -> u32 {
378 self.line_offsets
379 .len()
380 .try_into()
381 .expect("The number of lines of text passed in is too long")
382 }
383
384 pub fn content_len(&self) -> u32 {
386 self.content
387 .len()
388 .try_into()
389 .expect("The length of the text passed in is too long")
390 }
391
392 pub fn position_at(&self, offset: u32) -> Position {
398 let offset = offset.min(self.content_len());
399 let line_count = self.line_count();
400 if line_count == 1 {
401 return Position {
403 line: 0,
404 character: bytes_to_units(self.get_line(0).unwrap(), offset, &self.encoding),
405 };
406 }
407
408 let (mut low, mut high) = (0, line_count);
409 while low < high {
410 let mid = (low + high) / 2;
411 if offset
412 >= *self
413 .line_offsets
414 .get(mid as usize)
415 .expect("Unknown mid value")
416 {
417 low = mid + 1;
418 } else {
419 high = mid;
420 }
421 }
422
423 if low == 0 {
424 return Position {
426 line: 0,
427 character: bytes_to_units(self.get_line(0).unwrap(), offset, &self.encoding),
428 };
429 }
430
431 let line = low - 1;
432
433 Position {
434 line,
435 character: bytes_to_units(
436 self.get_line(line).unwrap(),
437 offset - self.line_offsets[line as usize],
438 &self.encoding,
439 ),
440 }
441 }
442
443 pub fn offset_at(&self, position: Position) -> u32 {
446 let Position { line, character } = position;
447 match self.get_line_and_offset(line) {
448 Some((line, offset)) => {
449 offset + units_to_bytes(line, character, &self.encoding)
450 }
451 None => {
452 if line >= self.line_count() {
453 self.content_len()
454 } else {
455 0
456 }
457 }
458 }
459 }
460}
461
462#[cfg(test)]
463mod tests {
464 use super::*;
465 use lsp_types::PositionEncodingKind;
466
467 fn full_text_document() -> FullTextDocument {
468 FullTextDocument::new(
469 "js".to_string(),
470 2,
471 "he\nllo\nworld\r\nfoo\rbar".to_string(),
472 )
473 }
474
475 #[test]
476 fn test_offset_at() {
477 let text_document = full_text_document();
478
479 let offset = text_document.offset_at(Position {
480 line: 1,
481 character: 1,
482 });
483 assert_eq!(offset, 4);
484
485 let offset = text_document.offset_at(Position {
486 line: 2,
487 character: 3,
488 });
489 assert_eq!(offset, 10);
490
491 let offset = text_document.offset_at(Position {
493 line: 3,
494 character: 1,
495 });
496 assert_eq!(offset, 15);
497 }
498
499 #[test]
501 fn test_offset_at_bmp() {
502 let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
504
505 let offset = text_document.offset_at(Position {
506 line: 0,
507 character: 2,
510 });
511 assert_eq!(offset, 4);
512 }
513
514 #[test]
516 fn test_offset_at_smp() {
517 let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
519 let offset = text_document.offset_at(Position {
520 line: 0,
521 character: 3,
524 });
525 assert_eq!(offset, 5);
526 }
527
528 #[test]
530 fn test_offset_at_beyond_end_of_line() {
531 let text_document =
532 FullTextDocument::new("js".to_string(), 2, "\u{20AC} abc\nline 2".to_string());
533 assert_eq!(text_document.line_offsets, vec![0, 8]);
537
538 let offset = text_document.offset_at(Position {
539 line: 0,
540 character: 100,
541 });
542 assert_eq!(offset, 8);
543 }
544
545 #[test]
546 fn test_utf8_encoding_positions() {
547 let text_document = FullTextDocument::new_with_encoding(
548 "plain".to_string(),
549 1,
550 "\u{1F496}abc".to_string(),
551 PositionEncodingKind::UTF8,
552 );
553
554 assert_eq!(
556 text_document.offset_at(Position {
557 line: 0,
558 character: 4,
559 }),
560 4
561 );
562 assert_eq!(
563 text_document.position_at(4),
564 Position {
565 line: 0,
566 character: 4,
567 }
568 );
569
570 assert_eq!(
572 text_document.offset_at(Position {
573 line: 0,
574 character: 99,
575 }),
576 text_document.content_len()
577 );
578 }
579
580 #[test]
581 fn test_utf32_encoding_positions() {
582 let text_document = FullTextDocument::new_with_encoding(
583 "plain".to_string(),
584 1,
585 "\u{1F496}abc".to_string(),
586 PositionEncodingKind::UTF32,
587 );
588
589 assert_eq!(
591 text_document.offset_at(Position {
592 line: 0,
593 character: 1,
594 }),
595 4
596 );
597 assert_eq!(
598 text_document.position_at(4),
599 Position {
600 line: 0,
601 character: 1,
602 }
603 );
604
605 assert_eq!(
607 text_document.offset_at(Position {
608 line: 0,
609 character: 3,
610 }),
611 6
612 );
613 assert_eq!(
614 text_document.position_at(6),
615 Position {
616 line: 0,
617 character: 3,
618 }
619 );
620 }
621
622 #[test]
623 fn test_position_at() {
624 let text_document = full_text_document();
625
626 let position = text_document.position_at(5);
627 assert_eq!(
628 position,
629 Position {
630 line: 1,
631 character: 2
632 }
633 );
634
635 let position = text_document.position_at(11);
636 assert_eq!(
637 position,
638 Position {
639 line: 2,
640 character: 4,
641 }
642 );
643
644 let position = text_document.position_at(15);
645 assert_eq!(
646 position,
647 Position {
648 line: 3,
649 character: 1,
650 }
651 );
652
653 let position = text_document.position_at(0);
654 assert_eq!(
655 position,
656 Position {
657 line: 0,
658 character: 0,
659 }
660 );
661 }
662
663 #[test]
665 fn test_position_at_bmp() {
666 let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
668 let position = text_document.position_at(4);
669 assert_eq!(
670 position,
671 Position {
672 line: 0,
673 character: 2,
676 }
677 );
678
679 let text_document =
681 FullTextDocument::new("js".to_string(), 2, "\n\n\u{20AC} euro\n\n".to_string());
682 let position = text_document.position_at(6);
683 assert_eq!(
684 position,
685 Position {
686 line: 2,
687 character: 2,
690 }
691 );
692 }
693
694 #[test]
696 fn test_position_at_smp() {
697 let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
699 assert_eq!(
700 text_document.position_at(5),
701 Position {
702 line: 0,
703 character: 3,
706 }
707 );
708
709 assert_eq!(
711 text_document.position_at(2),
712 Position {
713 line: 0,
714 character: 0,
715 }
716 );
717
718 let text_document =
720 FullTextDocument::new("js".to_string(), 2, "\n\n\u{10437} yee\n\n".to_string());
721 let position = text_document.position_at(7);
722 assert_eq!(
723 position,
724 Position {
725 line: 2,
726 character: 3,
729 }
730 );
731 }
732
733 #[test]
735 fn test_position_at_line_head() {
736 let text_document = FullTextDocument::new("js".to_string(), 2, "\nyee\n\n".to_string());
737 let position = text_document.position_at(1);
738 assert_eq!(
739 position,
740 Position {
741 line: 1,
742 character: 0,
743 }
744 );
745 }
746
747 #[test]
748 fn test_get_content() {
749 let text_document = full_text_document();
750
751 let start = Position {
752 line: 0,
753 character: 0,
754 };
755 let end = Position {
756 line: 1,
757 character: 2,
758 };
759 let range = Range { start, end };
760 let content = text_document.get_content(Some(range));
761 assert_eq!(content, "he\nll");
762
763 let end = Position {
764 line: 100,
765 character: 100,
766 };
767 let range = Range { start, end };
768 let content = text_document.get_content(Some(range));
769 assert_eq!(content, text_document.content);
770
771 let range = Range {
772 start: Position {
773 line: 1,
774 character: 0,
775 },
776 end: Position {
777 line: 2,
778 character: 3,
779 },
780 };
781 let content = text_document.get_content(Some(range));
782 assert_eq!(content, "llo\nwor");
783 }
784
785 #[test]
787 fn test_get_content_bmp() {
788 let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
790
791 let range = Range {
793 start: Position {
794 line: 0,
795 character: 0,
796 },
797 end: Position {
798 line: 0,
799 character: 1,
800 },
801 };
802 let content = text_document.get_content(Some(range));
803 assert_eq!(content, "\u{20AC}");
804
805 let range = Range {
808 start: Position {
809 line: 0,
810 character: 2,
811 },
812 end: Position {
813 line: 0,
814 character: 3,
815 },
816 };
817 let content = text_document.get_content(Some(range));
818 assert_eq!(content, "e");
819 }
820
821 #[test]
823 fn test_get_content_smp() {
824 let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
826
827 let range = Range {
829 start: Position {
830 line: 0,
831 character: 0,
832 },
833 end: Position {
834 line: 0,
835 character: 2,
836 },
837 };
838 let content = text_document.get_content(Some(range));
839 assert_eq!(content, "\u{10437}");
840 }
841
842 #[test]
843 fn test_update_full_content() {
844 let mut text_document = full_text_document();
845 let new_text = "hello\n js!";
846
847 text_document.update(
848 &[TextDocumentContentChangeEvent {
849 text: new_text.to_string(),
850 range: None,
851 range_length: None,
852 }],
853 1,
854 );
855
856 assert_eq!(&text_document.content, new_text);
857 assert_eq!(text_document.line_offsets, vec![0, 6]);
858 }
859
860 #[test]
861 fn test_update_part_content() {
862 let mut text_document = full_text_document();
863 assert_eq!(text_document.version(), 2);
864 let new_text = String::from("xx\ny");
865 let range = Range {
866 start: Position {
867 line: 1,
868 character: 0,
869 },
870 end: Position {
871 line: 1,
872 character: 3,
873 },
874 };
875 text_document.update(
876 &[TextDocumentContentChangeEvent {
877 range: Some(range),
878 range_length: None,
879 text: new_text,
880 }],
881 1,
882 );
883
884 assert_eq!(&text_document.content, "he\nxx\ny\nworld\r\nfoo\rbar");
885 assert_eq!(text_document.line_offsets, vec![0, 3, 6, 8, 15, 19]);
886 assert_eq!(text_document.version(), 1)
887 }
888
889 #[test]
890 fn test_update_new_content_at_end() {
891 let mut text_document = full_text_document();
892 let new_text = String::from("bar\nbaz");
893
894 let range = Range {
895 start: Position {
896 line: 4,
897 character: 0,
898 },
899 end: Position {
900 line: 5,
901 character: 0,
902 },
903 };
904 text_document.update(
905 &[TextDocumentContentChangeEvent {
906 range: Some(range),
907 range_length: None,
908 text: new_text,
909 }],
910 1,
911 );
912
913 assert_eq!(&text_document.content, "he\nllo\nworld\r\nfoo\rbar\nbaz");
914 assert_eq!(text_document.line_offsets, vec![0, 3, 7, 14, 18, 22]);
915 }
916
917 #[test]
918 #[should_panic(
919 expected = "Start offset must be less than end offset. 2:0 (offset 7) is not <= 1:0 (offset 3)"
920 )]
921 fn test_update_invalid_range() {
922 let mut text_document = full_text_document();
923 let range = Range {
925 start: Position {
926 line: 2,
927 character: 0,
928 },
929 end: Position {
930 line: 1,
931 character: 0,
932 },
933 };
934 text_document.update(
935 &[TextDocumentContentChangeEvent {
936 text: String::from(""),
937 range: Some(range),
938 range_length: Some(0),
939 }],
940 1,
941 );
942 }
943
944 #[test]
947 fn test_multiple_position_same_offset() {
948 let text_document = full_text_document();
949 let end_of_first_line = Position {
950 line: 0,
951 character: 3,
952 };
953 let start_of_second_line = Position {
954 line: 1,
955 character: 0,
956 };
957 assert_eq!(
958 text_document.offset_at(end_of_first_line),
959 text_document.offset_at(start_of_second_line)
960 );
961
962 let beyond_end_of_first_line = Position {
963 line: 0,
964 character: 10_000,
965 };
966 assert_eq!(
967 text_document.offset_at(beyond_end_of_first_line),
968 text_document.offset_at(start_of_second_line)
969 );
970 }
971
972 #[test]
973 fn test_insert_using_positions_after_newline_at_end_of_line() {
974 let mut doc = FullTextDocument::new(
975 "text".to_string(),
976 0,
977 "0:1332533\n0:1332534\n0:1332535\n0:1332536\n".to_string(),
978 );
979 doc.update(
980 &[TextDocumentContentChangeEvent {
981 range: Some(Range {
982 start: Position {
984 line: 1,
985 character: 10,
986 },
987 end: Position {
989 line: 2,
990 character: 10,
991 },
992 }),
993 range_length: None,
994 text: "1:6188912\n1:6188913\n1:6188914\n".to_string(),
995 }],
996 1,
997 );
998 assert_eq!(
999 doc.get_content(None),
1000 concat!(
1001 "0:1332533\n0:1332534\n",
1002 "1:6188912\n1:6188913\n1:6188914\n",
1003 "0:1332536\n",
1004 ),
1005 );
1006 assert_eq!(doc.line_offsets, vec!(0, 10, 20, 30, 40, 50, 60));
1007 }
1008
1009 #[test]
1010 fn test_line_offsets() {
1011 let mut doc =
1012 FullTextDocument::new("text".to_string(), 0, "123456789\n123456789\n".to_string());
1013 assert_eq!(doc.line_offsets, vec!(0, 10, 20));
1014 doc.update(
1015 &[TextDocumentContentChangeEvent {
1016 range: Some(Range {
1017 start: Position {
1018 line: 1,
1019 character: 5,
1020 },
1021 end: Position {
1022 line: 1,
1023 character: 5,
1024 },
1025 }),
1026 range_length: None,
1027 text: "\nA\nB\nC\n".to_string(),
1028 }],
1029 1,
1030 );
1031 assert_eq!(doc.get_content(None), "123456789\n12345\nA\nB\nC\n6789\n",);
1032 assert_eq!(doc.line_offsets, vec!(0, 10, 16, 18, 20, 22, 27));
1033 }
1034
1035 #[test]
1040 fn test_find_canonical_position_regression() {
1041 let str = "\u{20AC}456789\n123456789\n";
1045 let mut doc = FullTextDocument::new("text".to_string(), 0, str.to_string());
1046
1047 let pos = Position {
1048 line: 0,
1049 character: 6,
1050 };
1051 let offset = doc.offset_at(pos) as usize;
1052 assert_ne!(str.as_bytes().get(offset - 1), Some(&b'\n'));
1053 assert_eq!(str.chars().nth(offset - 1), Some('\n'));
1054
1055 doc.update(
1056 &[TextDocumentContentChangeEvent {
1057 range: Some(Range {
1058 start: pos,
1059 end: Position {
1060 line: 0,
1061 character: 7,
1062 },
1063 }),
1064 range_length: None,
1065 text: "X".to_string(),
1066 }],
1067 1,
1068 );
1069 assert_eq!(doc.get_content(None), "\u{20AC}45678X\n123456789\n",);
1070 assert_eq!(doc.line_offsets, vec!(0, 10, 20));
1071 }
1072}