lsp_textdocument/
text_document.rs

1use lsp_types::{Position, Range, TextDocumentContentChangeEvent};
2
3#[derive(Debug)]
4pub struct FullTextDocument {
5    language_id: String,
6    version: i32,
7    content: String,
8
9    /// The value at index `i` in `line_offsets` is the index into `content`
10    /// that is the start of line `i`. As such, the first element of
11    /// `line_offsets` is always 0.
12    line_offsets: Vec<u32>,
13}
14
15fn computed_line_offsets(text: &str, is_at_line_start: bool, text_offset: Option<u32>) -> Vec<u32> {
16    let text_offset = text_offset.unwrap_or(0);
17    let mut line_offsets = if is_at_line_start {
18        vec![text_offset]
19    } else {
20        vec![]
21    };
22
23    let mut chars = text.char_indices().peekable();
24    while let Some((idx, char)) = chars.next() {
25        let idx: u32 = idx
26            .try_into()
27            .expect("The length of the text involved in the calculation is too long");
28        if char == '\r' && chars.peek() == Some(&(idx as usize + 1, '\n')) {
29            chars.next();
30            line_offsets.push(text_offset + idx + 2);
31        } else if char == '\n' || char == '\r' {
32            line_offsets.push(text_offset + idx + 1);
33        }
34    }
35
36    line_offsets
37}
38
39/// given a string (in UTF-8) and a byte offset, returns the offset in UTF-16 code units
40///
41/// for example, consider a string containing a single 4-byte emoji. 4-byte characters
42/// in UTF-8 are supplementary plane characters that require two UTF-16 code units
43/// (surrogate pairs).
44///
45/// in this example:
46/// - offset 4 returns 2;
47/// - offsets 1, 2 or 3 return 0, because they are not on a character boundary and round down;
48/// - offset 5+ will return 2, the length of the string in UTF-16
49fn line_offset_utf16(line: &str, offset: u32) -> u32 {
50    let mut c = 0;
51    for (idx, char) in line.char_indices() {
52        if idx + char.len_utf8() > offset as usize || idx == offset as usize {
53            break;
54        }
55        c += char.len_utf16() as u32;
56    }
57    c
58}
59
60impl FullTextDocument {
61    pub fn new(language_id: String, version: i32, content: String) -> Self {
62        let line_offsets = computed_line_offsets(&content, true, None);
63        Self {
64            language_id,
65            version,
66            content,
67            line_offsets,
68        }
69    }
70
71    pub fn update(&mut self, changes: &[TextDocumentContentChangeEvent], version: i32) {
72        for change in changes {
73            let TextDocumentContentChangeEvent { range, text, .. } = change;
74            match range {
75                Some(range) => {
76                    // update content
77                    let Range { start, end } = range;
78                    let (start, start_offset) = self.find_canonical_position(start);
79                    let (end, end_offset) = self.find_canonical_position(end);
80                    assert!(
81                        start_offset <= end_offset,
82                        "Start offset must be less than end offset. {}:{} (offset {}) is not <= {}:{} (offset {})",
83                        start.line, start.character, start_offset,
84                        end.line, end.character, end_offset
85                    );
86                    self.content
87                        .replace_range((start_offset as usize)..(end_offset as usize), text);
88
89                    let (start_line, end_line) = (start.line, end.line);
90                    assert!(start_line <= end_line);
91                    let added_line_offsets = computed_line_offsets(text, false, Some(start_offset));
92                    let num_added_line_offsets = added_line_offsets.len();
93
94                    let splice_start = start_line as usize + 1;
95                    let splice_end = std::cmp::min(end_line, self.line_count() - 1) as usize;
96                    self.line_offsets
97                        .splice(splice_start..=splice_end, added_line_offsets);
98
99                    let diff =
100                        (text.len() as i32).saturating_sub_unsigned(end_offset - start_offset);
101                    if diff != 0 {
102                        for i in
103                            (splice_start + num_added_line_offsets)..(self.line_count() as usize)
104                        {
105                            self.line_offsets[i] = self.line_offsets[i].saturating_add_signed(diff);
106                        }
107                    }
108                }
109                None => {
110                    // Full Text
111                    // update line_offsets
112                    self.line_offsets = computed_line_offsets(text, true, None);
113
114                    // update content
115                    self.content = text.to_owned();
116                }
117            }
118        }
119
120        self.version = version;
121    }
122
123    /// As demonstrated by test_multiple_position_same_offset(), in some cases,
124    /// there are multiple ways to reference the same Position. We map to a
125    /// "canonical Position" so we can avoid worrying about edge cases all over
126    /// the place.
127    fn find_canonical_position(&self, position: &Position) -> (Position, u32) {
128        let offset = self.offset_at(*position);
129        if offset == 0 {
130            (
131                Position {
132                    line: 0,
133                    character: 0,
134                },
135                0,
136            )
137        } else if self.content.as_bytes().get(offset as usize - 1) == Some(&b'\n') {
138            if self.line_offsets[position.line as usize] == offset {
139                (*position, offset)
140            } else if self.line_offsets[position.line as usize + 1] == offset {
141                (
142                    Position {
143                        line: position.line + 1,
144                        character: 0,
145                    },
146                    offset,
147                )
148            } else {
149                panic!(
150                    "Could not determine canonical value for {position:?} in {:?}",
151                    self.content
152                )
153            }
154        } else {
155            (*position, offset)
156        }
157    }
158
159    /// Document's language id
160    pub fn language_id(&self) -> &str {
161        &self.language_id
162    }
163
164    /// Document's version
165    pub fn version(&self) -> i32 {
166        self.version
167    }
168
169    /// Get document content
170    ///
171    /// # Examples
172    ///
173    /// Basic usage:
174    /// ```
175    /// use lsp_textdocument::FullTextDocument;
176    /// use lsp_types::{Range, Position};
177    ///
178    /// let text_documents = FullTextDocument::new("plain_text".to_string(), 1, "hello rust!".to_string());
179    ///
180    /// // get document all content
181    /// let content = text_documents.get_content(None);
182    /// assert_eq!(content, "hello rust!");
183    ///
184    /// // get document specify content by range
185    /// let (start, end) = (Position::new(0, 1), Position::new(0, 9));
186    /// let range = Range::new(start, end);
187    /// let sub_content = text_documents.get_content(Some(range));
188    /// assert_eq!(sub_content, "ello rus");
189    /// ```
190    pub fn get_content(&self, range: Option<Range>) -> &str {
191        match range {
192            Some(Range { start, end }) => {
193                let start = self.offset_at(start);
194                let end = self.offset_at(end).min(self.content_len());
195                self.content.get(start as usize..end as usize).unwrap()
196            }
197            None => &self.content,
198        }
199    }
200
201    fn get_line_and_offset(&self, line: u32) -> Option<(&str, u32)> {
202        self.line_offsets.get(line as usize).map(|&line_offset| {
203            let len: u32 = self.content_len();
204            let eol_offset = self.line_offsets.get((line + 1) as usize).unwrap_or(&len);
205            let line = &self.content[line_offset as usize..*eol_offset as usize];
206            (line, line_offset)
207        })
208    }
209
210    fn get_line(&self, line: u32) -> Option<&str> {
211        self.get_line_and_offset(line).map(|(line, _)| line)
212    }
213
214    /// A amount of document content line
215    pub fn line_count(&self) -> u32 {
216        self.line_offsets
217            .len()
218            .try_into()
219            .expect("The number of lines of text passed in is too long")
220    }
221
222    /// The length of the document content in UTF-8 bytes
223    pub fn content_len(&self) -> u32 {
224        self.content
225            .len()
226            .try_into()
227            .expect("The length of the text passed in is too long")
228    }
229
230    /// Converts a zero-based byte offset in the UTF8-encoded content to a position
231    ///
232    /// the offset is in bytes, the position is in UTF16 code units. rounds down if
233    /// the offset is not on a code unit boundary, or is beyond the end of the
234    /// content.
235    pub fn position_at(&self, offset: u32) -> Position {
236        let offset = offset.min(self.content_len());
237        let line_count = self.line_count();
238        if line_count == 1 {
239            // only one line
240            return Position {
241                line: 0,
242                character: line_offset_utf16(self.get_line(0).unwrap(), offset),
243            };
244        }
245
246        let (mut low, mut high) = (0, line_count);
247        while low < high {
248            let mid = (low + high) / 2;
249            if offset
250                >= *self
251                    .line_offsets
252                    .get(mid as usize)
253                    .expect("Unknown mid value")
254            {
255                low = mid + 1;
256            } else {
257                high = mid;
258            }
259        }
260
261        if low == 0 {
262            // offset is on the first line
263            return Position {
264                line: 0,
265                character: line_offset_utf16(self.get_line(0).unwrap(), offset),
266            };
267        }
268
269        let line = low - 1;
270
271        Position {
272            line,
273            character: line_offset_utf16(
274                self.get_line(line).unwrap(),
275                offset - self.line_offsets[line as usize],
276            ),
277        }
278    }
279
280    /// Converts a position to a zero-based byte offset, suitable for slicing the
281    /// UTF-8 encoded content.
282    pub fn offset_at(&self, position: Position) -> u32 {
283        let Position { line, character } = position;
284        match self.get_line_and_offset(line) {
285            Some((line, offset)) => {
286                let mut c = 0;
287                let iter = line.char_indices();
288                for (idx, char) in iter {
289                    if c == character {
290                        return offset + idx as u32;
291                    }
292                    c += char.len_utf16() as u32;
293                }
294                offset + line.len() as u32
295            }
296            None => {
297                if line >= self.line_count() {
298                    self.content_len()
299                } else {
300                    0
301                }
302            }
303        }
304    }
305}
306
307#[cfg(test)]
308mod tests {
309    use super::*;
310
311    fn full_text_document() -> FullTextDocument {
312        FullTextDocument::new(
313            "js".to_string(),
314            2,
315            "he\nllo\nworld\r\nfoo\rbar".to_string(),
316        )
317    }
318
319    #[test]
320    fn test_offset_at() {
321        let text_document = full_text_document();
322
323        let offset = text_document.offset_at(Position {
324            line: 1,
325            character: 1,
326        });
327        assert_eq!(offset, 4);
328
329        let offset = text_document.offset_at(Position {
330            line: 2,
331            character: 3,
332        });
333        assert_eq!(offset, 10);
334
335        // the `f` in `foo` (\r\n is a single line terminator)
336        let offset = text_document.offset_at(Position {
337            line: 3,
338            character: 1,
339        });
340        assert_eq!(offset, 15);
341    }
342
343    /// basic multilingual plane
344    #[test]
345    fn test_offset_at_bmp() {
346        // Euro symbol
347        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
348
349        let offset = text_document.offset_at(Position {
350            line: 0,
351            // E euro
352            //   ^
353            character: 2,
354        });
355        assert_eq!(offset, 4);
356    }
357
358    /// supplementary multilingual plane, aka surrogate pair
359    #[test]
360    fn test_offset_at_smp() {
361        // Deseret Small Letter Yee
362        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
363        let offset = text_document.offset_at(Position {
364            line: 0,
365            // HL yee
366            //    ^
367            character: 3,
368        });
369        assert_eq!(offset, 5);
370    }
371
372    /// a character beyond the end of the line should clamp to the end of the line
373    #[test]
374    fn test_offset_at_beyond_end_of_line() {
375        let text_document =
376            FullTextDocument::new("js".to_string(), 2, "\u{20AC} abc\nline 2".to_string());
377        // "\u{20AC} abc\nline 2" in UTF-8:
378        // \xE2 \x82 \xAC \x20 \x61 \x62 \x63 \x0A \x6C \x69 \x6E \x65 \x20 \x32
379        // ^ line 1 == 0                           ^ line 2 == 8
380        assert_eq!(text_document.line_offsets, vec![0, 8]);
381
382        let offset = text_document.offset_at(Position {
383            line: 0,
384            character: 100,
385        });
386        assert_eq!(offset, 8);
387    }
388
389    #[test]
390    fn test_position_at() {
391        let text_document = full_text_document();
392
393        let position = text_document.position_at(5);
394        assert_eq!(
395            position,
396            Position {
397                line: 1,
398                character: 2
399            }
400        );
401
402        let position = text_document.position_at(11);
403        assert_eq!(
404            position,
405            Position {
406                line: 2,
407                character: 4,
408            }
409        );
410
411        let position = text_document.position_at(15);
412        assert_eq!(
413            position,
414            Position {
415                line: 3,
416                character: 1,
417            }
418        );
419
420        let position = text_document.position_at(0);
421        assert_eq!(
422            position,
423            Position {
424                line: 0,
425                character: 0,
426            }
427        );
428    }
429
430    /// basic multilingual plane
431    #[test]
432    fn test_position_at_bmp() {
433        // Euro symbol
434        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
435        let position = text_document.position_at(4);
436        assert_eq!(
437            position,
438            Position {
439                line: 0,
440                // E euro
441                //   ^
442                character: 2,
443            }
444        );
445
446        // multi-line content
447        let text_document =
448            FullTextDocument::new("js".to_string(), 2, "\n\n\u{20AC} euro\n\n".to_string());
449        let position = text_document.position_at(6);
450        assert_eq!(
451            position,
452            Position {
453                line: 2,
454                // E euro
455                //   ^
456                character: 2,
457            }
458        );
459    }
460
461    /// supplementary multilingual plane, aka surrogate pair
462    #[test]
463    fn test_position_at_smp() {
464        // Deseret Small Letter Yee
465        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
466        assert_eq!(
467            text_document.position_at(5),
468            Position {
469                line: 0,
470                // HL yee
471                //    ^
472                character: 3,
473            }
474        );
475
476        // \u{10437} is 4 bytes wide. if not on a char boundary, round down
477        assert_eq!(
478            text_document.position_at(2),
479            Position {
480                line: 0,
481                character: 0,
482            }
483        );
484
485        // multi-line content
486        let text_document =
487            FullTextDocument::new("js".to_string(), 2, "\n\n\u{10437} yee\n\n".to_string());
488        let position = text_document.position_at(7);
489        assert_eq!(
490            position,
491            Position {
492                line: 2,
493                // HL yee
494                //    ^
495                character: 3,
496            }
497        );
498    }
499
500    /// https://github.com/GiveMe-A-Name/lsp-textdocument/issues/53
501    #[test]
502    fn test_position_at_line_head() {
503        let text_document = FullTextDocument::new("js".to_string(), 2, "\nyee\n\n".to_string());
504        let position = text_document.position_at(1);
505        assert_eq!(
506            position,
507            Position {
508                line: 1,
509                character: 0,
510            }
511        );
512    }
513
514    #[test]
515    fn test_get_content() {
516        let text_document = full_text_document();
517
518        let start = Position {
519            line: 0,
520            character: 0,
521        };
522        let end = Position {
523            line: 1,
524            character: 2,
525        };
526        let range = Range { start, end };
527        let content = text_document.get_content(Some(range));
528        assert_eq!(content, "he\nll");
529
530        let end = Position {
531            line: 100,
532            character: 100,
533        };
534        let range = Range { start, end };
535        let content = text_document.get_content(Some(range));
536        assert_eq!(content, text_document.content);
537
538        let range = Range {
539            start: Position {
540                line: 1,
541                character: 0,
542            },
543            end: Position {
544                line: 2,
545                character: 3,
546            },
547        };
548        let content = text_document.get_content(Some(range));
549        assert_eq!(content, "llo\nwor");
550    }
551
552    /// basic multilingual plane
553    #[test]
554    fn test_get_content_bmp() {
555        // Euro symbol
556        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
557
558        // Euro symbol is 1 UTF16 code unit wide
559        let range = Range {
560            start: Position {
561                line: 0,
562                character: 0,
563            },
564            end: Position {
565                line: 0,
566                character: 1,
567            },
568        };
569        let content = text_document.get_content(Some(range));
570        assert_eq!(content, "\u{20AC}");
571
572        // E euro
573        //   ^
574        let range = Range {
575            start: Position {
576                line: 0,
577                character: 2,
578            },
579            end: Position {
580                line: 0,
581                character: 3,
582            },
583        };
584        let content = text_document.get_content(Some(range));
585        assert_eq!(content, "e");
586    }
587
588    /// supplementary multilingual plane, aka surrogate pairs
589    #[test]
590    fn test_get_content_smp() {
591        // Deseret Small Letter Yee
592        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
593
594        // surrogate pairs are 2 UTF16 code units wide
595        let range = Range {
596            start: Position {
597                line: 0,
598                character: 0,
599            },
600            end: Position {
601                line: 0,
602                character: 2,
603            },
604        };
605        let content = text_document.get_content(Some(range));
606        assert_eq!(content, "\u{10437}");
607    }
608
609    #[test]
610    fn test_update_full_content() {
611        let mut text_document = full_text_document();
612        let new_text = "hello\n js!";
613
614        text_document.update(
615            &[TextDocumentContentChangeEvent {
616                text: new_text.to_string(),
617                range: None,
618                range_length: None,
619            }],
620            1,
621        );
622
623        assert_eq!(&text_document.content, new_text);
624        assert_eq!(text_document.line_offsets, vec![0, 6]);
625    }
626
627    #[test]
628    fn test_update_part_content() {
629        let mut text_document = full_text_document();
630        assert_eq!(text_document.version(), 2);
631        let new_text = String::from("xx\ny");
632        let range = Range {
633            start: Position {
634                line: 1,
635                character: 0,
636            },
637            end: Position {
638                line: 1,
639                character: 3,
640            },
641        };
642        text_document.update(
643            &[TextDocumentContentChangeEvent {
644                range: Some(range),
645                range_length: None,
646                text: new_text,
647            }],
648            1,
649        );
650
651        assert_eq!(&text_document.content, "he\nxx\ny\nworld\r\nfoo\rbar");
652        assert_eq!(text_document.line_offsets, vec![0, 3, 6, 8, 15, 19]);
653        assert_eq!(text_document.version(), 1)
654    }
655
656    #[test]
657    fn test_update_new_content_at_end() {
658        let mut text_document = full_text_document();
659        let new_text = String::from("bar\nbaz");
660
661        let range = Range {
662            start: Position {
663                line: 4,
664                character: 0,
665            },
666            end: Position {
667                line: 5,
668                character: 0,
669            },
670        };
671        text_document.update(
672            &[TextDocumentContentChangeEvent {
673                range: Some(range),
674                range_length: None,
675                text: new_text,
676            }],
677            1,
678        );
679
680        assert_eq!(&text_document.content, "he\nllo\nworld\r\nfoo\rbar\nbaz");
681        assert_eq!(text_document.line_offsets, vec![0, 3, 7, 14, 18, 22]);
682    }
683
684    #[test]
685    #[should_panic(
686        expected = "Start offset must be less than end offset. 2:0 (offset 7) is not <= 1:0 (offset 3)"
687    )]
688    fn test_update_invalid_range() {
689        let mut text_document = full_text_document();
690        // start is after end
691        let range = Range {
692            start: Position {
693                line: 2,
694                character: 0,
695            },
696            end: Position {
697                line: 1,
698                character: 0,
699            },
700        };
701        text_document.update(
702            &[TextDocumentContentChangeEvent {
703                text: String::from(""),
704                range: Some(range),
705                range_length: Some(0),
706            }],
707            1,
708        );
709    }
710
711    /// It turns out that there are multiple values for Position that can map to
712    /// the same offset following a newline.
713    #[test]
714    fn test_multiple_position_same_offset() {
715        let text_document = full_text_document();
716        let end_of_first_line = Position {
717            line: 0,
718            character: 3,
719        };
720        let start_of_second_line = Position {
721            line: 1,
722            character: 0,
723        };
724        assert_eq!(
725            text_document.offset_at(end_of_first_line),
726            text_document.offset_at(start_of_second_line)
727        );
728
729        let beyond_end_of_first_line = Position {
730            line: 0,
731            character: 10_000,
732        };
733        assert_eq!(
734            text_document.offset_at(beyond_end_of_first_line),
735            text_document.offset_at(start_of_second_line)
736        );
737    }
738
739    #[test]
740    fn test_insert_using_positions_after_newline_at_end_of_line() {
741        let mut doc = FullTextDocument::new(
742            "text".to_string(),
743            0,
744            "0:1332533\n0:1332534\n0:1332535\n0:1332536\n".to_string(),
745        );
746        doc.update(
747            &[TextDocumentContentChangeEvent {
748                range: Some(Range {
749                    // After \n at the end of line 1.
750                    start: Position {
751                        line: 1,
752                        character: 10,
753                    },
754                    // After \n at the end of line 2.
755                    end: Position {
756                        line: 2,
757                        character: 10,
758                    },
759                }),
760                range_length: None,
761                text: "1:6188912\n1:6188913\n1:6188914\n".to_string(),
762            }],
763            1,
764        );
765        assert_eq!(
766            doc.get_content(None),
767            concat!(
768                "0:1332533\n0:1332534\n",
769                "1:6188912\n1:6188913\n1:6188914\n",
770                "0:1332536\n",
771            ),
772        );
773        assert_eq!(doc.line_offsets, vec!(0, 10, 20, 30, 40, 50, 60));
774    }
775
776    #[test]
777    fn test_line_offsets() {
778        let mut doc =
779            FullTextDocument::new("text".to_string(), 0, "123456789\n123456789\n".to_string());
780        assert_eq!(doc.line_offsets, vec!(0, 10, 20));
781        doc.update(
782            &[TextDocumentContentChangeEvent {
783                range: Some(Range {
784                    start: Position {
785                        line: 1,
786                        character: 5,
787                    },
788                    end: Position {
789                        line: 1,
790                        character: 5,
791                    },
792                }),
793                range_length: None,
794                text: "\nA\nB\nC\n".to_string(),
795            }],
796            1,
797        );
798        assert_eq!(doc.get_content(None), "123456789\n12345\nA\nB\nC\n6789\n",);
799        assert_eq!(doc.line_offsets, vec!(0, 10, 16, 18, 20, 22, 27));
800    }
801
802    /// This tests a regression caused by confusing byte and character offsets.
803    /// When [update] was called on a position whose offset points just after a
804    /// non-newline when interpreted as bytes, but pointed just after at a
805    /// newline when interpreted as chars, it led to a crash.
806    #[test]
807    fn test_find_canonical_position_regression() {
808        // \u{20AC} is a single character in utf-16 but 3 bytes in utf-8,
809        // so the offsets in bytes of everything after it is +2 their offsets
810        // in characters.
811        let str = "\u{20AC}456789\n123456789\n";
812        let mut doc = FullTextDocument::new("text".to_string(), 0, str.to_string());
813
814        let pos = Position {
815            line: 0,
816            character: 6,
817        };
818        let offset = doc.offset_at(pos) as usize;
819        assert_ne!(str.as_bytes().get(offset - 1), Some(&b'\n'));
820        assert_eq!(str.chars().nth(offset - 1), Some('\n'));
821
822        doc.update(
823            &[TextDocumentContentChangeEvent {
824                range: Some(Range {
825                    start: pos,
826                    end: Position {
827                        line: 0,
828                        character: 7,
829                    },
830                }),
831                range_length: None,
832                text: "X".to_string(),
833            }],
834            1,
835        );
836        assert_eq!(doc.get_content(None), "\u{20AC}45678X\n123456789\n",);
837        assert_eq!(doc.line_offsets, vec!(0, 10, 20));
838    }
839}