lsp_textdocument/
text_document.rs

1use lsp_types::{Position, PositionEncodingKind, Range, TextDocumentContentChangeEvent};
2
3#[derive(Debug)]
4pub struct FullTextDocument {
5    language_id: String,
6    version: i32,
7    content: String,
8    encoding: PositionEncodingKind,
9
10    /// The value at index `i` in `line_offsets` is the index into `content`
11    /// that is the start of line `i`. As such, the first element of
12    /// `line_offsets` is always 0.
13    line_offsets: Vec<u32>,
14}
15
16fn computed_line_offsets(text: &str, is_at_line_start: bool, text_offset: Option<u32>) -> Vec<u32> {
17    let text_offset = text_offset.unwrap_or(0);
18    let mut line_offsets = if is_at_line_start {
19        vec![text_offset]
20    } else {
21        vec![]
22    };
23
24    let mut chars = text.char_indices().peekable();
25    while let Some((idx, char)) = chars.next() {
26        let idx: u32 = idx
27            .try_into()
28            .expect("The length of the text involved in the calculation is too long");
29        if char == '\r' && chars.peek() == Some(&(idx as usize + 1, '\n')) {
30            chars.next();
31            line_offsets.push(text_offset + idx + 2);
32        } else if char == '\n' || char == '\r' {
33            line_offsets.push(text_offset + idx + 1);
34        }
35    }
36
37    line_offsets
38}
39
40/// given a string (in UTF-8) and a byte offset, returns the offset in UTF-16 code units
41///
42/// for example, consider a string containing a single 4-byte emoji. 4-byte characters
43/// in UTF-8 are supplementary plane characters that require two UTF-16 code units
44/// (surrogate pairs).
45///
46/// in this example:
47/// - offset 4 returns 2;
48/// - offsets 1, 2 or 3 return 0, because they are not on a character boundary and round down;
49/// - offset 5+ will return 2, the length of the string in UTF-16
50fn line_offset_utf16(line: &str, offset: u32) -> u32 {
51    let mut c = 0;
52    for (idx, char) in line.char_indices() {
53        if idx + char.len_utf8() > offset as usize || idx == offset as usize {
54            break;
55        }
56        c += char.len_utf16() as u32;
57    }
58    c
59}
60
61fn line_offset_utf8(line: &str, offset: u32) -> u32 {
62    offset.min(
63        line
64            .len()
65            .try_into()
66            .expect("The length of the text involved in the calculation is too long"),
67    )
68}
69
70fn line_offset_utf32(line: &str, offset: u32) -> u32 {
71    let mut c = 0;
72    for (idx, ch) in line.char_indices() {
73        if idx + ch.len_utf8() > offset as usize || idx == offset as usize {
74            break;
75        }
76        c += 1;
77    }
78    c
79}
80
81fn units_to_byte_offset_utf16(line: &str, character: u32) -> u32 {
82    let mut c = 0;
83    for (idx, ch) in line.char_indices() {
84        if c == character {
85            return idx as u32;
86        }
87        let next = c + ch.len_utf16() as u32;
88        if next > character {
89            return idx as u32;
90        }
91        c = next;
92    }
93    line.len() as u32
94}
95
96fn units_to_byte_offset_utf8(line: &str, character: u32) -> u32 {
97    character.min(
98        line
99            .len()
100            .try_into()
101            .expect("The length of the text involved in the calculation is too long"),
102    )
103}
104
105fn units_to_byte_offset_utf32(line: &str, character: u32) -> u32 {
106    let mut c = 0;
107    for (idx, _ch) in line.char_indices() {
108        if c == character {
109            return idx as u32;
110        }
111        c += 1;
112        if c > character {
113            return idx as u32;
114        }
115    }
116    line.len() as u32
117}
118
119fn bytes_to_units(line: &str, offset: u32, encoding: &PositionEncodingKind) -> u32 {
120    if encoding == &PositionEncodingKind::UTF8 {
121        line_offset_utf8(line, offset)
122    } else if encoding == &PositionEncodingKind::UTF16 {
123        line_offset_utf16(line, offset)
124    } else {
125        line_offset_utf32(line, offset)
126    }
127}
128
129fn units_to_bytes(line: &str, character: u32, encoding: &PositionEncodingKind) -> u32 {
130    if encoding == &PositionEncodingKind::UTF8 {
131        units_to_byte_offset_utf8(line, character)
132    } else if encoding == &PositionEncodingKind::UTF16 {
133        units_to_byte_offset_utf16(line, character)
134    } else {
135        units_to_byte_offset_utf32(line, character)
136    }
137}
138
139impl FullTextDocument {
140    pub fn new(language_id: String, version: i32, content: String) -> Self {
141        Self::new_with_encoding(language_id, version, content, PositionEncodingKind::UTF16)
142    }
143
144    /// Create a new text document with a specific position encoding
145    ///
146    /// This method allows you to create a text document with a specific position encoding
147    /// for character positions. The encoding determines how character offsets are calculated
148    /// and is important for proper LSP communication between client and server.
149    ///
150    /// Use this method instead of [`new()`](Self::new) when you need to use a position encoding
151    /// other than UTF-16, or when you want to explicitly specify the encoding to match what
152    /// was negotiated with the LSP client during initialization.
153    ///
154    /// # Arguments
155    ///
156    /// * `language_id` - The language identifier for the document (e.g., "rust", "javascript")
157    /// * `version` - The version number of the document
158    /// * `content` - The full text content of the document
159    /// * `encoding` - The position encoding to use. Can be UTF-8, UTF-16, or UTF-32.
160    ///
161    /// # Position Encodings
162    ///
163    /// - **UTF-16**: The default encoding for backward compatibility with LSP 3.16 and earlier.
164    ///   Each UTF-16 code unit counts as one position unit. Use [`new()`](Self::new) for this.
165    /// - **UTF-8**: Each byte counts as one position unit. More efficient for ASCII-heavy text.
166    /// - **UTF-32**: Each Unicode code point counts as one position unit.
167    ///
168    /// # Examples
169    ///
170    /// Basic usage with UTF-16 (default):
171    ///
172    /// ```
173    /// use lsp_textdocument::FullTextDocument;
174    /// use lsp_types::PositionEncodingKind;
175    ///
176    /// let doc = FullTextDocument::new_with_encoding(
177    ///     "rust".to_string(),
178    ///     1,
179    ///     "fn main() {}".to_string(),
180    ///     PositionEncodingKind::UTF16
181    /// );
182    /// ```
183    ///
184    /// Using UTF-8 encoding for better performance with ASCII text:
185    ///
186    /// ```
187    /// use lsp_textdocument::FullTextDocument;
188    /// use lsp_types::PositionEncodingKind;
189    ///
190    /// let doc = FullTextDocument::new_with_encoding(
191    ///     "javascript".to_string(),
192    ///     1,
193    ///     "console.log('Hello');".to_string(),
194    ///     PositionEncodingKind::UTF8
195    /// );
196    /// ```
197    ///
198    /// Using UTF-32 encoding where each Unicode code point is one unit:
199    ///
200    /// ```
201    /// use lsp_textdocument::FullTextDocument;
202    /// use lsp_types::PositionEncodingKind;
203    ///
204    /// let doc = FullTextDocument::new_with_encoding(
205    ///     "plain".to_string(),
206    ///     1,
207    ///     "Hello 🦀 World".to_string(),
208    ///     PositionEncodingKind::UTF32
209    /// );
210    /// ```
211    pub fn new_with_encoding(
212        language_id: String,
213        version: i32,
214        content: String,
215        encoding: PositionEncodingKind,
216    ) -> Self {
217        let line_offsets = computed_line_offsets(&content, true, None);
218        Self {
219            language_id,
220            version,
221            content,
222            encoding,
223            line_offsets,
224        }
225    }
226
227    pub fn update(&mut self, changes: &[TextDocumentContentChangeEvent], version: i32) {
228        for change in changes {
229            let TextDocumentContentChangeEvent { range, text, .. } = change;
230            match range {
231                Some(range) => {
232                    // update content
233                    let Range { start, end } = range;
234                    let (start, start_offset) = self.find_canonical_position(start);
235                    let (end, end_offset) = self.find_canonical_position(end);
236                    assert!(
237                        start_offset <= end_offset,
238                        "Start offset must be less than end offset. {}:{} (offset {}) is not <= {}:{} (offset {})",
239                        start.line, start.character, start_offset,
240                        end.line, end.character, end_offset
241                    );
242                    self.content
243                        .replace_range((start_offset as usize)..(end_offset as usize), text);
244
245                    let (start_line, end_line) = (start.line, end.line);
246                    assert!(start_line <= end_line);
247                    let added_line_offsets = computed_line_offsets(text, false, Some(start_offset));
248                    let num_added_line_offsets = added_line_offsets.len();
249
250                    let splice_start = start_line as usize + 1;
251                    let splice_end = std::cmp::min(end_line, self.line_count() - 1) as usize;
252                    self.line_offsets
253                        .splice(splice_start..=splice_end, added_line_offsets);
254
255                    let diff =
256                        (text.len() as i32).saturating_sub_unsigned(end_offset - start_offset);
257                    if diff != 0 {
258                        for i in
259                            (splice_start + num_added_line_offsets)..(self.line_count() as usize)
260                        {
261                            self.line_offsets[i] = self.line_offsets[i].saturating_add_signed(diff);
262                        }
263                    }
264                }
265                None => {
266                    // Full Text
267                    // update line_offsets
268                    self.line_offsets = computed_line_offsets(text, true, None);
269
270                    // update content
271                    self.content = text.to_owned();
272                }
273            }
274        }
275
276        self.version = version;
277    }
278
279    /// As demonstrated by test_multiple_position_same_offset(), in some cases,
280    /// there are multiple ways to reference the same Position. We map to a
281    /// "canonical Position" so we can avoid worrying about edge cases all over
282    /// the place.
283    fn find_canonical_position(&self, position: &Position) -> (Position, u32) {
284        let offset = self.offset_at(*position);
285        if offset == 0 {
286            (
287                Position {
288                    line: 0,
289                    character: 0,
290                },
291                0,
292            )
293        } else if self.content.as_bytes().get(offset as usize - 1) == Some(&b'\n') {
294            if self.line_offsets[position.line as usize] == offset {
295                (*position, offset)
296            } else if self.line_offsets[position.line as usize + 1] == offset {
297                (
298                    Position {
299                        line: position.line + 1,
300                        character: 0,
301                    },
302                    offset,
303                )
304            } else {
305                panic!(
306                    "Could not determine canonical value for {position:?} in {:?}",
307                    self.content
308                )
309            }
310        } else {
311            (*position, offset)
312        }
313    }
314
315    /// Document's language id
316    pub fn language_id(&self) -> &str {
317        &self.language_id
318    }
319
320    /// Returns the position encoding kind used by this document when
321    /// converting between LSP positions and byte offsets.
322    pub fn encoding(&self) -> PositionEncodingKind {
323        self.encoding.clone()
324    }
325
326    /// Document's version
327    pub fn version(&self) -> i32 {
328        self.version
329    }
330
331    /// Get document content
332    ///
333    /// # Examples
334    ///
335    /// Basic usage:
336    /// ```
337    /// use lsp_textdocument::FullTextDocument;
338    /// use lsp_types::{Range, Position};
339    ///
340    /// let text_documents = FullTextDocument::new("plain_text".to_string(), 1, "hello rust!".to_string());
341    ///
342    /// // get document all content
343    /// let content = text_documents.get_content(None);
344    /// assert_eq!(content, "hello rust!");
345    ///
346    /// // get document specify content by range
347    /// let (start, end) = (Position::new(0, 1), Position::new(0, 9));
348    /// let range = Range::new(start, end);
349    /// let sub_content = text_documents.get_content(Some(range));
350    /// assert_eq!(sub_content, "ello rus");
351    /// ```
352    pub fn get_content(&self, range: Option<Range>) -> &str {
353        match range {
354            Some(Range { start, end }) => {
355                let start = self.offset_at(start);
356                let end = self.offset_at(end).min(self.content_len());
357                self.content.get(start as usize..end as usize).unwrap()
358            }
359            None => &self.content,
360        }
361    }
362
363    fn get_line_and_offset(&self, line: u32) -> Option<(&str, u32)> {
364        self.line_offsets.get(line as usize).map(|&line_offset| {
365            let len: u32 = self.content_len();
366            let eol_offset = self.line_offsets.get((line + 1) as usize).unwrap_or(&len);
367            let line = &self.content[line_offset as usize..*eol_offset as usize];
368            (line, line_offset)
369        })
370    }
371
372    fn get_line(&self, line: u32) -> Option<&str> {
373        self.get_line_and_offset(line).map(|(line, _)| line)
374    }
375
376    /// A amount of document content line
377    pub fn line_count(&self) -> u32 {
378        self.line_offsets
379            .len()
380            .try_into()
381            .expect("The number of lines of text passed in is too long")
382    }
383
384    /// The length of the document content in UTF-8 bytes
385    pub fn content_len(&self) -> u32 {
386        self.content
387            .len()
388            .try_into()
389            .expect("The length of the text passed in is too long")
390    }
391
392    /// Converts a zero-based byte offset in the UTF8-encoded content to a position
393    ///
394    /// the offset is in bytes, the position is in UTF16 code units. rounds down if
395    /// the offset is not on a code unit boundary, or is beyond the end of the
396    /// content.
397    pub fn position_at(&self, offset: u32) -> Position {
398        let offset = offset.min(self.content_len());
399        let line_count = self.line_count();
400        if line_count == 1 {
401            // only one line
402            return Position {
403                line: 0,
404                character: bytes_to_units(self.get_line(0).unwrap(), offset, &self.encoding),
405            };
406        }
407
408        let (mut low, mut high) = (0, line_count);
409        while low < high {
410            let mid = (low + high) / 2;
411            if offset
412                >= *self
413                    .line_offsets
414                    .get(mid as usize)
415                    .expect("Unknown mid value")
416            {
417                low = mid + 1;
418            } else {
419                high = mid;
420            }
421        }
422
423        if low == 0 {
424            // offset is on the first line
425            return Position {
426                line: 0,
427                character: bytes_to_units(self.get_line(0).unwrap(), offset, &self.encoding),
428            };
429        }
430
431        let line = low - 1;
432
433        Position {
434            line,
435            character: bytes_to_units(
436                self.get_line(line).unwrap(),
437                offset - self.line_offsets[line as usize],
438                &self.encoding,
439            ),
440        }
441    }
442
443    /// Converts a position to a zero-based byte offset, suitable for slicing the
444    /// UTF-8 encoded content.
445    pub fn offset_at(&self, position: Position) -> u32 {
446        let Position { line, character } = position;
447        match self.get_line_and_offset(line) {
448            Some((line, offset)) => {
449                offset + units_to_bytes(line, character, &self.encoding)
450            }
451            None => {
452                if line >= self.line_count() {
453                    self.content_len()
454                } else {
455                    0
456                }
457            }
458        }
459    }
460}
461
462#[cfg(test)]
463mod tests {
464    use super::*;
465    use lsp_types::PositionEncodingKind;
466
467    fn full_text_document() -> FullTextDocument {
468        FullTextDocument::new(
469            "js".to_string(),
470            2,
471            "he\nllo\nworld\r\nfoo\rbar".to_string(),
472        )
473    }
474
475    #[test]
476    fn test_offset_at() {
477        let text_document = full_text_document();
478
479        let offset = text_document.offset_at(Position {
480            line: 1,
481            character: 1,
482        });
483        assert_eq!(offset, 4);
484
485        let offset = text_document.offset_at(Position {
486            line: 2,
487            character: 3,
488        });
489        assert_eq!(offset, 10);
490
491        // the `f` in `foo` (\r\n is a single line terminator)
492        let offset = text_document.offset_at(Position {
493            line: 3,
494            character: 1,
495        });
496        assert_eq!(offset, 15);
497    }
498
499    /// basic multilingual plane
500    #[test]
501    fn test_offset_at_bmp() {
502        // Euro symbol
503        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
504
505        let offset = text_document.offset_at(Position {
506            line: 0,
507            // E euro
508            //   ^
509            character: 2,
510        });
511        assert_eq!(offset, 4);
512    }
513
514    /// supplementary multilingual plane, aka surrogate pair
515    #[test]
516    fn test_offset_at_smp() {
517        // Deseret Small Letter Yee
518        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
519        let offset = text_document.offset_at(Position {
520            line: 0,
521            // HL yee
522            //    ^
523            character: 3,
524        });
525        assert_eq!(offset, 5);
526    }
527
528    /// a character beyond the end of the line should clamp to the end of the line
529    #[test]
530    fn test_offset_at_beyond_end_of_line() {
531        let text_document =
532            FullTextDocument::new("js".to_string(), 2, "\u{20AC} abc\nline 2".to_string());
533        // "\u{20AC} abc\nline 2" in UTF-8:
534        // \xE2 \x82 \xAC \x20 \x61 \x62 \x63 \x0A \x6C \x69 \x6E \x65 \x20 \x32
535        // ^ line 1 == 0                           ^ line 2 == 8
536        assert_eq!(text_document.line_offsets, vec![0, 8]);
537
538        let offset = text_document.offset_at(Position {
539            line: 0,
540            character: 100,
541        });
542        assert_eq!(offset, 8);
543    }
544
545    #[test]
546    fn test_utf8_encoding_positions() {
547        let text_document = FullTextDocument::new_with_encoding(
548            "plain".to_string(),
549            1,
550            "\u{1F496}abc".to_string(),
551            PositionEncodingKind::UTF8,
552        );
553
554        // UTF-8 character offset is in bytes; emoji is 4 bytes.
555        assert_eq!(
556            text_document.offset_at(Position {
557                line: 0,
558                character: 4,
559            }),
560            4
561        );
562        assert_eq!(
563            text_document.position_at(4),
564            Position {
565                line: 0,
566                character: 4,
567            }
568        );
569
570        // beyond line end clamps to line end
571        assert_eq!(
572            text_document.offset_at(Position {
573                line: 0,
574                character: 99,
575            }),
576            text_document.content_len()
577        );
578    }
579
580    #[test]
581    fn test_utf32_encoding_positions() {
582        let text_document = FullTextDocument::new_with_encoding(
583            "plain".to_string(),
584            1,
585            "\u{1F496}abc".to_string(),
586            PositionEncodingKind::UTF32,
587        );
588
589        // UTF-32 character offset is in Unicode scalars; emoji counts as 1.
590        assert_eq!(
591            text_document.offset_at(Position {
592                line: 0,
593                character: 1,
594            }),
595            4
596        );
597        assert_eq!(
598            text_document.position_at(4),
599            Position {
600                line: 0,
601                character: 1,
602            }
603        );
604
605        // Position after the emoji should move into ASCII bytes/scalars correctly.
606        assert_eq!(
607            text_document.offset_at(Position {
608                line: 0,
609                character: 3,
610            }),
611            6
612        );
613        assert_eq!(
614            text_document.position_at(6),
615            Position {
616                line: 0,
617                character: 3,
618            }
619        );
620    }
621
622    #[test]
623    fn test_position_at() {
624        let text_document = full_text_document();
625
626        let position = text_document.position_at(5);
627        assert_eq!(
628            position,
629            Position {
630                line: 1,
631                character: 2
632            }
633        );
634
635        let position = text_document.position_at(11);
636        assert_eq!(
637            position,
638            Position {
639                line: 2,
640                character: 4,
641            }
642        );
643
644        let position = text_document.position_at(15);
645        assert_eq!(
646            position,
647            Position {
648                line: 3,
649                character: 1,
650            }
651        );
652
653        let position = text_document.position_at(0);
654        assert_eq!(
655            position,
656            Position {
657                line: 0,
658                character: 0,
659            }
660        );
661    }
662
663    /// basic multilingual plane
664    #[test]
665    fn test_position_at_bmp() {
666        // Euro symbol
667        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
668        let position = text_document.position_at(4);
669        assert_eq!(
670            position,
671            Position {
672                line: 0,
673                // E euro
674                //   ^
675                character: 2,
676            }
677        );
678
679        // multi-line content
680        let text_document =
681            FullTextDocument::new("js".to_string(), 2, "\n\n\u{20AC} euro\n\n".to_string());
682        let position = text_document.position_at(6);
683        assert_eq!(
684            position,
685            Position {
686                line: 2,
687                // E euro
688                //   ^
689                character: 2,
690            }
691        );
692    }
693
694    /// supplementary multilingual plane, aka surrogate pair
695    #[test]
696    fn test_position_at_smp() {
697        // Deseret Small Letter Yee
698        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
699        assert_eq!(
700            text_document.position_at(5),
701            Position {
702                line: 0,
703                // HL yee
704                //    ^
705                character: 3,
706            }
707        );
708
709        // \u{10437} is 4 bytes wide. if not on a char boundary, round down
710        assert_eq!(
711            text_document.position_at(2),
712            Position {
713                line: 0,
714                character: 0,
715            }
716        );
717
718        // multi-line content
719        let text_document =
720            FullTextDocument::new("js".to_string(), 2, "\n\n\u{10437} yee\n\n".to_string());
721        let position = text_document.position_at(7);
722        assert_eq!(
723            position,
724            Position {
725                line: 2,
726                // HL yee
727                //    ^
728                character: 3,
729            }
730        );
731    }
732
733    /// https://github.com/GiveMe-A-Name/lsp-textdocument/issues/53
734    #[test]
735    fn test_position_at_line_head() {
736        let text_document = FullTextDocument::new("js".to_string(), 2, "\nyee\n\n".to_string());
737        let position = text_document.position_at(1);
738        assert_eq!(
739            position,
740            Position {
741                line: 1,
742                character: 0,
743            }
744        );
745    }
746
747    #[test]
748    fn test_get_content() {
749        let text_document = full_text_document();
750
751        let start = Position {
752            line: 0,
753            character: 0,
754        };
755        let end = Position {
756            line: 1,
757            character: 2,
758        };
759        let range = Range { start, end };
760        let content = text_document.get_content(Some(range));
761        assert_eq!(content, "he\nll");
762
763        let end = Position {
764            line: 100,
765            character: 100,
766        };
767        let range = Range { start, end };
768        let content = text_document.get_content(Some(range));
769        assert_eq!(content, text_document.content);
770
771        let range = Range {
772            start: Position {
773                line: 1,
774                character: 0,
775            },
776            end: Position {
777                line: 2,
778                character: 3,
779            },
780        };
781        let content = text_document.get_content(Some(range));
782        assert_eq!(content, "llo\nwor");
783    }
784
785    /// basic multilingual plane
786    #[test]
787    fn test_get_content_bmp() {
788        // Euro symbol
789        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{20AC} euro".to_string());
790
791        // Euro symbol is 1 UTF16 code unit wide
792        let range = Range {
793            start: Position {
794                line: 0,
795                character: 0,
796            },
797            end: Position {
798                line: 0,
799                character: 1,
800            },
801        };
802        let content = text_document.get_content(Some(range));
803        assert_eq!(content, "\u{20AC}");
804
805        // E euro
806        //   ^
807        let range = Range {
808            start: Position {
809                line: 0,
810                character: 2,
811            },
812            end: Position {
813                line: 0,
814                character: 3,
815            },
816        };
817        let content = text_document.get_content(Some(range));
818        assert_eq!(content, "e");
819    }
820
821    /// supplementary multilingual plane, aka surrogate pairs
822    #[test]
823    fn test_get_content_smp() {
824        // Deseret Small Letter Yee
825        let text_document = FullTextDocument::new("js".to_string(), 2, "\u{10437} yee".to_string());
826
827        // surrogate pairs are 2 UTF16 code units wide
828        let range = Range {
829            start: Position {
830                line: 0,
831                character: 0,
832            },
833            end: Position {
834                line: 0,
835                character: 2,
836            },
837        };
838        let content = text_document.get_content(Some(range));
839        assert_eq!(content, "\u{10437}");
840    }
841
842    #[test]
843    fn test_update_full_content() {
844        let mut text_document = full_text_document();
845        let new_text = "hello\n js!";
846
847        text_document.update(
848            &[TextDocumentContentChangeEvent {
849                text: new_text.to_string(),
850                range: None,
851                range_length: None,
852            }],
853            1,
854        );
855
856        assert_eq!(&text_document.content, new_text);
857        assert_eq!(text_document.line_offsets, vec![0, 6]);
858    }
859
860    #[test]
861    fn test_update_part_content() {
862        let mut text_document = full_text_document();
863        assert_eq!(text_document.version(), 2);
864        let new_text = String::from("xx\ny");
865        let range = Range {
866            start: Position {
867                line: 1,
868                character: 0,
869            },
870            end: Position {
871                line: 1,
872                character: 3,
873            },
874        };
875        text_document.update(
876            &[TextDocumentContentChangeEvent {
877                range: Some(range),
878                range_length: None,
879                text: new_text,
880            }],
881            1,
882        );
883
884        assert_eq!(&text_document.content, "he\nxx\ny\nworld\r\nfoo\rbar");
885        assert_eq!(text_document.line_offsets, vec![0, 3, 6, 8, 15, 19]);
886        assert_eq!(text_document.version(), 1)
887    }
888
889    #[test]
890    fn test_update_new_content_at_end() {
891        let mut text_document = full_text_document();
892        let new_text = String::from("bar\nbaz");
893
894        let range = Range {
895            start: Position {
896                line: 4,
897                character: 0,
898            },
899            end: Position {
900                line: 5,
901                character: 0,
902            },
903        };
904        text_document.update(
905            &[TextDocumentContentChangeEvent {
906                range: Some(range),
907                range_length: None,
908                text: new_text,
909            }],
910            1,
911        );
912
913        assert_eq!(&text_document.content, "he\nllo\nworld\r\nfoo\rbar\nbaz");
914        assert_eq!(text_document.line_offsets, vec![0, 3, 7, 14, 18, 22]);
915    }
916
917    #[test]
918    #[should_panic(
919        expected = "Start offset must be less than end offset. 2:0 (offset 7) is not <= 1:0 (offset 3)"
920    )]
921    fn test_update_invalid_range() {
922        let mut text_document = full_text_document();
923        // start is after end
924        let range = Range {
925            start: Position {
926                line: 2,
927                character: 0,
928            },
929            end: Position {
930                line: 1,
931                character: 0,
932            },
933        };
934        text_document.update(
935            &[TextDocumentContentChangeEvent {
936                text: String::from(""),
937                range: Some(range),
938                range_length: Some(0),
939            }],
940            1,
941        );
942    }
943
944    /// It turns out that there are multiple values for Position that can map to
945    /// the same offset following a newline.
946    #[test]
947    fn test_multiple_position_same_offset() {
948        let text_document = full_text_document();
949        let end_of_first_line = Position {
950            line: 0,
951            character: 3,
952        };
953        let start_of_second_line = Position {
954            line: 1,
955            character: 0,
956        };
957        assert_eq!(
958            text_document.offset_at(end_of_first_line),
959            text_document.offset_at(start_of_second_line)
960        );
961
962        let beyond_end_of_first_line = Position {
963            line: 0,
964            character: 10_000,
965        };
966        assert_eq!(
967            text_document.offset_at(beyond_end_of_first_line),
968            text_document.offset_at(start_of_second_line)
969        );
970    }
971
972    #[test]
973    fn test_insert_using_positions_after_newline_at_end_of_line() {
974        let mut doc = FullTextDocument::new(
975            "text".to_string(),
976            0,
977            "0:1332533\n0:1332534\n0:1332535\n0:1332536\n".to_string(),
978        );
979        doc.update(
980            &[TextDocumentContentChangeEvent {
981                range: Some(Range {
982                    // After \n at the end of line 1.
983                    start: Position {
984                        line: 1,
985                        character: 10,
986                    },
987                    // After \n at the end of line 2.
988                    end: Position {
989                        line: 2,
990                        character: 10,
991                    },
992                }),
993                range_length: None,
994                text: "1:6188912\n1:6188913\n1:6188914\n".to_string(),
995            }],
996            1,
997        );
998        assert_eq!(
999            doc.get_content(None),
1000            concat!(
1001                "0:1332533\n0:1332534\n",
1002                "1:6188912\n1:6188913\n1:6188914\n",
1003                "0:1332536\n",
1004            ),
1005        );
1006        assert_eq!(doc.line_offsets, vec!(0, 10, 20, 30, 40, 50, 60));
1007    }
1008
1009    #[test]
1010    fn test_line_offsets() {
1011        let mut doc =
1012            FullTextDocument::new("text".to_string(), 0, "123456789\n123456789\n".to_string());
1013        assert_eq!(doc.line_offsets, vec!(0, 10, 20));
1014        doc.update(
1015            &[TextDocumentContentChangeEvent {
1016                range: Some(Range {
1017                    start: Position {
1018                        line: 1,
1019                        character: 5,
1020                    },
1021                    end: Position {
1022                        line: 1,
1023                        character: 5,
1024                    },
1025                }),
1026                range_length: None,
1027                text: "\nA\nB\nC\n".to_string(),
1028            }],
1029            1,
1030        );
1031        assert_eq!(doc.get_content(None), "123456789\n12345\nA\nB\nC\n6789\n",);
1032        assert_eq!(doc.line_offsets, vec!(0, 10, 16, 18, 20, 22, 27));
1033    }
1034
1035    /// This tests a regression caused by confusing byte and character offsets.
1036    /// When [update] was called on a position whose offset points just after a
1037    /// non-newline when interpreted as bytes, but pointed just after at a
1038    /// newline when interpreted as chars, it led to a crash.
1039    #[test]
1040    fn test_find_canonical_position_regression() {
1041        // \u{20AC} is a single character in utf-16 but 3 bytes in utf-8,
1042        // so the offsets in bytes of everything after it is +2 their offsets
1043        // in characters.
1044        let str = "\u{20AC}456789\n123456789\n";
1045        let mut doc = FullTextDocument::new("text".to_string(), 0, str.to_string());
1046
1047        let pos = Position {
1048            line: 0,
1049            character: 6,
1050        };
1051        let offset = doc.offset_at(pos) as usize;
1052        assert_ne!(str.as_bytes().get(offset - 1), Some(&b'\n'));
1053        assert_eq!(str.chars().nth(offset - 1), Some('\n'));
1054
1055        doc.update(
1056            &[TextDocumentContentChangeEvent {
1057                range: Some(Range {
1058                    start: pos,
1059                    end: Position {
1060                        line: 0,
1061                        character: 7,
1062                    },
1063                }),
1064                range_length: None,
1065                text: "X".to_string(),
1066            }],
1067            1,
1068        );
1069        assert_eq!(doc.get_content(None), "\u{20AC}45678X\n123456789\n",);
1070        assert_eq!(doc.line_offsets, vec!(0, 10, 20));
1071    }
1072}
lsp_textdocument/text_document.rs

lsp_textdocument/
text_document.rs