lsp_document/
lib.rs

1//! Helpers to convert between LSP representations of text documents and Rust
2//! strings.
3//!
4//! ## Motivation:
5//! LSP uses UTF16-encoded strings while Rust's strings are UTF8-encoded. This
6//! means that text offsets in LSP and in Rust are different:
7//! - LSP offsets are in 16-bit code-units and each character is either 1 or 2 of those,
8//! - Rust strings are indexed in bytes and each character takes from 1 to 4 bytes.
9//!
10//! To ensure that LSP client and server "talk" about the same part of a text
11//! document we need a translation layer.
12//!
13//! ## Structure
14//! There are two traits that define the basic functionality on text documents:
15//! - [`TextMap`] defines operations to convert between byte offsets and [`Pos`]'s
16//!   inside a UTF8-encoded string.
17//! - [`TextAdapter`] defines operations to convert between LSP positions, native
18//!   positions, and derived types.
19//!
20//! The work-horse struct that implements both of these traits is
21//! [`IndexedText`]. It wraps the original text and can act as a replacement for
22//! [`String`] where relevant. The struct is generic in the type of text it
23//! wraps, however, so depending on the use-case it can be either:
24//! - `IndexedText<&str>` when you don't really need an ownership of the original
25//!   text, or
26//! - `IndexedText<Arc<str>>` otherwise.
27//!
28//! ## Example usage
29//!
30//! Below is a an example where the original text is `&'static str`.
31//!
32//! ```rust
33//! use lsp_document::{TextMap, TextAdapter, Pos, IndexedText};
34//! use lsp_types::Position;
35//!
36//! // Character width
37//! // U16:     1111111111111 1111111111 1 11 1 1 111111111 21
38//! // U8:      1111111111111 1222122221 1 13 3 3 111111111 41
39//! // U8 offset
40//! //          0         1       2      3       4          5
41//! //          0123456789012 3468013579 0 12 5 8 123456789 04
42//! let text = "Hello, world!\nКак дела?\r\n做得好\nThis is 💣!";
43//! let text = IndexedText::new(text);
44//! //
45//! // Examples of using TextMap methods
46//! //
47//! // Pos of 💣 from its offset
48//! assert_eq!(text.offset_to_pos(50).unwrap(), Pos::new(3, 8));
49//! // Raw line range info
50//! assert_eq!(text.line_range(2).unwrap(), Pos::new(2, 0)..Pos::new(2, 10));
51//! // Extracting part of text between two positions
52//! assert_eq!(text.substr(Pos::new(1, 7)..Pos::new(1, 15)).unwrap(), "дела");
53//!
54//! //
55//! // Example of using TextAdapter methods
56//! //
57//! // Pos of `!` after 💣
58//! assert_eq!(text.lsp_pos_to_pos(&Position::new(3, 10)).unwrap(), Pos::new(3, 12));
59//! assert_eq!(text.pos_to_lsp_pos(&Pos::new(3, 12)).unwrap(), Position::new(3, 10));
60//! ```
61
62use std::{borrow::Borrow, cmp::Ordering, ops::Range};
63
64/// Native position inside a text document/string. Points to a valid position
65/// **before** the character inside a UTF8-encoded string.
66///
67/// ## Why use [`Pos`] instead of raw `usize` offset
68///
69/// This depends on the use-case. Often raw `usize` or a newtype wrapper around
70/// `usize` is sufficient. However, raw byte offsets are not stable at all when a
71/// text document changes.
72///
73/// Usually, a text document is an input to later stages of the pipelines. Let's
74/// take a simple incremental pipeline:
75/// ```text
76/// text: string ->
77///  symbols: Symbol { ..., start: usize, end: usize } ->
78///  diagnostics: Diag { ..., start: usize, end: usize }
79/// ```
80///
81/// Now, any change to `text` on line N will shift all `start` and `end` offsets,
82/// which will invalidate all symbols and diagnostics following the change and
83/// require recomputation.
84///
85/// However, if `start` and `end` are [`Pos`]es then only the line where the
86/// change was made is affected. Symbols and diagnostic for other lines won't be
87/// invalidated.
88#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
89pub struct Pos {
90    /// 0-indexed line inside the text document.
91    pub line: u32,
92    /// 0-indexed byte offset from the beginning of the line.
93    /// The offset is at a valid char boundary.
94    pub col: u32,
95}
96
97impl Pos {
98    /// Create a new [`Pos`]. This method shouldn't be required to use most of
99    /// the time!
100    ///
101    /// `line` is 0-indexed, `col` is a 0-indexed byte-offset from the beginning
102    /// of the line to a **valid char position**.
103    pub fn new(line: u32, col: u32) -> Self {
104        Self { line, col }
105    }
106}
107
108impl PartialOrd for Pos {
109    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
110        Some(self.cmp(other))
111    }
112}
113
114impl Ord for Pos {
115    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
116        let line_cmp = self.line.cmp(&other.line);
117        if line_cmp == Ordering::Equal {
118            self.col.cmp(&other.col)
119        } else {
120            line_cmp
121        }
122    }
123}
124
125/// Native representation of a change that replaces a part of the target text.
126///
127/// Can be converted to and from [`lsp_types::TextDocumentContentChangeEvent`] by
128/// [`TextAdapter`].
129pub struct TextChange {
130    /// Specifies the part of the text that needs to be replaced. When `None` the
131    /// whole text needs to be replaced.
132    pub range: Option<Range<Pos>>,
133    /// The replacement text.
134    pub patch: String,
135}
136
137/// Defines operations to convert between byte offsets and native [`Pos`].
138///
139/// Most operations return an [`Option`] where [`None`] signals that the
140/// conversion wasn't successful.
141pub trait TextMap {
142    fn text(&self) -> &str;
143    fn offset_to_pos(&self, offset: usize) -> Option<Pos>;
144    fn offset_range_to_range(&self, offsets: Range<usize>) -> Option<Range<Pos>> {
145        let start = self.offset_to_pos(offsets.start)?;
146        let end = self.offset_to_pos(offsets.end)?;
147        Some(start..end)
148    }
149    fn line_range(&self, line: u32) -> Option<Range<Pos>>;
150    fn substr(&self, range: Range<Pos>) -> Option<&str>;
151}
152
153/// Defines operations to convert between native text types and [`lsp_types`].
154/// The trait is automatically derived for any type that implements [`TextMap`].
155///
156/// Most operations return an [`Option`] where [`None`] signals that the
157/// conversion wasn't successful.
158pub trait TextAdapter {
159    fn pos_to_lsp_pos(&self, pos: &Pos) -> Option<lsp_types::Position>;
160    fn lsp_pos_to_pos(&self, lsp_pos: &lsp_types::Position) -> Option<Pos>;
161    fn range_to_lsp_range(&self, range: &Range<Pos>) -> Option<lsp_types::Range>;
162    fn lsp_range_to_range(&self, lsp_range: &lsp_types::Range) -> Option<Range<Pos>>;
163    fn change_to_lsp_change(
164        &self,
165        change: TextChange,
166    ) -> Option<lsp_types::TextDocumentContentChangeEvent>;
167    fn lsp_change_to_change(
168        &self,
169        lsp_change: lsp_types::TextDocumentContentChangeEvent,
170    ) -> Option<TextChange>;
171}
172
173impl<T: TextMap> TextAdapter for T {
174    fn pos_to_lsp_pos(&self, pos: &Pos) -> Option<lsp_types::Position> {
175        let line_num = pos.line;
176        let line_range = self.line_range(line_num)?;
177        let line = self.substr(line_range)?;
178
179        let target_u8_offset = pos.col as usize;
180
181        let mut u8_offset: usize = 0;
182        let mut u16_offset: usize = 0;
183        let mut found = false;
184
185        for c in line.chars() {
186            if u8_offset == target_u8_offset {
187                found = true;
188                break;
189            } else {
190                u8_offset += c.len_utf8();
191                u16_offset += c.len_utf16();
192            }
193        }
194
195        // Handle "append"/"after eol" case
196        if !found && u8_offset == target_u8_offset {
197            found = true;
198        }
199
200        assert!(found, "Offset not found in line");
201        Some(lsp_types::Position::new(line_num as u32, u16_offset as u32))
202    }
203
204    fn lsp_pos_to_pos(&self, lsp_pos: &lsp_types::Position) -> Option<Pos> {
205        let line_range = self.line_range(lsp_pos.line)?;
206        let line = self.substr(line_range)?;
207
208        let mut u8_offset: usize = 0;
209        let mut u16_offset: usize = 0;
210        let mut found = false;
211
212        // Handle the case of artificial blank line
213        if lsp_pos.character == 0 {
214            found = true;
215        }
216
217        for c in line.chars() {
218            if u16_offset == lsp_pos.character as usize {
219                found = true;
220                break;
221            } else {
222                u16_offset += c.len_utf16();
223                u8_offset += c.len_utf8();
224            }
225        }
226
227        // Handle "append" case
228        if !found && u16_offset == lsp_pos.character as usize {
229            found = true;
230        }
231
232        assert!(found, "LSP pos not found in line");
233        Some(Pos::new(lsp_pos.line, u8_offset as u32))
234    }
235
236    fn range_to_lsp_range(&self, range: &Range<Pos>) -> Option<lsp_types::Range> {
237        Some(lsp_types::Range::new(
238            self.pos_to_lsp_pos(&range.start)?,
239            self.pos_to_lsp_pos(&range.end)?,
240        ))
241    }
242
243    fn lsp_range_to_range(&self, lsp_range: &lsp_types::Range) -> Option<Range<Pos>> {
244        Some(self.lsp_pos_to_pos(&lsp_range.start)?..self.lsp_pos_to_pos(&lsp_range.end)?)
245    }
246
247    fn change_to_lsp_change(
248        &self,
249        change: TextChange,
250    ) -> Option<lsp_types::TextDocumentContentChangeEvent> {
251        if let Some(range) = change.range {
252            let lsp_range = self.range_to_lsp_range(&range)?;
253            Some(lsp_types::TextDocumentContentChangeEvent {
254                range: Some(lsp_range),
255                range_length: None,
256                text: change.patch,
257            })
258        } else {
259            Some(lsp_types::TextDocumentContentChangeEvent {
260                range: None,
261                range_length: None,
262                text: change.patch,
263            })
264        }
265    }
266
267    fn lsp_change_to_change(
268        &self,
269        lsp_change: lsp_types::TextDocumentContentChangeEvent,
270    ) -> Option<TextChange> {
271        if let Some(lsp_range) = lsp_change.range {
272            let range = self.lsp_range_to_range(&lsp_range)?;
273            Some(TextChange {
274                range: Some(range),
275                patch: lsp_change.text,
276            })
277        } else {
278            Some(TextChange {
279                range: None,
280                patch: lsp_change.text,
281            })
282        }
283    }
284}
285
286/// A combo of [`TextMap`] + [`TextAdapter`]. Wraps the original text and
287/// provides all the conversion methods.
288///
289/// Generic over the type of the text it wraps. Can be used with e.g. `&str`,
290/// `String`, or `Arc<str>`, depending on whether ownership is needed and if it
291/// needs to be unique or shared.
292#[derive(Debug, PartialEq, Eq, Clone)]
293pub struct IndexedText<T>
294where
295    T: Borrow<str>,
296{
297    /// The original text
298    text: T,
299    /// Range of start-end offsets for all lines in the `text`. [`u32`] should be
300    /// enough for upto 4GB files; show me a source file like this!
301    line_ranges: Vec<Range<u32>>,
302}
303
304impl<T: Borrow<str>> IndexedText<T> {
305    pub fn new(text: T) -> Self {
306        let mut line_ranges: Vec<Range<u32>> = Vec::new();
307
308        let mut line_start: Option<usize> = None;
309        let mut last_char: Option<(usize, char)> = None;
310
311        let mut char_iter = text.borrow().char_indices().peekable();
312
313        while let Some((pos, c)) = char_iter.next() {
314            if line_start.is_none() {
315                line_start = Some(pos);
316            }
317            last_char = Some((pos, c));
318
319            let mut is_newline = false;
320
321            if c == '\n' {
322                is_newline = true;
323            } else if c == '\r' {
324                if char_iter.peek().filter(|(_, pc)| *pc == '\n').is_some() {
325                    continue;
326                }
327                is_newline = true;
328            }
329
330            if is_newline {
331                let start = line_start.expect("line_start should be always initialized");
332                debug_assert!(
333                    text.borrow().is_char_boundary(start),
334                    "Start is not at char boundary"
335                );
336                let end = pos + c.len_utf8();
337                debug_assert!(
338                    text.borrow().is_char_boundary(end),
339                    "End is not at char boundary"
340                );
341
342                line_ranges.push(start as u32..end as u32);
343                line_start = None;
344            }
345        }
346
347        // Handle a situation when there's no newline at the end
348        if let (Some(start), Some((pos, c))) = (line_start, last_char) {
349            line_ranges.push(start as u32..(pos + c.len_utf8()) as u32);
350        }
351
352        // Insert an artificial blank line with an empty range
353        if let Some((pos, c)) = last_char {
354            line_ranges.push((pos + c.len_utf8()) as u32..(pos + c.len_utf8()) as u32);
355        }
356
357        // Insert an artificial blank line for an empty string
358        if text.borrow().is_empty() {
359            line_ranges.push(0..0);
360        }
361
362        IndexedText { text, line_ranges }
363    }
364
365    fn offset_to_line(&self, offset: usize) -> Option<u32> {
366        match offset.cmp(&self.text.borrow().len()) {
367            Ordering::Greater => None,
368            Ordering::Equal => Some((self.line_ranges.len().max(2) - 2) as u32),
369            Ordering::Less => {
370                let line = self.line_ranges.binary_search_by(|r| {
371                    if offset < r.start as usize {
372                        Ordering::Greater
373                    } else if offset >= r.end as usize {
374                        Ordering::Less
375                    } else if offset >= r.start as usize && offset < r.end as usize {
376                        Ordering::Equal
377                    } else {
378                        panic!("Impossible case: offset={} and range={:?}", offset, r)
379                    }
380                });
381                Some(
382                    line.unwrap_or_else(|_| {
383                        panic!("Couldn't translate u8 offset {} to line", offset)
384                    }) as u32,
385                )
386            }
387        }
388    }
389
390    fn pos_to_offset(&self, pos: &Pos) -> Option<usize> {
391        let line_range = self.line_ranges.get(pos.line as usize)?;
392        Some(line_range.start as usize + (pos.col as usize))
393    }
394}
395
396impl<T: Borrow<str>> TextMap for IndexedText<T> {
397    fn text(&self) -> &str {
398        self.text.borrow()
399    }
400
401    fn offset_to_pos(&self, offset: usize) -> Option<Pos> {
402        let line = self.offset_to_line(offset)?;
403        let range = &self.line_ranges[line as usize];
404        let char = offset - (range.start as usize);
405        Some(Pos {
406            line,
407            col: char as u32,
408        })
409    }
410
411    fn line_range(&self, line: u32) -> Option<Range<Pos>> {
412        let offset = self.line_ranges.get(line as usize)?;
413        Some(Pos::new(line, 0)..Pos::new(line, offset.end - offset.start))
414    }
415
416    fn substr(&self, range: Range<Pos>) -> Option<&str> {
417        let start_line = self.line_ranges.get(range.start.line as usize)?;
418        let end_line = self.line_ranges.get(range.end.line as usize)?;
419        let start_offset = start_line.start + range.start.col;
420        let end_offset = end_line.start + range.end.col;
421
422        Some(&self.text()[start_offset as usize..end_offset as usize])
423    }
424}
425
426/// Applies a [`TextChange`] to [`IndexedText`] returning a new text as [`String`].
427pub fn apply_change<S: Borrow<str>>(text: &IndexedText<S>, change: TextChange) -> String {
428    match change.range {
429        None => change.patch,
430        Some(range) => {
431            let orig = text.text();
432
433            let offset_start = text.pos_to_offset(&range.start).unwrap();
434            let offset_end = text.pos_to_offset(&range.end).unwrap();
435            debug_assert!(
436                offset_start <= offset_end,
437                "Expected start <= end, got {}..{}",
438                offset_start,
439                offset_end
440            );
441            debug_assert!(
442                offset_end <= orig.len(),
443                "Expected end <= text.len(), got {} > {}",
444                offset_end,
445                orig.len()
446            );
447
448            let mut new = orig.to_string();
449
450            if offset_start == text.text().len() {
451                new.push_str(&change.patch);
452            } else {
453                new.replace_range(offset_start..offset_end, &change.patch)
454            }
455            new
456        }
457    }
458}
459
460#[cfg(test)]
461mod tests {
462    mod offset_map {
463        use crate::*;
464        use pretty_assertions::assert_eq;
465
466        #[test]
467        fn no_newline() {
468            //          012
469            let text = "Hi!";
470            let offsets = IndexedText::new(text);
471            assert_eq!(offsets.line_ranges, vec![0..3, 3..3]);
472        }
473
474        #[test]
475        fn newline() {
476            //          012 3
477            let text = "Hi!\n";
478            let offsets = IndexedText::new(text);
479            assert_eq!(offsets.line_ranges, vec![0..4, 4..4]);
480        }
481
482        #[test]
483        fn win_newline() {
484            //          012 3 4
485            let text = "Hi!\r\n";
486            let offsets = IndexedText::new(text);
487            assert_eq!(offsets.line_ranges, vec![0..5, 5..5]);
488        }
489
490        #[test]
491        fn two_lines() {
492            //          012 345678
493            let text = "Hi!\nWorld";
494            let offsets = IndexedText::new(text);
495            assert_eq!(offsets.line_ranges, vec![0..4, 4..9, 9..9]);
496        }
497
498        #[test]
499        fn eof_to_lsp() {
500            let text = "H";
501            let text = IndexedText::new(text);
502            let pos = text.offset_to_pos(1).unwrap();
503            let lsp_pos = text.pos_to_lsp_pos(&pos);
504            assert_eq!(lsp_pos, Some(lsp_types::Position::new(0, 1)));
505        }
506
507        #[test]
508        fn empty_lsp() {
509            let text = "";
510            let text = IndexedText::new(text);
511            let pos = text.offset_to_pos(0).unwrap();
512            assert_eq!(
513                text.pos_to_lsp_pos(&pos),
514                Some(lsp_types::Position::new(0, 0))
515            );
516        }
517    }
518
519    mod apply_change {
520        use crate::*;
521        use lsp_types::TextDocumentContentChangeEvent;
522        use pretty_assertions::assert_eq;
523
524        #[test]
525        fn within_line() {
526            let text = "# Hello World";
527            let text = IndexedText::new(text);
528
529            let change = TextDocumentContentChangeEvent {
530                range: Some(lsp_types::Range::new(
531                    lsp_types::Position::new(0, 2),
532                    lsp_types::Position::new(0, 7),
533                )),
534                range_length: None,
535                text: "Hi".to_string(),
536            };
537            let change = text.lsp_change_to_change(change).unwrap();
538            let replaced = apply_change(&text, change);
539            assert_eq!(&replaced, "# Hi World");
540        }
541
542        #[test]
543        fn at_newline() {
544            //          01 2
545            let text = "Hi\n";
546            let text = IndexedText::new(text);
547
548            let change = TextDocumentContentChangeEvent {
549                range: Some(lsp_types::Range::new(
550                    lsp_types::Position::new(0, 2),
551                    lsp_types::Position::new(1, 0),
552                )),
553                range_length: None,
554                text: "".to_string(),
555            };
556            let change = text.lsp_change_to_change(change).unwrap();
557            let replaced = apply_change(&text, change);
558            assert_eq!(&replaced, "Hi");
559        }
560
561        #[test]
562        fn at_linend() {
563            //          01
564            let text = "Hi";
565            let text = IndexedText::new(text);
566
567            let change = TextDocumentContentChangeEvent {
568                range: Some(lsp_types::Range::new(
569                    lsp_types::Position::new(0, 2),
570                    lsp_types::Position::new(0, 2),
571                )),
572                range_length: None,
573                text: "\n".to_string(),
574            };
575            let change = text.lsp_change_to_change(change).unwrap();
576            let replaced = apply_change(&text, change);
577            assert_eq!(&replaced, "Hi\n");
578        }
579    }
580}