mm0_util/
lined_string.rs

//! Embellished String which carries additional data useful for interacting with language server messages.
//!
//! The [`LinedString::apply_changes`] function is used to realize changes made to a string once those
//! changes are received by mm0-rs from a language server. [`LinedString`] implements associated
//! methods allowing it to be used nicely with the [`Position`] type specified by the language server protocol.
6
7use crate::{Position, Range, Span};
8#[cfg(feature = "memory")]
9use mm0_deepsize_derive::DeepSizeOf;
10use std::ops::{Deref, Index};
11
/// Wrapper around std's String which stores data about the positions of any newline characters.
///
/// Also contains a boolean indicating whether the string has any non-ASCII characters.
/// Unicode is currently unsupported, but this allows the lexer to gracefully handle
/// errors arising from the presence of unicode characters in input.
/// The indices stored in `lines` are the successors of any newline characters.
#[cfg_attr(feature = "memory", derive(DeepSizeOf))]
#[derive(Default, Clone, Debug)]
pub struct LinedString {
  /// The text itself.
  s: String,
  /// Set to `true` once a non-ASCII character has been observed in the text
  /// (or when a caller of `extend_until` says so); selects UTF-16 aware position arithmetic.
  unicode: bool,
  /// Byte indices in `s` immediately following each `'\n'`, in increasing order;
  /// entry `i` is therefore the byte index at which line `i + 1` starts.
  lines: Vec<usize>,
}
25
26/// Allows [`LinedString`] to be indexed with a [`Span`], since [`Span`] is essentially a range.
27impl Index<Span> for LinedString {
28  type Output = [u8];
29  fn index(&self, s: Span) -> &[u8] { &self.as_bytes()[s.start..s.end] }
30}
31
/// Finds the largest byte index `n` (always on a UTF-8 character boundary) such that
/// `s[..n]` spans at most `chs` UTF-16 code units.
///
/// Characters are consumed from the front of `s` for as long as their UTF-16 length still fits
/// in the remaining budget; the byte offset of the first character that does not fit is
/// returned, or `s.len()` if every character fits.
/// Used by `LinedString::lsp_to_idx` to translate LSP character offsets in non-ASCII text.
fn lsp_to_idx(s: &str, mut chs: usize) -> usize {
  s.char_indices()
    .find(|&(_, ch)| match chs.checked_sub(ch.len_utf16()) {
      // The character fits: spend its code units and keep scanning.
      Some(rest) => {
        chs = rest;
        false
      }
      // Budget exhausted before this character.
      None => true,
    })
    .map_or(s.len(), |(offset, _)| offset)
}
44
45impl LinedString {
46  /// Index a [`LinedString`] with a [`Span`], returning a `str`.
47  ///
48  /// # Safety
49  /// This function uses `str::get_unchecked()` internally, so it is *unsafe* unless the [`Span`] used in the index
50  /// was generated from the file that is being indexed.
51  #[allow(unused)]
52  pub fn str_at(&self, s: Span) -> &str { unsafe { std::str::from_utf8_unchecked(&self[s]) } }
53
54  /// Calculate and store information about the positions of any newline
55  /// characters in the string, and set 'unicode' to true if the string contains unicode.
56  /// The data in 'lines' is actually the positions of the characters immediately after
57  /// the line break (so \n.pos + 1).
58  #[must_use]
59  fn get_lines(unicode: &mut bool, s: &str) -> Vec<usize> {
60    let mut lines = vec![];
61    for (b, c) in s.char_indices() {
62      if c == '\n' {
63        lines.push(b + 1)
64      }
65      if !c.is_ascii() {
66        *unicode = true
67      }
68    }
69    lines
70  }
71
72  /// Turn a byte index into an LSP [`Position`]
73  ///
74  /// # Safety
75  /// `idx` must be a valid index in the string.
76  #[must_use]
77  pub fn to_pos(&self, idx: usize) -> Position {
78    use std::convert::TryInto;
79    let (pos, line) = match self.lines.binary_search(&idx) {
80      Ok(n) => (idx, n + 1),
81      Err(n) => (n.checked_sub(1).map_or(0, |i| self.lines[i]), n),
82    };
83    Position {
84      line: line.try_into().expect("too many lines"),
85      character: if self.unicode {
86        // Safety: we know that `pos` is valid index, and we have assumed that `idx` is
87        unsafe { self.s.get_unchecked(pos..idx) }.chars().map(char::len_utf16).sum()
88      } else {
89        idx - pos
90      }
91      .try_into()
92      .expect("too many characters"),
93    }
94  }
95
96  /// Turn a [`Span`] into an LSP [`Range`].
97  #[must_use]
98  pub fn to_range(&self, s: Span) -> Range {
99    Range { start: self.to_pos(s.start), end: self.to_pos(s.end) }
100  }
101
102  /// Turn a [`FileSpan`](crate::FileSpan) into an LSP [`Location`](lsp_types::Location).
103  #[cfg(feature = "server")]
104  #[must_use]
105  pub fn to_loc(&self, fs: &crate::FileSpan) -> lsp_types::Location {
106    lsp_types::Location { uri: fs.file.url().clone(), range: self.to_range(fs.span) }
107  }
108
109  /// Get the total number of lines in the file (as a `u32` for LSP compatibility).
110  #[must_use]
111  pub fn num_lines(&self) -> u32 {
112    use std::convert::TryInto;
113    self.lines.len().try_into().expect("too many lines")
114  }
115
116  /// Get the [`Position`] (line and UTF-16 code unit offset) of the end of the file.
117  #[must_use]
118  pub fn end(&self) -> Position { self.to_pos(self.s.len()) }
119
120  /// Calculates the byte index of the position `chs` UTF-16 code units after
121  /// byte index `start` in the string.
122  /// If there's no unicode, we can just use (start + idx).
123  /// In the presence of unicode, use the helper function [`lsp_to_idx`](Self::lsp_to_idx)
124  /// to account for any additional character offset.
125  ///
126  /// # Safety
127  /// `start` must be a valid index in the string.
128  #[must_use]
129  fn lsp_to_idx(&self, start: usize, chs: usize) -> usize {
130    start + if self.unicode { lsp_to_idx(unsafe { self.get_unchecked(start..) }, chs) } else { chs }
131  }
132
133  /// Turn an LSP [`Position`] into a usize index. [`Position`] is already zero-based,
134  /// but [`LinedString.lines`] stores `1 + position` of the actual linebreak characters,
135  /// so `lines[0]` points to the start of line 1, `lines[1]` points to the start of line 2, etc.
136  /// with the start of line 0 just being s.0.
137  #[must_use]
138  pub fn to_idx(&self, pos: Position) -> Option<usize> {
139    match pos.line.checked_sub(1) {
140      None => Some(self.lsp_to_idx(0, pos.character as usize)),
141      Some(n) =>
142        self.lines.get(n as usize).map(|&idx| self.lsp_to_idx(idx, pos.character as usize)),
143    }
144  }
145
146  /// Extend a [`LinedString`] with the contents of a `&str`, adding additional newline info as needed.
147  pub fn extend(&mut self, s: &str) {
148    let len = self.s.len();
149    self.s.push_str(s);
150    for (b, c) in s.char_indices() {
151      if c == '\n' {
152        self.lines.push(b + len + 1)
153      }
154      if !c.is_ascii() {
155        self.unicode = true
156      }
157    }
158  }
159
160  /// Extends a [`LinedString`]'s contents with the contents of a passed string slice
161  /// until it reaches some [`Position`]. Returns the portion of the passed string slice
162  /// that was not added to the [`LinedString`].
163  pub fn extend_until<'a>(&mut self, unicode: bool, s: &'a str, pos: Position) -> &'a str {
164    self.unicode |= unicode;
165    let end = self.end();
166    debug_assert!(end <= pos);
167    let (chs, off) = if pos.line == end.line {
168      ((pos.character - end.character) as usize, 0)
169    } else {
170      let len = self.s.len();
171      let mut it = s.char_indices();
172      (
173        pos.character as usize,
174        loop {
175          if let Some((b, c)) = it.next() {
176            if c == '\n' {
177              self.lines.push(b + len + 1);
178              if pos.line == self.num_lines() {
179                break b + 1
180              }
181            }
182          } else {
183            break s.len()
184          }
185        },
186      )
187    };
188    let tail = unsafe { s.get_unchecked(off..) };
189    let idx = if unicode { lsp_to_idx(tail, chs) } else { chs };
190    let len = self.s.len() + off;
191    for (b, c) in unsafe { tail.get_unchecked(..idx) }.char_indices() {
192      if c == '\n' {
193        self.lines.push(b + len + 1)
194      }
195    }
196    let (left, right) = s.split_at(off + idx);
197    self.s.push_str(left);
198    right
199  }
200
201  /// Truncates a [`LinedString`]'s contents so that it's equal to the character position
202  /// indicated by some lsp [`Position`], discarding any unneeded newline data.
203  /// Does nothing if the [`LinedString`]'s contents were already less than or equal
204  /// in length to the [`Position`]'s index.
205  pub fn truncate(&mut self, pos: Position) {
206    if let Some(idx) = self.to_idx(pos) {
207      if idx < self.s.len() {
208        self.s.truncate(idx);
209        self.lines.truncate(pos.line as usize);
210      }
211    }
212  }
213
  /// Does a bunch of string juggling to actually realize the contents of an iterator
  /// containing a sequence of [`TextDocumentContentChangeEvent`] messages.
  ///
  /// Returns the resulting text together with a [`Position`] in it: starting from the
  /// smallest change start seen, the first point at which the old and new text differ, or
  /// the end of the new text if no differing character is found in the compared part.
  ///
  /// # Panics
  /// Panics with "change out of range" if the smallest change start cannot be converted
  /// to an index in the new text.
  ///
  /// [`TextDocumentContentChangeEvent`]: lsp_types::TextDocumentContentChangeEvent
  #[cfg(feature = "server")]
  #[must_use]
  pub fn apply_changes(
    &self, changes: impl Iterator<Item = lsp_types::TextDocumentContentChangeEvent>,
  ) -> (Position, LinedString) {
    // Keeps a superseded intermediate result alive so that `uncopied` can borrow from it.
    let mut old: LinedString;
    // The text built so far.
    let mut out = LinedString::default();
    // The part of the current source text that has not been copied into `out` yet.
    let mut uncopied: &str = &self.s;
    // The smallest `start` of any change seen so far (`None` until the first change).
    let mut first_change = None;
    for e in changes {
      if let Some(Range { start, end }) = e.range {
        if first_change.map_or(true, |c| start < c) {
          first_change = Some(start)
        }
        // This change begins before the end of what has already been emitted: finish the
        // current pass and restart, using the text built so far as the new source.
        if out.end() > start {
          out.extend(uncopied);
          old = std::mem::take(&mut out);
          uncopied = &old;
        }
        // Copy the source up to the end of the replaced range, cut the replaced part back
        // off again, then append the replacement text.
        uncopied = out.extend_until(self.unicode, uncopied, end);
        out.truncate(start);
        out.extend(&e.text);
      } else {
        // No range: the event replaces the whole document.
        out = e.text.into();
        first_change = Some(Position::default());
        uncopied = "";
      }
    }
    // Whatever was not touched by any change is carried over unchanged.
    out.extend(uncopied);
    if let Some(pos) = first_change {
      let start = out.to_idx(pos).expect("change out of range");
      // NOTE(review): the unchecked slices assume `start` is in bounds and on a character
      // boundary in both the old and the new text — TODO confirm.
      let from = unsafe { self.s.get_unchecked(start..) };
      let to = unsafe { out.s.get_unchecked(start..) };
      // Report the first character at which the old and new text diverge.
      for ((b, c1), c2) in from.char_indices().zip(to.chars()) {
        if c1 != c2 {
          return (out.to_pos(b + start), out)
        }
      }
    }
    (out.end(), out)
  }
259}
260
261impl Deref for LinedString {
262  type Target = String;
263  fn deref(&self) -> &String { &self.s }
264}
265
266impl From<String> for LinedString {
267  fn from(s: String) -> LinedString {
268    let mut unicode = false;
269    let lines = LinedString::get_lines(&mut unicode, &s);
270    LinedString { unicode, lines, s }
271  }
272}