Skip to main content

typst_syntax/
lines.rs

1use std::hash::{Hash, Hasher};
2use std::iter::zip;
3use std::ops::Range;
4use std::sync::Arc;
5
6use crate::is_newline;
7
8/// A text buffer and metadata about lines.
9///
10/// This is internally reference-counted and thus cheap to clone.
11#[derive(Clone)]
12pub struct Lines<S>(Arc<LinesInner<S>>);
13
14/// The internal representation of [`Lines`].
15#[derive(Clone)]
16struct LinesInner<T> {
17    lines: Vec<Line>,
18    text: T,
19}
20
21/// Metadata about a line.
22#[derive(Debug, Copy, Clone, Eq, PartialEq)]
23pub struct Line {
24    /// The UTF-8 byte offset where the line starts.
25    byte_idx: usize,
26    /// The UTF-16 codepoint offset where the line starts.
27    utf16_idx: usize,
28}
29
30impl<T: AsRef<str>> Lines<T> {
31    /// Create from the text buffer and compute the line metadata.
32    pub fn new(text: T) -> Self {
33        let lines = lines(text.as_ref());
34        Lines(Arc::new(LinesInner { lines, text }))
35    }
36
37    /// The text as a string slice.
38    pub fn text(&self) -> &str {
39        self.0.text.as_ref()
40    }
41
42    /// Get the length of the file in UTF-8 encoded bytes.
43    pub fn len_bytes(&self) -> usize {
44        self.0.text.as_ref().len()
45    }
46
47    /// Get the length of the file in UTF-16 code units.
48    pub fn len_utf16(&self) -> usize {
49        let last = self.0.lines.last().unwrap();
50        last.utf16_idx + len_utf16(&self.text()[last.byte_idx..])
51    }
52
53    /// Get the length of the file in lines.
54    pub fn len_lines(&self) -> usize {
55        self.0.lines.len()
56    }
57
58    /// Return the index of the UTF-16 code unit at the byte index.
59    pub fn byte_to_utf16(&self, byte_idx: usize) -> Option<usize> {
60        let line_idx = self.byte_to_line(byte_idx)?;
61        let line = self.0.lines.get(line_idx)?;
62        let head = self.text().get(line.byte_idx..byte_idx)?;
63        Some(line.utf16_idx + len_utf16(head))
64    }
65
66    /// Return the index of the line that contains the given byte index.
67    pub fn byte_to_line(&self, byte_idx: usize) -> Option<usize> {
68        (byte_idx <= self.text().len()).then(|| {
69            match self.0.lines.binary_search_by_key(&byte_idx, |line| line.byte_idx) {
70                Ok(i) => i,
71                Err(i) => i - 1,
72            }
73        })
74    }
75
76    /// Return the index of the column at the byte index.
77    ///
78    /// The column is defined as the number of characters in the line before the
79    /// byte index.
80    pub fn byte_to_column(&self, byte_idx: usize) -> Option<usize> {
81        let line = self.byte_to_line(byte_idx)?;
82        let start = self.line_to_byte(line)?;
83        let head = self.text().get(start..byte_idx)?;
84        Some(head.chars().count())
85    }
86
87    /// Return the index of the line and column at the byte index.
88    pub fn byte_to_line_column(&self, byte_idx: usize) -> Option<(usize, usize)> {
89        let line = self.byte_to_line(byte_idx)?;
90        let start = self.line_to_byte(line)?;
91        let head = self.text().get(start..byte_idx)?;
92        let col = head.chars().count();
93        Some((line, col))
94    }
95
96    /// Return the byte index at the UTF-16 code unit.
97    pub fn utf16_to_byte(&self, utf16_idx: usize) -> Option<usize> {
98        let line = self.0.lines.get(
99            match self.0.lines.binary_search_by_key(&utf16_idx, |line| line.utf16_idx) {
100                Ok(i) => i,
101                Err(i) => i - 1,
102            },
103        )?;
104
105        let text = self.text();
106        let mut k = line.utf16_idx;
107        for (i, c) in text[line.byte_idx..].char_indices() {
108            if k >= utf16_idx {
109                return Some(line.byte_idx + i);
110            }
111            k += c.len_utf16();
112        }
113
114        (k == utf16_idx).then_some(text.len())
115    }
116
117    /// Return the byte position at which the given line starts.
118    pub fn line_to_byte(&self, line_idx: usize) -> Option<usize> {
119        self.0.lines.get(line_idx).map(|line| line.byte_idx)
120    }
121
122    /// Return the range which encloses the given line.
123    pub fn line_to_range(&self, line_idx: usize) -> Option<Range<usize>> {
124        let start = self.line_to_byte(line_idx)?;
125        let end = self.line_to_byte(line_idx + 1).unwrap_or(self.text().len());
126        Some(start..end)
127    }
128
129    /// Return the byte index of the given (line, column) pair, or `None` if
130    /// either is out-of-range.
131    ///
132    /// The column defines the number of characters to go beyond the start of
133    /// the line.
134    pub fn line_column_to_byte(
135        &self,
136        line_idx: usize,
137        column_idx: usize,
138    ) -> Option<usize> {
139        let range = self.line_to_range(line_idx)?;
140        let line = &self.text()[range.clone()];
141        let mut chars = line.chars();
142        for _ in 0..column_idx {
143            chars.next()?;
144        }
145        Some(range.start + (line.len() - chars.as_str().len()))
146    }
147}
148
149impl Lines<String> {
150    /// Fully replace the source text.
151    ///
152    /// This performs a naive (suffix/prefix-based) diff of the old and new text
153    /// to produce the smallest single edit that transforms old into new and
154    /// then calls [`edit`](Self::edit) with it.
155    ///
156    /// Returns whether any changes were made.
157    pub fn replace(&mut self, new: &str) -> bool {
158        let Some((prefix, suffix)) = self.replacement_range(new) else {
159            return false;
160        };
161
162        let old = self.text();
163        let replace = prefix..old.len() - suffix;
164        let with = &new[prefix..new.len() - suffix];
165        self.edit(replace, with);
166
167        true
168    }
169
170    /// Returns the common prefix and suffix lengths.
171    /// Returns [`None`] if the old and new strings are equal.
172    pub fn replacement_range(&self, new: &str) -> Option<(usize, usize)> {
173        let old = self.text();
174
175        let mut prefix =
176            zip(old.bytes(), new.bytes()).take_while(|(x, y)| x == y).count();
177
178        if prefix == old.len() && prefix == new.len() {
179            return None;
180        }
181
182        while !old.is_char_boundary(prefix) || !new.is_char_boundary(prefix) {
183            prefix -= 1;
184        }
185
186        let mut suffix = zip(old[prefix..].bytes().rev(), new[prefix..].bytes().rev())
187            .take_while(|(x, y)| x == y)
188            .count();
189
190        while !old.is_char_boundary(old.len() - suffix)
191            || !new.is_char_boundary(new.len() - suffix)
192        {
193            suffix += 1;
194        }
195
196        Some((prefix, suffix))
197    }
198
199    /// Edit the source file by replacing the given range.
200    ///
201    /// Returns the range in the new source that was ultimately reparsed.
202    ///
203    /// The method panics if the `replace` range is out of bounds.
204    #[track_caller]
205    pub fn edit(&mut self, replace: Range<usize>, with: &str) {
206        let start_byte = replace.start;
207        let start_utf16 = self.byte_to_utf16(start_byte).unwrap();
208        let line = self.byte_to_line(start_byte).unwrap();
209
210        let inner = Arc::make_mut(&mut self.0);
211
212        // Update the text itself.
213        inner.text.replace_range(replace.clone(), with);
214
215        // Remove invalidated line starts.
216        inner.lines.truncate(line + 1);
217
218        // Handle adjoining of \r and \n.
219        if inner.text[..start_byte].ends_with('\r') && with.starts_with('\n') {
220            inner.lines.pop();
221        }
222
223        // Recalculate the line starts after the edit.
224        inner.lines.extend(lines_from(
225            start_byte,
226            start_utf16,
227            &inner.text[start_byte..],
228        ));
229    }
230}
231
232impl<S: Hash> Hash for Lines<S> {
233    fn hash<H: Hasher>(&self, state: &mut H) {
234        self.0.text.hash(state);
235    }
236}
237
238impl<S: AsRef<str>> AsRef<str> for Lines<S> {
239    fn as_ref(&self) -> &str {
240        self.0.text.as_ref()
241    }
242}
243
244/// Create a line vector.
245fn lines(text: &str) -> Vec<Line> {
246    std::iter::once(Line { byte_idx: 0, utf16_idx: 0 })
247        .chain(lines_from(0, 0, text))
248        .collect()
249}
250
251/// Compute a line iterator from an offset.
252fn lines_from(
253    byte_offset: usize,
254    utf16_offset: usize,
255    text: &str,
256) -> impl Iterator<Item = Line> + '_ {
257    let mut s = unscanny::Scanner::new(text);
258    let mut utf16_idx = utf16_offset;
259
260    std::iter::from_fn(move || {
261        s.eat_until(|c: char| {
262            utf16_idx += c.len_utf16();
263            is_newline(c)
264        });
265
266        if s.done() {
267            return None;
268        }
269
270        if s.eat() == Some('\r') && s.eat_if('\n') {
271            utf16_idx += 1;
272        }
273
274        Some(Line { byte_idx: byte_offset + s.cursor(), utf16_idx })
275    })
276}
277
278/// The number of code units this string would use if it was encoded in
279/// UTF16. This runs in linear time.
280fn len_utf16(string: &str) -> usize {
281    string.chars().map(char::len_utf16).sum()
282}
283
284#[cfg(test)]
285mod tests {
286    use super::*;
287
288    const TEST: &str = "ä\tcde\nf💛g\r\nhi\rjkl";
289
290    #[test]
291    fn test_source_file_new() {
292        let lines = Lines::new(TEST);
293        assert_eq!(
294            lines.0.lines,
295            [
296                Line { byte_idx: 0, utf16_idx: 0 },
297                Line { byte_idx: 7, utf16_idx: 6 },
298                Line { byte_idx: 15, utf16_idx: 12 },
299                Line { byte_idx: 18, utf16_idx: 15 },
300            ]
301        );
302    }
303
304    #[test]
305    fn test_source_file_pos_to_line() {
306        let lines = Lines::new(TEST);
307        assert_eq!(lines.byte_to_line(0), Some(0));
308        assert_eq!(lines.byte_to_line(2), Some(0));
309        assert_eq!(lines.byte_to_line(6), Some(0));
310        assert_eq!(lines.byte_to_line(7), Some(1));
311        assert_eq!(lines.byte_to_line(8), Some(1));
312        assert_eq!(lines.byte_to_line(12), Some(1));
313        assert_eq!(lines.byte_to_line(21), Some(3));
314        assert_eq!(lines.byte_to_line(22), None);
315    }
316
317    #[test]
318    fn test_source_file_pos_to_column() {
319        let lines = Lines::new(TEST);
320        assert_eq!(lines.byte_to_column(0), Some(0));
321        assert_eq!(lines.byte_to_column(2), Some(1));
322        assert_eq!(lines.byte_to_column(6), Some(5));
323        assert_eq!(lines.byte_to_column(7), Some(0));
324        assert_eq!(lines.byte_to_column(8), Some(1));
325        assert_eq!(lines.byte_to_column(12), Some(2));
326    }
327
328    #[test]
329    fn test_source_file_utf16() {
330        #[track_caller]
331        fn roundtrip(lines: &Lines<&str>, byte_idx: usize, utf16_idx: usize) {
332            let middle = lines.byte_to_utf16(byte_idx).unwrap();
333            let result = lines.utf16_to_byte(middle).unwrap();
334            assert_eq!(middle, utf16_idx);
335            assert_eq!(result, byte_idx);
336        }
337
338        let lines = Lines::new(TEST);
339        roundtrip(&lines, 0, 0);
340        roundtrip(&lines, 2, 1);
341        roundtrip(&lines, 3, 2);
342        roundtrip(&lines, 8, 7);
343        roundtrip(&lines, 12, 9);
344        roundtrip(&lines, 21, 18);
345        assert_eq!(lines.byte_to_utf16(22), None);
346        assert_eq!(lines.utf16_to_byte(19), None);
347    }
348
349    #[test]
350    fn test_source_file_roundtrip() {
351        #[track_caller]
352        fn roundtrip(lines: &Lines<&str>, byte_idx: usize) {
353            let line = lines.byte_to_line(byte_idx).unwrap();
354            let column = lines.byte_to_column(byte_idx).unwrap();
355            let result = lines.line_column_to_byte(line, column).unwrap();
356            assert_eq!(result, byte_idx);
357        }
358
359        let lines = Lines::new(TEST);
360        roundtrip(&lines, 0);
361        roundtrip(&lines, 7);
362        roundtrip(&lines, 12);
363        roundtrip(&lines, 21);
364    }
365
366    #[test]
367    fn test_source_file_edit() {
368        // This tests only the non-parser parts. The reparsing itself is
369        // tested separately.
370        #[track_caller]
371        fn test(prev: &str, range: Range<usize>, with: &str, after: &str) {
372            let reference = Lines::new(after);
373
374            let mut edited = Lines::new(prev.to_string());
375            edited.edit(range.clone(), with);
376            assert_eq!(edited.text(), reference.text());
377            assert_eq!(edited.0.lines, reference.0.lines);
378
379            let mut replaced = Lines::new(prev.to_string());
380            replaced.replace(&{
381                let mut s = prev.to_string();
382                s.replace_range(range, with);
383                s
384            });
385            assert_eq!(replaced.text(), reference.text());
386            assert_eq!(replaced.0.lines, reference.0.lines);
387        }
388
389        // Test inserting at the beginning.
390        test("abc\n", 0..0, "hi\n", "hi\nabc\n");
391        test("\nabc", 0..0, "hi\r", "hi\r\nabc");
392
393        // Test editing in the middle.
394        test(TEST, 4..16, "❌", "ä\tc❌i\rjkl");
395
396        // Test appending.
397        test("abc\ndef", 7..7, "hi", "abc\ndefhi");
398        test("abc\ndef\n", 8..8, "hi", "abc\ndef\nhi");
399
400        // Test appending with adjoining \r and \n.
401        test("abc\ndef\r", 8..8, "\nghi", "abc\ndef\r\nghi");
402
403        // Test removing everything.
404        test(TEST, 0..21, "", "");
405    }
406}