lsp_positions/
lib.rs

1// -*- coding: utf-8 -*-
2// ------------------------------------------------------------------------------------------------
3// Copyright © 2021, stack-graphs authors.
4// Licensed under either of Apache License, Version 2.0, or MIT license, at your option.
5// Please see the LICENSE-APACHE or LICENSE-MIT files in this distribution for license details.
6// ------------------------------------------------------------------------------------------------
7
8//! Defines LSP-compatible positioning information for source code.
9//!
10//! When writing a tool that analyzes or operates on source code, there's a good chance you need to
11//! interoperate with the [Language Server Protocol][lsp].  This seemingly simple requirement makes
12//! it surprisingly difficult to deal with _character locations_.  This is because Rust stores
13//! Unicode string content (i.e., the source code you're analyzing) in UTF-8, while LSP specifies
14//! character locations using [_UTF-16 code units_][lsp-utf16].
15//!
16//! For some background, Unicode characters, or code points, are encoded as one or more code units.
17//! In UTF-8 a code unit is 1 byte, and a character is encoded in 1–4 code units (1–4 bytes).  In
18//! UTF-16 a code unit is 2 bytes, and characters are encoded in 1–2 code units (2 or 4 bytes).
19//! Rust strings are encoded as UTF-8, and indexed by byte (which is the same as by code unit).
20//! Indices are only valid if they point to the first code unit of a code point.
21//!
22//! We keep track of each source code position using two units: the UTF-8 byte position within the
23//! file or containing line, which can be used to index into UTF-8 encoded `str` and `[u8]` data,
24//! and the UTF-16 code unit position within the line, which can be used to generate `Position`
25//! values for LSP.
26//!
27//! [lsp]: https://microsoft.github.io/language-server-protocol/
28//! [lsp-utf16]: https://microsoft.github.io/language-server-protocol/specifications/specification-current/#textDocuments
29
30use std::ops::Range;
31
32use memchr::memchr;
33
34use unicode_segmentation::UnicodeSegmentation as _;
35
36fn grapheme_len(string: &str) -> usize {
37    string.graphemes(true).count()
38}
39
40fn utf16_len(string: &str) -> usize {
41    string.chars().map(char::len_utf16).sum()
42}
43
44/// All of the position information that we have about a character in a source file
45#[repr(C)]
46#[derive(Clone, Debug, Default, Eq, Hash, PartialEq)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48#[cfg_attr(feature = "bincode", derive(bincode::Encode, bincode::Decode))]
49pub struct Position {
50    /// The 0-indexed line number containing the character
51    pub line: usize,
52    /// The offset of the character within its containing line, expressed as both a UTF-8 byte
53    /// index and a UTF-16 code unit index
54    pub column: Offset,
55    /// The UTF-8 byte indexes (within the file) of the start and end of the line containing the
56    /// character
57    pub containing_line: Range<usize>,
58    /// The UTF-8 byte indexes (within the file) of the start and end of the line containing the
59    /// character, with any leading and trailing whitespace removed
60    pub trimmed_line: Range<usize>,
61}
62
63impl Position {
64    /// Returns a tree-sitter [`Point`][Point] for this position.
65    ///
66    /// [Point]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Point.html
67    #[cfg(feature = "tree-sitter")]
68    pub fn as_point(&self) -> tree_sitter::Point {
69        tree_sitter::Point {
70            row: self.line,
71            column: self.column.utf8_offset,
72        }
73    }
74}
75
76impl Ord for Position {
77    fn cmp(&self, other: &Position) -> std::cmp::Ordering {
78        self.line
79            .cmp(&other.line)
80            .then_with(|| self.column.utf8_offset.cmp(&other.column.utf8_offset))
81    }
82}
83
84impl PartialOrd for Position {
85    fn partial_cmp(&self, other: &Position) -> Option<std::cmp::Ordering> {
86        Some(self.cmp(other))
87    }
88}
89
90#[cfg(feature = "tree-sitter")]
91impl PartialEq<tree_sitter::Point> for Position {
92    fn eq(&self, other: &tree_sitter::Point) -> bool {
93        self.line == other.row && self.column.utf8_offset == other.column
94    }
95}
96
97#[cfg(feature = "tree-sitter")]
98impl PartialOrd<tree_sitter::Point> for Position {
99    fn partial_cmp(&self, other: &tree_sitter::Point) -> Option<std::cmp::Ordering> {
100        Some(
101            self.line
102                .cmp(&other.row)
103                .then_with(|| self.column.utf8_offset.cmp(&other.column)),
104        )
105    }
106}
107
108/// All of the position information that we have about a range of content in a source file
109#[repr(C)]
110#[derive(Clone, Debug, Default, Eq, Hash, PartialEq)]
111#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
112#[cfg_attr(feature = "bincode", derive(bincode::Encode, bincode::Decode))]
113pub struct Span {
114    pub start: Position,
115    pub end: Position,
116}
117
118impl Span {
119    pub fn contains(&self, position: &Position) -> bool {
120        &self.start <= position && &self.end > position
121    }
122
123    #[cfg(feature = "tree-sitter")]
124    pub fn contains_point(&self, point: &tree_sitter::Point) -> bool {
125        &self.start <= point && &self.end > point
126    }
127}
128
129impl Ord for Span {
130    fn cmp(&self, other: &Span) -> std::cmp::Ordering {
131        self.start
132            .cmp(&other.start)
133            .then_with(|| self.end.cmp(&other.end))
134    }
135}
136
137impl PartialOrd for Span {
138    fn partial_cmp(&self, other: &Span) -> Option<std::cmp::Ordering> {
139        Some(self.cmp(other))
140    }
141}
142
143/// The offset of a character within a string (typically a line of source code), using several
144/// different units
145///
146/// All offsets are 0-indexed.
147#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
148#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
149#[cfg_attr(feature = "bincode", derive(bincode::Encode, bincode::Decode))]
150pub struct Offset {
151    /// The number of UTF-8-encoded bytes appearing before this character in the string
152    pub utf8_offset: usize,
153    /// The number of UTF-16 code units appearing before this character in the string
154    pub utf16_offset: usize,
155    /// The number of graphemes appearing before this character in the string
156    pub grapheme_offset: usize,
157}
158
159impl Offset {
160    /// Calculates the length of a string, expressed as the position of the non-existent character
161    /// after the end of the line.
162    pub fn string_length(string: &str) -> Offset {
163        Offset {
164            utf8_offset: string.len(),
165            utf16_offset: utf16_len(string),
166            grapheme_offset: grapheme_len(string),
167        }
168    }
169
170    /// Calculates the offset of each character within a string.  Typically the string will contain
171    /// a single line of text, in which case the results are column offsets.  (In this case, the
172    /// string should not contain any newlines, though we don't verify this.)
173    ///
174    /// Each character's offset is returned both as the byte offset from the beginning of the line,
175    /// as well as the number of UTF-16 code units before the character in the line.  (This is the
176    /// column unit used by the [Language Server Protocol][lsp-utf16].)
177    ///
178    /// The result is an iterator of offsets, one for each character.  The results will be sorted,
179    /// so you can collect them into a `Vec` and use `binary_search_by_key` to look for particular
180    /// characters.
181    ///
182    /// [lsp-utf16]: https://microsoft.github.io/language-server-protocol/specification#textDocuments
183    pub fn all_chars(line: &str) -> impl Iterator<Item = Offset> + '_ {
184        let mut grapheme_utf8_offsets = line
185            .grapheme_indices(true)
186            .map(|(utf8_offset, cluster)| Range {
187                start: utf8_offset,
188                end: utf8_offset + cluster.len(),
189            })
190            .peekable();
191        // We want the output to include an entry for the end of the string — i.e., for the byte
192        // offset immediately after the last character of the string.  To do this, we add a dummy
193        // character to list of actual characters from the string.
194        line.chars()
195            .chain(std::iter::once(' '))
196            .scan(Offset::default(), move |offset, ch| {
197                let result = Some(*offset);
198                // If there is no next grapheme, we assume it is the extra ' ' that was chained
199                if grapheme_utf8_offsets
200                    .peek()
201                    .map(|r| r.start == offset.utf8_offset)
202                    .unwrap_or(true)
203                {
204                    grapheme_utf8_offsets.next();
205                    offset.grapheme_offset += 1;
206                }
207                offset.utf8_offset += ch.len_utf8();
208                offset.utf16_offset += ch.len_utf16();
209                result
210            })
211    }
212}
213
214/// A substring and information about where that substring occurs in a larger string.  (Most often,
215/// this is a “line” and information about where that line occurs within a “file”.)
216#[derive(Clone)]
217pub struct PositionedSubstring<'a> {
218    /// The content of the substring
219    pub content: &'a str,
220    /// The UTF-8 byte offsets of the beginning and end of the substring within the larger string
221    pub utf8_bounds: Range<usize>,
222    /// The number of UTF-16 code units in the substring
223    pub utf16_length: usize,
224    /// The number of graphemes in the substring
225    pub grapheme_length: usize,
226}
227
228impl<'a> PositionedSubstring<'a> {
229    /// Constructs a new positioned substring.  You must provide the larger string, and the byte
230    /// range of the desired substring.
231    pub fn from_range(string: &'a str, utf8_bounds: Range<usize>) -> PositionedSubstring<'a> {
232        let substring = &string[utf8_bounds.clone()];
233        PositionedSubstring {
234            content: substring,
235            utf8_bounds,
236            utf16_length: utf16_len(substring),
237            grapheme_length: grapheme_len(substring),
238        }
239    }
240
241    /// Constructs a new positioned substring for a newline-terminated line within a file.  You
242    /// provide the byte offset of the start of the line, and we automatically find the end of the
243    /// line.
244    pub fn from_line(string: &'a str, line_utf8_offset: usize) -> PositionedSubstring<'a> {
245        // The line's byte index lets us trim all preceding lines in the file.
246        let line_plus_others = &string[line_utf8_offset..];
247
248        // The requested line stops at the first newline, or at the end of the file if there aren't
249        // any newlines.
250        let line = match memchr(b'\n', line_plus_others.as_bytes()) {
251            Some(newline_offset) => &line_plus_others[..newline_offset],
252            None => line_plus_others,
253        };
254
255        let length = Offset::string_length(line);
256        PositionedSubstring {
257            content: line,
258            utf8_bounds: Range {
259                start: line_utf8_offset,
260                end: line_utf8_offset + length.utf8_offset,
261            },
262            utf16_length: length.utf16_offset,
263            grapheme_length: length.grapheme_offset,
264        }
265    }
266
267    // Returns an iterator over the lines of the given string.
268    pub fn lines_iter(string: &'a str) -> impl Iterator<Item = PositionedSubstring<'a>> + 'a {
269        let mut next_utf8_offset = 0;
270        std::iter::from_fn(move || {
271            if string.len() <= next_utf8_offset {
272                return None;
273            }
274            let next = PositionedSubstring::from_line(string, next_utf8_offset);
275            next_utf8_offset = next.utf8_bounds.end + 1;
276            Some(next)
277        })
278    }
279
280    /// Trims ASCII whitespace from both ends of a substring.
281    pub fn trim_whitespace(&mut self) {
282        let leading_whitespace = self
283            .content
284            .bytes()
285            .enumerate()
286            .find(|(_, ch)| !(*ch as char).is_ascii_whitespace())
287            .map(|(index, _)| index)
288            .unwrap_or(self.content.len());
289        let left_whitespace = &self.content[0..leading_whitespace];
290        let trimmed_left = &self.content[leading_whitespace..];
291
292        let trailing_whitespace = trimmed_left
293            .bytes()
294            .enumerate()
295            // Point at the last non-whitespace character
296            .rfind(|(_, ch)| !(*ch as char).is_ascii_whitespace())
297            // Point at the immediately following whitespace character.  Note we are only looking
298            // for _ASCII_ whitespace, so we can assume that the last whitespace character that we
299            // found is 1 byte long.
300            .map(|(index, _)| index + 1)
301            .unwrap_or(0);
302        let trimmed = &trimmed_left[0..trailing_whitespace];
303        let right_whitespace = &trimmed_left[trailing_whitespace..];
304
305        self.content = trimmed;
306        self.utf8_bounds.start += left_whitespace.len();
307        self.utf8_bounds.end -= right_whitespace.len();
308        self.utf16_length -= utf16_len(left_whitespace);
309        self.utf16_length -= utf16_len(right_whitespace);
310        self.grapheme_length -= grapheme_len(left_whitespace);
311        self.grapheme_length -= grapheme_len(right_whitespace);
312    }
313}
314
315/// Automates the construction of [`Span`][] instances for content within a string.
316pub struct SpanCalculator<'a> {
317    string: &'a str,
318    containing_line: Option<PositionedSubstring<'a>>,
319    trimmed_line: Option<PositionedSubstring<'a>>,
320    columns: Vec<Offset>,
321}
322
323// Note that each time you calculate the position of a node on a _different line_, we have to
324// calculate some information about line.  You'd think that would mean it would be most efficient
325// to use this type if you made to sure group all of your nodes by their rows before asking for us
326// to create Spans for them.  However, it turns out that sorting your nodes to make sure that
327// they're in row order is just as much work as recalculating the UTF16 column offsets if we ever
328// revisit a line!
329
330impl<'a> SpanCalculator<'a> {
331    /// Creates a new span calculator for locations within the given string.
332    pub fn new(string: &'a str) -> SpanCalculator<'a> {
333        SpanCalculator {
334            string,
335            containing_line: None,
336            trimmed_line: None,
337            columns: Vec::new(),
338        }
339    }
340
341    /// Constructs a [`Position`][] instance for a particular line and column in the string.
342    /// You must provide the 0-indexed line number, the byte offset of the line within the string,
343    /// and the UTF-8 byte offset of the character within the line.
344    pub fn for_line_and_column(
345        &mut self,
346        line: usize,
347        line_utf8_offset: usize,
348        column_utf8_offset: usize,
349    ) -> Position {
350        self.replace_current_line(line_utf8_offset);
351        Position {
352            line: line,
353            column: *self.for_utf8_offset(column_utf8_offset),
354            containing_line: self.containing_line.as_ref().unwrap().utf8_bounds.clone(),
355            trimmed_line: self.trimmed_line.as_ref().unwrap().utf8_bounds.clone(),
356        }
357    }
358
359    /// Constructs a [`Span`][] instance for a tree-sitter node.
360    #[cfg(feature = "tree-sitter")]
361    pub fn for_node(&mut self, node: &tree_sitter::Node) -> Span {
362        let start = self.position_for_node(node.start_byte(), node.start_position());
363        let end = self.position_for_node(node.end_byte(), node.end_position());
364        Span { start, end }
365    }
366
367    /// Constructs a [`Position`][] instance for a tree-sitter location.
368    #[cfg(feature = "tree-sitter")]
369    pub fn position_for_node(
370        &mut self,
371        byte_offset: usize,
372        position: tree_sitter::Point,
373    ) -> Position {
374        // Since we know the byte offset of the node within the file, and of the node within the
375        // line, subtracting gives us the offset of the line within the file.
376        let line_utf8_offset = byte_offset - position.column;
377        self.for_line_and_column(position.row, line_utf8_offset, position.column)
378    }
379
380    /// Constructs a [`Position`][] instance for a particular line and column in the string.
381    /// You must provide the 0-indexed line number, the byte offset of the line within the string,
382    /// and the grapheme offset of the character within the line.
383    pub fn for_line_and_grapheme(
384        &mut self,
385        line: usize,
386        line_utf8_offset: usize,
387        column_grapheme_offset: usize,
388    ) -> Position {
389        self.replace_current_line(line_utf8_offset);
390        Position {
391            line: line,
392            column: *self.for_grapheme_offset(column_grapheme_offset),
393            containing_line: self.containing_line.as_ref().unwrap().utf8_bounds.clone(),
394            trimmed_line: self.trimmed_line.as_ref().unwrap().utf8_bounds.clone(),
395        }
396    }
397
398    /// Updates our internal state to represent the information about the line that starts at a
399    /// particular byte offset within the file.
400    fn replace_current_line(&mut self, line_utf8_offset: usize) {
401        if let Some(containing_line) = &self.containing_line {
402            if containing_line.utf8_bounds.start == line_utf8_offset {
403                return;
404            }
405        }
406        let line = PositionedSubstring::from_line(self.string, line_utf8_offset);
407        self.columns.clear();
408        self.columns.extend(Offset::all_chars(line.content));
409        let mut trimmed = line.clone();
410        trimmed.trim_whitespace();
411        self.containing_line = Some(line);
412        self.trimmed_line = Some(trimmed);
413    }
414
415    /// Returns the offset of the character at a particular UTF-8 offset in the line.
416    /// Assumes that you've already called `replace_current_line` for the containing line.
417    fn for_utf8_offset(&self, utf8_offset: usize) -> &Offset {
418        let index = self
419            .columns
420            .binary_search_by_key(&utf8_offset, |pos| pos.utf8_offset)
421            .unwrap();
422        &self.columns[index]
423    }
424
425    /// Returns the offset of the character at a particular grapheme offset in the line.
426    /// Assumes that you've already called `replace_current_line` for the containing line.
427    fn for_grapheme_offset(&self, grapheme_offset: usize) -> &Offset {
428        let mut index = self
429            .columns
430            .binary_search_by_key(&grapheme_offset, |pos| pos.grapheme_offset)
431            .unwrap();
432        // make sure to return the first offset for this grapheme
433        let mut offset = &self.columns[index];
434        while index > 0 {
435            index -= 1;
436            let prev_offset = &self.columns[index];
437            if prev_offset.grapheme_offset != offset.grapheme_offset {
438                break;
439            }
440            offset = prev_offset;
441        }
442        offset
443    }
444}