lsp_positions/lib.rs
1// -*- coding: utf-8 -*-
2// ------------------------------------------------------------------------------------------------
3// Copyright © 2021, stack-graphs authors.
4// Licensed under either of Apache License, Version 2.0, or MIT license, at your option.
5// Please see the LICENSE-APACHE or LICENSE-MIT files in this distribution for license details.
6// ------------------------------------------------------------------------------------------------
7
8//! Defines LSP-compatible positioning information for source code.
9//!
10//! When writing a tool that analyzes or operates on source code, there's a good chance you need to
11//! interoperate with the [Language Server Protocol][lsp]. This seemingly simple requirement makes
12//! it surprisingly difficult to deal with _character locations_. This is because Rust stores
13//! Unicode string content (i.e., the source code you're analyzing) in UTF-8, while LSP specifies
14//! character locations using [_UTF-16 code units_][lsp-utf16].
15//!
16//! For some background, Unicode characters, or code points, are encoded as one or more code units.
17//! In UTF-8 a code unit is 1 byte, and a character is encoded in 1–4 code units (1–4 bytes). In
18//! UTF-16 a code unit is 2 bytes, and characters are encoded in 1–2 code units (2 or 4 bytes).
19//! Rust strings are encoded as UTF-8, and indexed by byte (which is the same as by code unit).
20//! Indices are only valid if they point to the first code unit of a code point.
21//!
22//! We keep track of each source code position using two units: the UTF-8 byte position within the
23//! file or containing line, which can be used to index into UTF-8 encoded `str` and `[u8]` data,
24//! and the UTF-16 code unit position within the line, which can be used to generate `Position`
25//! values for LSP.
26//!
27//! [lsp]: https://microsoft.github.io/language-server-protocol/
28//! [lsp-utf16]: https://microsoft.github.io/language-server-protocol/specifications/specification-current/#textDocuments
29
30use std::ops::Range;
31
32use memchr::memchr;
33
34use unicode_segmentation::UnicodeSegmentation as _;
35
36fn grapheme_len(string: &str) -> usize {
37 string.graphemes(true).count()
38}
39
40fn utf16_len(string: &str) -> usize {
41 string.chars().map(char::len_utf16).sum()
42}
43
44/// All of the position information that we have about a character in a source file
45#[repr(C)]
46#[derive(Clone, Debug, Default, Eq, Hash, PartialEq)]
47#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
48#[cfg_attr(feature = "bincode", derive(bincode::Encode, bincode::Decode))]
49pub struct Position {
50 /// The 0-indexed line number containing the character
51 pub line: usize,
52 /// The offset of the character within its containing line, expressed as both a UTF-8 byte
53 /// index and a UTF-16 code unit index
54 pub column: Offset,
55 /// The UTF-8 byte indexes (within the file) of the start and end of the line containing the
56 /// character
57 pub containing_line: Range<usize>,
58 /// The UTF-8 byte indexes (within the file) of the start and end of the line containing the
59 /// character, with any leading and trailing whitespace removed
60 pub trimmed_line: Range<usize>,
61}
62
63impl Position {
64 /// Returns a tree-sitter [`Point`][Point] for this position.
65 ///
66 /// [Point]: https://docs.rs/tree-sitter/*/tree_sitter/struct.Point.html
67 #[cfg(feature = "tree-sitter")]
68 pub fn as_point(&self) -> tree_sitter::Point {
69 tree_sitter::Point {
70 row: self.line,
71 column: self.column.utf8_offset,
72 }
73 }
74}
75
76impl Ord for Position {
77 fn cmp(&self, other: &Position) -> std::cmp::Ordering {
78 self.line
79 .cmp(&other.line)
80 .then_with(|| self.column.utf8_offset.cmp(&other.column.utf8_offset))
81 }
82}
83
84impl PartialOrd for Position {
85 fn partial_cmp(&self, other: &Position) -> Option<std::cmp::Ordering> {
86 Some(self.cmp(other))
87 }
88}
89
90#[cfg(feature = "tree-sitter")]
91impl PartialEq<tree_sitter::Point> for Position {
92 fn eq(&self, other: &tree_sitter::Point) -> bool {
93 self.line == other.row && self.column.utf8_offset == other.column
94 }
95}
96
97#[cfg(feature = "tree-sitter")]
98impl PartialOrd<tree_sitter::Point> for Position {
99 fn partial_cmp(&self, other: &tree_sitter::Point) -> Option<std::cmp::Ordering> {
100 Some(
101 self.line
102 .cmp(&other.row)
103 .then_with(|| self.column.utf8_offset.cmp(&other.column)),
104 )
105 }
106}
107
108/// All of the position information that we have about a range of content in a source file
109#[repr(C)]
110#[derive(Clone, Debug, Default, Eq, Hash, PartialEq)]
111#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
112#[cfg_attr(feature = "bincode", derive(bincode::Encode, bincode::Decode))]
113pub struct Span {
114 pub start: Position,
115 pub end: Position,
116}
117
118impl Span {
119 pub fn contains(&self, position: &Position) -> bool {
120 &self.start <= position && &self.end > position
121 }
122
123 #[cfg(feature = "tree-sitter")]
124 pub fn contains_point(&self, point: &tree_sitter::Point) -> bool {
125 &self.start <= point && &self.end > point
126 }
127}
128
129impl Ord for Span {
130 fn cmp(&self, other: &Span) -> std::cmp::Ordering {
131 self.start
132 .cmp(&other.start)
133 .then_with(|| self.end.cmp(&other.end))
134 }
135}
136
137impl PartialOrd for Span {
138 fn partial_cmp(&self, other: &Span) -> Option<std::cmp::Ordering> {
139 Some(self.cmp(other))
140 }
141}
142
143/// The offset of a character within a string (typically a line of source code), using several
144/// different units
145///
146/// All offsets are 0-indexed.
147#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
148#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
149#[cfg_attr(feature = "bincode", derive(bincode::Encode, bincode::Decode))]
150pub struct Offset {
151 /// The number of UTF-8-encoded bytes appearing before this character in the string
152 pub utf8_offset: usize,
153 /// The number of UTF-16 code units appearing before this character in the string
154 pub utf16_offset: usize,
155 /// The number of graphemes appearing before this character in the string
156 pub grapheme_offset: usize,
157}
158
159impl Offset {
160 /// Calculates the length of a string, expressed as the position of the non-existent character
161 /// after the end of the line.
162 pub fn string_length(string: &str) -> Offset {
163 Offset {
164 utf8_offset: string.len(),
165 utf16_offset: utf16_len(string),
166 grapheme_offset: grapheme_len(string),
167 }
168 }
169
170 /// Calculates the offset of each character within a string. Typically the string will contain
171 /// a single line of text, in which case the results are column offsets. (In this case, the
172 /// string should not contain any newlines, though we don't verify this.)
173 ///
174 /// Each character's offset is returned both as the byte offset from the beginning of the line,
175 /// as well as the number of UTF-16 code units before the character in the line. (This is the
176 /// column unit used by the [Language Server Protocol][lsp-utf16].)
177 ///
178 /// The result is an iterator of offsets, one for each character. The results will be sorted,
179 /// so you can collect them into a `Vec` and use `binary_search_by_key` to look for particular
180 /// characters.
181 ///
182 /// [lsp-utf16]: https://microsoft.github.io/language-server-protocol/specification#textDocuments
183 pub fn all_chars(line: &str) -> impl Iterator<Item = Offset> + '_ {
184 let mut grapheme_utf8_offsets = line
185 .grapheme_indices(true)
186 .map(|(utf8_offset, cluster)| Range {
187 start: utf8_offset,
188 end: utf8_offset + cluster.len(),
189 })
190 .peekable();
191 // We want the output to include an entry for the end of the string — i.e., for the byte
192 // offset immediately after the last character of the string. To do this, we add a dummy
193 // character to list of actual characters from the string.
194 line.chars()
195 .chain(std::iter::once(' '))
196 .scan(Offset::default(), move |offset, ch| {
197 let result = Some(*offset);
198 // If there is no next grapheme, we assume it is the extra ' ' that was chained
199 if grapheme_utf8_offsets
200 .peek()
201 .map(|r| r.start == offset.utf8_offset)
202 .unwrap_or(true)
203 {
204 grapheme_utf8_offsets.next();
205 offset.grapheme_offset += 1;
206 }
207 offset.utf8_offset += ch.len_utf8();
208 offset.utf16_offset += ch.len_utf16();
209 result
210 })
211 }
212}
213
214/// A substring and information about where that substring occurs in a larger string. (Most often,
215/// this is a “line” and information about where that line occurs within a “file”.)
216#[derive(Clone)]
217pub struct PositionedSubstring<'a> {
218 /// The content of the substring
219 pub content: &'a str,
220 /// The UTF-8 byte offsets of the beginning and end of the substring within the larger string
221 pub utf8_bounds: Range<usize>,
222 /// The number of UTF-16 code units in the substring
223 pub utf16_length: usize,
224 /// The number of graphemes in the substring
225 pub grapheme_length: usize,
226}
227
228impl<'a> PositionedSubstring<'a> {
229 /// Constructs a new positioned substring. You must provide the larger string, and the byte
230 /// range of the desired substring.
231 pub fn from_range(string: &'a str, utf8_bounds: Range<usize>) -> PositionedSubstring<'a> {
232 let substring = &string[utf8_bounds.clone()];
233 PositionedSubstring {
234 content: substring,
235 utf8_bounds,
236 utf16_length: utf16_len(substring),
237 grapheme_length: grapheme_len(substring),
238 }
239 }
240
241 /// Constructs a new positioned substring for a newline-terminated line within a file. You
242 /// provide the byte offset of the start of the line, and we automatically find the end of the
243 /// line.
244 pub fn from_line(string: &'a str, line_utf8_offset: usize) -> PositionedSubstring<'a> {
245 // The line's byte index lets us trim all preceding lines in the file.
246 let line_plus_others = &string[line_utf8_offset..];
247
248 // The requested line stops at the first newline, or at the end of the file if there aren't
249 // any newlines.
250 let line = match memchr(b'\n', line_plus_others.as_bytes()) {
251 Some(newline_offset) => &line_plus_others[..newline_offset],
252 None => line_plus_others,
253 };
254
255 let length = Offset::string_length(line);
256 PositionedSubstring {
257 content: line,
258 utf8_bounds: Range {
259 start: line_utf8_offset,
260 end: line_utf8_offset + length.utf8_offset,
261 },
262 utf16_length: length.utf16_offset,
263 grapheme_length: length.grapheme_offset,
264 }
265 }
266
267 // Returns an iterator over the lines of the given string.
268 pub fn lines_iter(string: &'a str) -> impl Iterator<Item = PositionedSubstring<'a>> + 'a {
269 let mut next_utf8_offset = 0;
270 std::iter::from_fn(move || {
271 if string.len() <= next_utf8_offset {
272 return None;
273 }
274 let next = PositionedSubstring::from_line(string, next_utf8_offset);
275 next_utf8_offset = next.utf8_bounds.end + 1;
276 Some(next)
277 })
278 }
279
280 /// Trims ASCII whitespace from both ends of a substring.
281 pub fn trim_whitespace(&mut self) {
282 let leading_whitespace = self
283 .content
284 .bytes()
285 .enumerate()
286 .find(|(_, ch)| !(*ch as char).is_ascii_whitespace())
287 .map(|(index, _)| index)
288 .unwrap_or(self.content.len());
289 let left_whitespace = &self.content[0..leading_whitespace];
290 let trimmed_left = &self.content[leading_whitespace..];
291
292 let trailing_whitespace = trimmed_left
293 .bytes()
294 .enumerate()
295 // Point at the last non-whitespace character
296 .rfind(|(_, ch)| !(*ch as char).is_ascii_whitespace())
297 // Point at the immediately following whitespace character. Note we are only looking
298 // for _ASCII_ whitespace, so we can assume that the last whitespace character that we
299 // found is 1 byte long.
300 .map(|(index, _)| index + 1)
301 .unwrap_or(0);
302 let trimmed = &trimmed_left[0..trailing_whitespace];
303 let right_whitespace = &trimmed_left[trailing_whitespace..];
304
305 self.content = trimmed;
306 self.utf8_bounds.start += left_whitespace.len();
307 self.utf8_bounds.end -= right_whitespace.len();
308 self.utf16_length -= utf16_len(left_whitespace);
309 self.utf16_length -= utf16_len(right_whitespace);
310 self.grapheme_length -= grapheme_len(left_whitespace);
311 self.grapheme_length -= grapheme_len(right_whitespace);
312 }
313}
314
315/// Automates the construction of [`Span`][] instances for content within a string.
316pub struct SpanCalculator<'a> {
317 string: &'a str,
318 containing_line: Option<PositionedSubstring<'a>>,
319 trimmed_line: Option<PositionedSubstring<'a>>,
320 columns: Vec<Offset>,
321}
322
323// Note that each time you calculate the position of a node on a _different line_, we have to
324// calculate some information about line. You'd think that would mean it would be most efficient
325// to use this type if you made to sure group all of your nodes by their rows before asking for us
326// to create Spans for them. However, it turns out that sorting your nodes to make sure that
327// they're in row order is just as much work as recalculating the UTF16 column offsets if we ever
328// revisit a line!
329
330impl<'a> SpanCalculator<'a> {
331 /// Creates a new span calculator for locations within the given string.
332 pub fn new(string: &'a str) -> SpanCalculator<'a> {
333 SpanCalculator {
334 string,
335 containing_line: None,
336 trimmed_line: None,
337 columns: Vec::new(),
338 }
339 }
340
341 /// Constructs a [`Position`][] instance for a particular line and column in the string.
342 /// You must provide the 0-indexed line number, the byte offset of the line within the string,
343 /// and the UTF-8 byte offset of the character within the line.
344 pub fn for_line_and_column(
345 &mut self,
346 line: usize,
347 line_utf8_offset: usize,
348 column_utf8_offset: usize,
349 ) -> Position {
350 self.replace_current_line(line_utf8_offset);
351 Position {
352 line: line,
353 column: *self.for_utf8_offset(column_utf8_offset),
354 containing_line: self.containing_line.as_ref().unwrap().utf8_bounds.clone(),
355 trimmed_line: self.trimmed_line.as_ref().unwrap().utf8_bounds.clone(),
356 }
357 }
358
359 /// Constructs a [`Span`][] instance for a tree-sitter node.
360 #[cfg(feature = "tree-sitter")]
361 pub fn for_node(&mut self, node: &tree_sitter::Node) -> Span {
362 let start = self.position_for_node(node.start_byte(), node.start_position());
363 let end = self.position_for_node(node.end_byte(), node.end_position());
364 Span { start, end }
365 }
366
367 /// Constructs a [`Position`][] instance for a tree-sitter location.
368 #[cfg(feature = "tree-sitter")]
369 pub fn position_for_node(
370 &mut self,
371 byte_offset: usize,
372 position: tree_sitter::Point,
373 ) -> Position {
374 // Since we know the byte offset of the node within the file, and of the node within the
375 // line, subtracting gives us the offset of the line within the file.
376 let line_utf8_offset = byte_offset - position.column;
377 self.for_line_and_column(position.row, line_utf8_offset, position.column)
378 }
379
380 /// Constructs a [`Position`][] instance for a particular line and column in the string.
381 /// You must provide the 0-indexed line number, the byte offset of the line within the string,
382 /// and the grapheme offset of the character within the line.
383 pub fn for_line_and_grapheme(
384 &mut self,
385 line: usize,
386 line_utf8_offset: usize,
387 column_grapheme_offset: usize,
388 ) -> Position {
389 self.replace_current_line(line_utf8_offset);
390 Position {
391 line: line,
392 column: *self.for_grapheme_offset(column_grapheme_offset),
393 containing_line: self.containing_line.as_ref().unwrap().utf8_bounds.clone(),
394 trimmed_line: self.trimmed_line.as_ref().unwrap().utf8_bounds.clone(),
395 }
396 }
397
398 /// Updates our internal state to represent the information about the line that starts at a
399 /// particular byte offset within the file.
400 fn replace_current_line(&mut self, line_utf8_offset: usize) {
401 if let Some(containing_line) = &self.containing_line {
402 if containing_line.utf8_bounds.start == line_utf8_offset {
403 return;
404 }
405 }
406 let line = PositionedSubstring::from_line(self.string, line_utf8_offset);
407 self.columns.clear();
408 self.columns.extend(Offset::all_chars(line.content));
409 let mut trimmed = line.clone();
410 trimmed.trim_whitespace();
411 self.containing_line = Some(line);
412 self.trimmed_line = Some(trimmed);
413 }
414
415 /// Returns the offset of the character at a particular UTF-8 offset in the line.
416 /// Assumes that you've already called `replace_current_line` for the containing line.
417 fn for_utf8_offset(&self, utf8_offset: usize) -> &Offset {
418 let index = self
419 .columns
420 .binary_search_by_key(&utf8_offset, |pos| pos.utf8_offset)
421 .unwrap();
422 &self.columns[index]
423 }
424
425 /// Returns the offset of the character at a particular grapheme offset in the line.
426 /// Assumes that you've already called `replace_current_line` for the containing line.
427 fn for_grapheme_offset(&self, grapheme_offset: usize) -> &Offset {
428 let mut index = self
429 .columns
430 .binary_search_by_key(&grapheme_offset, |pos| pos.grapheme_offset)
431 .unwrap();
432 // make sure to return the first offset for this grapheme
433 let mut offset = &self.columns[index];
434 while index > 0 {
435 index -= 1;
436 let prev_offset = &self.columns[index];
437 if prev_offset.grapheme_offset != offset.grapheme_offset {
438 break;
439 }
440 offset = prev_offset;
441 }
442 offset
443 }
444}