lutra_compiler/codespan/
line_numbers.rs

1use crate::codespan::{LineColumn, Range, Span};
2use std::collections::HashMap;
3
4/// A struct which contains information about line numbers of a source file,
5/// and can convert between byte offsets that are used in the compiler and
6/// line-column pairs used in LSP.
7#[derive(Debug, Clone, PartialEq, Eq)]
8pub struct LineNumbers {
9    /// The byte offsets of the start of each line of the source file
10    pub line_starts: Vec<u32>,
11    /// The total length of the source file
12    pub length: u32,
13    /// A mapping of byte offsets to character length information. This is used
14    /// when converting between byte indices and line-column numbers, because
15    /// LSP uses UTF-16, while Rust encodes strings as UTF-8.
16    ///
17    /// This only contains characters which are more than one byte in UTF-8,
18    /// because one byte UTF-8 characters are one UTF-16 segment also, so no
19    /// translation is needed.
20    ///
21    /// We could store the whole source file here instead, however that would
22    /// be quite wasteful. Most Gleam programs use only ASCII characters, meaning
23    /// UTF-8 offsets are the same as UTF-16 ones. With this representation, we
24    /// only need to store a few characters.
25    ///
26    /// In most programs this will be empty because they will only be using
27    /// ASCII characters.
28    pub mapping: HashMap<usize, CharLen>,
29}
30
/// The encoded width of one character in both UTF-8 and UTF-16.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CharLen {
    /// How many bytes the character occupies when encoded as UTF-8.
    pub length_utf8: u8,
    /// How many 16-bit code units the character occupies when encoded as
    /// UTF-16.
    pub length_utf16: u8,
}
39
40impl LineNumbers {
41    pub fn new(src: &str) -> Self {
42        Self {
43            length: src.len() as u32,
44            line_starts: std::iter::once(0)
45                .chain(src.match_indices('\n').map(|(i, _)| i as u32 + 1))
46                .collect(),
47            mapping: Self::mapping(src),
48        }
49    }
50
51    fn mapping(src: &str) -> HashMap<usize, CharLen> {
52        let mut map = HashMap::new();
53
54        for (i, char) in src.char_indices() {
55            let length = char.len_utf8();
56            if length != 1 {
57                _ = map.insert(
58                    i,
59                    CharLen {
60                        length_utf8: length as u8,
61                        length_utf16: char.len_utf16() as u8,
62                    },
63                );
64            }
65        }
66
67        map
68    }
69
70    /// Returns the 0-indexed line number of a given byte index
71    pub fn line_number(&self, byte_index: u32) -> u32 {
72        self.line_starts
73            .binary_search(&byte_index)
74            .unwrap_or_else(|next_line| next_line - 1) as u32
75    }
76
77    /// Returns the line and column of a given byte index.
78    pub fn line_and_column_number(&self, byte_index: u32) -> LineColumn {
79        let line = self.line_number(byte_index);
80        let line_start = self
81            .line_starts
82            .get(line as usize)
83            .copied()
84            .unwrap_or_default();
85
86        let mut u8_offset = line_start;
87        let mut u16_offset = 0;
88
89        loop {
90            if u8_offset >= byte_index {
91                break;
92            }
93
94            if let Some(length) = self.mapping.get(&(u8_offset as usize)) {
95                u8_offset += length.length_utf8 as u32;
96                u16_offset += length.length_utf16 as u32;
97            } else {
98                u16_offset += 1;
99                u8_offset += 1;
100            }
101        }
102
103        LineColumn {
104            line,
105            column: u16_offset,
106        }
107    }
108
109    /// Returns the starting byte index of a Unicode code point at a given line
110    /// and column.
111    pub fn byte_index_of_line_col(&self, position: LineColumn) -> u32 {
112        let line_start = match self.line_starts.get(position.line as usize) {
113            Some(&line_start) => line_start,
114            None => return self.length,
115        };
116
117        let mut u8_offset = line_start;
118        let mut u16_offset = 0;
119
120        loop {
121            if u16_offset >= position.column {
122                break;
123            }
124
125            if let Some(length) = self.mapping.get(&(u8_offset as usize)) {
126                u8_offset += length.length_utf8 as u32;
127                u16_offset += length.length_utf16 as u32;
128            } else {
129                u16_offset += 1;
130                u8_offset += 1;
131            }
132        }
133
134        u8_offset
135    }
136
137    /// Checks if the given span spans an entire line (excluding the newline
138    /// character itself).
139    pub fn spans_entire_line(&self, span: &Span) -> bool {
140        self.line_starts.iter().any(|&line_start| {
141            line_start == span.start && self.line_starts.contains(&(span.end() + 1))
142        })
143    }
144
145    /// Converts a [Range] of some source file to a [Span].
146    pub fn span_of_range(&self, range: &Range, source_id: u16) -> Span {
147        let start = self.byte_index_of_line_col(range.start);
148        let end = self.byte_index_of_line_col(range.end);
149        let len = end.saturating_sub(start) as u16;
150        Span {
151            start,
152            len,
153            source_id,
154        }
155    }
156
157    /// Converts a [Span] to a [Range].
158    pub fn range_of_span(&self, span: Span) -> Range {
159        let start = self.line_and_column_number(span.start);
160        let end = self.line_and_column_number(span.end());
161        Range { start, end }
162    }
163}
lutra_compiler/codespan/line_numbers.rs

lutra_compiler/codespan/
line_numbers.rs