Skip to main content

bibtex_parser/
source.rs

1//! Source identity and byte-to-line location utilities.
2
3use crate::{SourceId, SourceSpan};
4use std::borrow::Cow;
5
6/// A line-indexed view of a parsed source.
7///
8/// Byte offsets are zero-based UTF-8 offsets. Lines and columns are one-based,
9/// and columns count Unicode scalar values, not bytes. End positions point to
10/// the location immediately after the covered byte range.
11#[derive(Debug, Clone)]
12pub struct SourceMap<'a> {
13    source: Option<SourceId>,
14    name: Option<Cow<'a, str>>,
15    input: &'a str,
16    line_starts: Vec<usize>,
17    line_ascii: Vec<bool>,
18}
19
20impl<'a> SourceMap<'a> {
21    /// Create an anonymous source map.
22    #[must_use]
23    pub fn anonymous(input: &'a str) -> Self {
24        Self::new(None, None, input)
25    }
26
27    /// Create a source map with a document-local source identifier and optional name.
28    #[must_use]
29    pub fn new(source: Option<SourceId>, name: Option<Cow<'a, str>>, input: &'a str) -> Self {
30        let line_capacity = estimate_line_capacity(input.len());
31        if input.is_ascii() {
32            let mut line_starts = Vec::with_capacity(line_capacity);
33            line_starts.push(0);
34            line_starts.extend(memchr::memchr_iter(b'\n', input.as_bytes()).map(|index| index + 1));
35            let line_ascii = vec![true; line_starts.len()];
36
37            return Self {
38                source,
39                name,
40                input,
41                line_starts,
42                line_ascii,
43            };
44        }
45
46        let mut line_starts = Vec::with_capacity(line_capacity);
47        let mut line_ascii = Vec::with_capacity(line_capacity);
48        let mut current_line_ascii = true;
49        line_starts.push(0);
50        for (index, byte) in input.bytes().enumerate() {
51            if byte >= 0x80 {
52                current_line_ascii = false;
53            }
54            if byte == b'\n' {
55                line_ascii.push(current_line_ascii);
56                line_starts.push(index + 1);
57                current_line_ascii = true;
58            }
59        }
60        line_ascii.push(current_line_ascii);
61
62        Self {
63            source,
64            name,
65            input,
66            line_starts,
67            line_ascii,
68        }
69    }
70
71    /// Return this source's identifier, when it has one.
72    #[must_use]
73    pub const fn source_id(&self) -> Option<SourceId> {
74        self.source
75    }
76
77    /// Return this source's caller-provided name.
78    #[must_use]
79    pub fn name(&self) -> Option<&str> {
80        self.name.as_deref()
81    }
82
83    /// Return the number of bytes in the source.
84    #[must_use]
85    pub const fn len(&self) -> usize {
86        self.input.len()
87    }
88
89    /// Return the underlying source text.
90    #[must_use]
91    pub const fn input(&self) -> &'a str {
92        self.input
93    }
94
95    /// Return true when this source is empty.
96    #[must_use]
97    pub const fn is_empty(&self) -> bool {
98        self.input.is_empty()
99    }
100
101    /// Return the line and column for a byte offset.
102    ///
103    /// Offsets past end-of-file are clamped to the end of the source.
104    #[must_use]
105    pub fn line_column(&self, byte: usize) -> (usize, usize) {
106        let byte = byte.min(self.input.len());
107        let line_index = match self.line_starts.binary_search(&byte) {
108            Ok(index) => index,
109            Err(0) => 0,
110            Err(index) => index - 1,
111        };
112        let line_start = self.line_starts[line_index];
113        let column = if self.line_ascii.get(line_index).copied().unwrap_or(false) {
114            byte - line_start + 1
115        } else {
116            self.input[line_start..byte].chars().count() + 1
117        };
118        (line_index + 1, column)
119    }
120
121    /// Return the byte offset for a one-based line and column.
122    ///
123    /// Columns count Unicode scalar values. A column one past the end of the
124    /// line resolves to the line-end byte offset.
125    #[must_use]
126    pub fn byte_at_line_column(&self, line: usize, column: usize) -> Option<usize> {
127        if line == 0 || column == 0 {
128            return None;
129        }
130        let line_start = *self.line_starts.get(line - 1)?;
131        let line_end = self
132            .line_starts
133            .get(line)
134            .map_or(self.input.len(), |next| next.saturating_sub(1));
135        let line_text = self.input.get(line_start..line_end)?;
136        if column == 1 {
137            return Some(line_start);
138        }
139        let mut current_column = 1usize;
140        for (offset, _) in line_text.char_indices() {
141            if current_column == column {
142                return Some(line_start + offset);
143            }
144            current_column += 1;
145        }
146        if current_column == column {
147            Some(line_end)
148        } else {
149            None
150        }
151    }
152
153    /// Create a source span for a byte range.
154    ///
155    /// The range is clamped to the source length. The returned span keeps the
156    /// half-open byte offsets and one-based start/end line-column positions.
157    #[must_use]
158    pub fn span(&self, byte_start: usize, byte_end: usize) -> SourceSpan {
159        let byte_start = byte_start.min(self.input.len());
160        let byte_end = byte_end.min(self.input.len()).max(byte_start);
161        let (line, column) = self.line_column(byte_start);
162        let (end_line, end_column) = self.line_column(byte_end);
163        let span = SourceSpan::with_end(byte_start, byte_end, line, column, end_line, end_column);
164        self.source.map_or(span, |source| span.with_source(source))
165    }
166
167    pub(crate) const fn cursor(&self) -> SourceCursor<'_, 'a> {
168        SourceCursor {
169            map: self,
170            line_index: 0,
171        }
172    }
173
174    /// Return a borrowed slice for a source span when it belongs to this source.
175    #[must_use]
176    pub fn slice(&self, span: SourceSpan) -> Option<&'a str> {
177        if span.source.is_some() && span.source != self.source {
178            return None;
179        }
180        self.input.get(span.byte_start..span.byte_end)
181    }
182
183    /// Return a short line-oriented snippet for a span.
184    #[must_use]
185    pub fn snippet(&self, span: SourceSpan, max_chars: usize) -> Option<String> {
186        if span.source.is_some() && span.source != self.source {
187            return None;
188        }
189
190        let anchor_start = if span.is_empty() && span.byte_start > 0 {
191            span.byte_start - 1
192        } else {
193            span.byte_start
194        };
195        let anchor_end = if span.is_empty() && span.byte_end > 0 {
196            span.byte_end - 1
197        } else {
198            span.byte_end
199        };
200
201        let start = self.input[..anchor_start]
202            .rfind('\n')
203            .map_or(0, |index| index + 1);
204        let end = self.input[anchor_end..]
205            .find('\n')
206            .map_or(self.input.len(), |index| anchor_end + index);
207        let snippet = self.input.get(start..end)?;
208
209        if snippet.chars().count() <= max_chars {
210            return Some(snippet.to_string());
211        }
212
213        Some(snippet.chars().take(max_chars).collect())
214    }
215}
216
/// Guess how many line-start entries to reserve for `input_len` bytes of input.
///
/// The guess assumes about one line per 64 bytes and is kept between one
/// entry and one million entries, so the up-front reservation stays bounded
/// however large the input is.
fn estimate_line_capacity(input_len: usize) -> usize {
    const ASSUMED_BYTES_PER_LINE: usize = 64;
    const MIN_RESERVED_LINES: usize = 1;
    const MAX_RESERVED_LINES: usize = 1_000_000;

    let guessed_lines = input_len / ASSUMED_BYTES_PER_LINE;
    guessed_lines.clamp(MIN_RESERVED_LINES, MAX_RESERVED_LINES)
}
220
221pub(crate) struct SourceCursor<'map, 'source> {
222    map: &'map SourceMap<'source>,
223    line_index: usize,
224}
225
226impl SourceCursor<'_, '_> {
227    pub(crate) fn span(&mut self, byte_start: usize, byte_end: usize) -> SourceSpan {
228        let byte_start = byte_start.min(self.map.input.len());
229        let byte_end = byte_end.min(self.map.input.len()).max(byte_start);
230        let start_line_index = self.line_index_at(byte_start);
231        let end_line_index = self.line_index_from(start_line_index, byte_end);
232        let column = self.column_at(start_line_index, byte_start);
233        let end_column = self.column_at(end_line_index, byte_end);
234        let span = SourceSpan::with_end(
235            byte_start,
236            byte_end,
237            start_line_index + 1,
238            column,
239            end_line_index + 1,
240            end_column,
241        );
242        self.map
243            .source
244            .map_or(span, |source| span.with_source(source))
245    }
246
247    fn line_index_at(&mut self, byte: usize) -> usize {
248        if self
249            .map
250            .line_starts
251            .get(self.line_index)
252            .is_some_and(|start| byte < *start)
253        {
254            self.line_index = self.map.line_index_for(byte);
255            return self.line_index;
256        }
257
258        while self
259            .map
260            .line_starts
261            .get(self.line_index + 1)
262            .is_some_and(|next| *next <= byte)
263        {
264            self.line_index += 1;
265        }
266
267        self.line_index
268    }
269
270    fn line_index_from(&self, mut line_index: usize, byte: usize) -> usize {
271        if self
272            .map
273            .line_starts
274            .get(line_index)
275            .is_some_and(|start| byte < *start)
276        {
277            return self.map.line_index_for(byte);
278        }
279
280        while self
281            .map
282            .line_starts
283            .get(line_index + 1)
284            .is_some_and(|next| *next <= byte)
285        {
286            line_index += 1;
287        }
288
289        line_index
290    }
291
292    fn column_at(&self, line_index: usize, byte: usize) -> usize {
293        let line_start = self.map.line_starts[line_index];
294        if self
295            .map
296            .line_ascii
297            .get(line_index)
298            .copied()
299            .unwrap_or(false)
300        {
301            byte - line_start + 1
302        } else {
303            self.map.input[line_start..byte].chars().count() + 1
304        }
305    }
306}
307
308impl SourceMap<'_> {
309    fn line_index_for(&self, byte: usize) -> usize {
310        match self.line_starts.binary_search(&byte) {
311            Ok(index) => index,
312            Err(0) => 0,
313            Err(index) => index - 1,
314        }
315    }
316}