libgraphql_parser/source_map.rs
1use crate::ByteSpan;
2use crate::SourceSpan;
3use crate::SourcePosition;
4use std::path::Path;
5use std::path::PathBuf;
6
/// Internal storage mode for position resolution.
///
/// Which variant is used depends on how the owning `SourceMap` was
/// constructed: `new_with_source` produces `SourceText`, while
/// `new_precomputed` (and `empty`) produce `PrecomputedColumns`.
#[derive(Clone)]
enum SourceMapData<'src> {
    /// Source-text mode: positions are resolved on demand by binary-searching
    /// `line_starts` and counting chars from the line start offset.
    SourceText {
        /// The borrowed source text; `'src` ties this data to the caller's
        /// string for the lifetime of the `SourceMap`.
        source: &'src str,
        /// Byte offset of the start of each line, in ascending order.
        /// Built by `SourceMap::compute_line_starts`, which always pushes
        /// offset 0 for line 0, so this is never empty in practice.
        line_starts: Vec<u32>,
    },

    /// Pre-computed columns mode: positions were pushed by the token source
    /// during lexing. Lookups binary-search the sorted offset table.
    PrecomputedColumns {
        /// Sorted by byte offset (first element of each tuple). Entries are
        /// inserted in lexing order, which is monotonically increasing.
        entries: Vec<(u32, SourcePosition)>,
    },
}
25
26impl<'src> SourceMapData<'src> {
27 /// Resolves a byte offset to a [`SourcePosition`], dispatching to the
28 /// appropriate mode-specific implementation.
29 ///
30 /// Returns `None` if the offset cannot be resolved (e.g., offset out of
31 /// bounds, or no pre-computed entries available).
32 fn resolve_offset(
33 &self,
34 byte_offset: u32,
35 ) -> Option<SourcePosition> {
36 match self {
37 Self::SourceText {
38 source,
39 line_starts,
40 } => {
41 let offset = byte_offset as usize;
42
43 if offset > source.len() {
44 return None;
45 }
46
47 // partition_point returns the first index where
48 // line_starts[i] > byte_offset, so the line index is
49 // one less.
50 let line_index =
51 line_starts.partition_point(|&ls| ls <= byte_offset);
52 let line =
53 if line_index > 0 { line_index - 1 } else { 0 };
54 let line_start = line_starts[line] as usize;
55
56 // Guard against byte offsets that land in the
57 // middle of a multibyte UTF-8 sequence. In normal
58 // use, offsets come from the lexer and always land
59 // on char boundaries, but manually-constructed
60 // ByteSpans could violate this.
61 if !source.is_char_boundary(offset) {
62 return None;
63 }
64
65 // Count Unicode scalar values and UTF-16 code units
66 // from line start to the target byte offset.
67 //
68 // TODO: `col_utf8` in SourcePosition counts Unicode
69 // scalar values (what `str::chars()` yields), NOT
70 // UTF-8 bytes. The name is inherited from
71 // SourcePosition and is misleading — consider renaming
72 // to `col_char` or `col_scalar` in a future cleanup.
73 let line_slice = &source[line_start..offset];
74 let (col_utf8, col_utf16) = if line_slice.is_ascii() {
75 // Fast path: all ASCII — each byte is exactly
76 // one Unicode scalar value and one UTF-16 code
77 // unit. `str::is_ascii()` uses SSE2
78 // `pmovmskb` on x86_64, checking 64 bytes per
79 // iteration, so the branch is very cheap for
80 // the common case of ASCII-only GraphQL source.
81 let len = line_slice.len();
82 (len, len)
83 } else {
84 // Slow path: multibyte UTF-8 chars present —
85 // walk char-by-char to count scalars and
86 // UTF-16 code units.
87 let mut utf8: usize = 0;
88 let mut utf16: usize = 0;
89 for ch in line_slice.chars() {
90 utf8 += 1;
91 utf16 += ch.len_utf16();
92 }
93 (utf8, utf16)
94 };
95
96 Some(SourcePosition::new(
97 line,
98 col_utf8,
99 Some(col_utf16),
100 offset,
101 ))
102 },
103 Self::PrecomputedColumns { entries } => {
104 if entries.is_empty() {
105 return None;
106 }
107
108 // Floor lookup: find the largest entry offset <=
109 // byte_offset.
110 let idx = entries
111 .partition_point(|&(off, _)| off <= byte_offset);
112
113 if idx > 0 {
114 Some(entries[idx - 1].1)
115 } else {
116 None
117 }
118 },
119 }
120 }
121
122 /// Returns the source text, if this is source-text-mode data.
123 fn source(&self) -> Option<&'src str> {
124 match self {
125 Self::SourceText { source, .. } => Some(source),
126 Self::PrecomputedColumns { .. } => None,
127 }
128 }
129}
130
/// Provides utilities for mapping [`ByteSpan`]s to [`SourceSpan`]s,
/// byte offsets to [`SourcePosition`]s, and extracting content from the
/// original source text.
///
/// The `'src` lifetime ties the `SourceMap` to the source text it was built
/// from.
///
/// `SourceMap` is a key part of what makes `libgraphql-parser` fast.
/// The lexer and parser operate exclusively on compact `u32` byte
/// offsets (stored in [`ByteSpan`]s), deferring line/column
/// computation until it is actually needed — typically only for
/// error formatting or tooling & IDE features. `SourceMap` is the mechanism
/// that makes this deferred resolution possible, translating raw
/// byte offsets back to human-readable [`SourcePosition`]s on
/// demand.
///
/// Construction of `SourceMap`s is typically owned by a
/// [`GraphQLTokenSource`](crate::token::GraphQLTokenSource) and uses one of two
/// modes of data-fill: [`SourceMap::new_with_source`] or
/// [`SourceMap::new_precomputed`].
///
/// ### Construction with source text ([`SourceMap::new_with_source`])
///
/// An O(source_len) (SIMD-accelerated) pre-pass scans the source string for
/// line terminators (`\n`, `\r`, `\r\n`) and records the byte offset of each
/// line start. Individual position lookups via [`SourceMap::resolve_offset`]
/// later are then O(log n) binary searches on the line-start table, plus a
/// short char-counting walk from line start to target byte offset to compute
/// UTF-8 and UTF-16 column values.
///
/// This mode is used by
/// [`StrGraphQLTokenSource`](crate::token::StrGraphQLTokenSource),
/// which has direct access to the source string. It is memory-efficient
/// (only one `u32` per line) and avoids as much per-token bookkeeping as
/// possible during lexing — the lexer only tracks a single `curr_byte_offset`
/// and defers all line/column computation.
///
/// ### Pre-Computed Columns Mode ([`SourceMap::new_precomputed`])
///
/// Some token sources do not have access to the underlying source text at
/// resolution time. For example,
/// [`libgraphql_macros::RustMacroGraphQLTokenSource`](https://docs.rs/libgraphql-macros)
/// produces tokens from a `proc_macro2::TokenStream`. Each `proc_macro2::Span`
/// carries line/column information at the time the token is produced, but there
/// is no contiguous source `&str` to scan after the fact. In this mode, the
/// token source collects `(byte_offset, SourcePosition)` entries during lexing
/// and then constructs the `SourceMap` at the end by passing the entries to
/// [`new_precomputed()`](Self::new_precomputed). Later, lookups will
/// binary-search that entries table.
///
/// This mode uses more memory (one full `SourcePosition` per inserted
/// offset, rather than one `u32` per line), but lookups are O(log n) with
/// no char-counting walk — just a binary search and a direct return.
///
/// When a `SourceMap` is constructed in pre-computed mode, the `'src` lifetime
/// is typically `'static` since no source text is borrowed.
///
/// # UTF-16 Column Recovery
///
/// In source-text mode, UTF-16 columns are computed on demand by iterating
/// chars from the line start to the target byte offset and summing
/// [`char::len_utf16()`]. In pre-computed columns mode, UTF-16 columns are
/// whatever the token source provided (or `None` if the token source cannot
/// compute them).
#[derive(Clone)]
pub struct SourceMap<'src> {
    /// The resolution data — either source-text-backed or pre-computed.
    data: SourceMapData<'src>,

    /// Optional file path for the source text.
    file_path: Option<PathBuf>,
}
203
204impl<'src> std::fmt::Debug for SourceMap<'src> {
205 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
206 let mode = match &self.data {
207 SourceMapData::SourceText { line_starts, .. } => {
208 format!("SourceText({} lines)", line_starts.len())
209 },
210 SourceMapData::PrecomputedColumns { entries } => {
211 format!("PrecomputedColumns({} entries)", entries.len())
212 },
213 };
214 f.debug_struct("SourceMap")
215 .field("mode", &mode)
216 .field("file_path", &self.file_path)
217 .finish()
218 }
219}
220
221impl<'src> SourceMap<'src> {
222 /// Builds a `SourceMap` in source-text mode by scanning `source` for
223 /// line terminators.
224 ///
225 /// This is an O(n) pre-pass that identifies all line start byte offsets.
226 /// Line terminators recognized: `\n`, `\r`, `\r\n` (the pair counts as
227 /// one terminator).
228 pub fn new_with_source(
229 source: &'src str,
230 file_path: Option<PathBuf>,
231 ) -> Self {
232 let line_starts = Self::compute_line_starts(source);
233 Self {
234 data: SourceMapData::SourceText {
235 source,
236 line_starts,
237 },
238 file_path,
239 }
240 }
241
242 /// Creates a `SourceMap` in pre-computed columns mode.
243 ///
244 /// The `entries` parameter contains `(byte_offset, SourcePosition)`
245 /// pairs that were collected during lexing. Entries must be sorted
246 /// by byte offset in monotonically increasing order (which is
247 /// naturally the case when collected during a left-to-right lex
248 /// pass).
249 ///
250 /// This mode is intended for token sources that know line/column
251 /// information at lex time but do not have access to the underlying
252 /// source text afterward.
253 ///
254 /// See the [type-level documentation](Self) for a detailed comparison
255 /// of the two modes.
256 ///
257 /// # Example
258 ///
259 /// ```rust
260 /// # use libgraphql_parser::SourceMap;
261 /// # use libgraphql_parser::SourcePosition;
262 /// # let byte_offset = 0;
263 /// # let position = SourcePosition::new(0, 0, None, 0);
264 /// // During lexing, collect entries into a Vec:
265 /// let mut entries = Vec::new();
266 /// entries.push((byte_offset, position));
267 /// // After lexing, build the SourceMap:
268 /// let source_map = SourceMap::new_precomputed(entries, None);
269 /// ```
270 pub fn new_precomputed(
271 entries: Vec<(u32, SourcePosition)>,
272 file_path: Option<PathBuf>,
273 ) -> Self {
274 debug_assert!(
275 entries.windows(2).all(|w| w[0].0 <= w[1].0),
276 "new_precomputed entries must be sorted by byte offset",
277 );
278 Self {
279 data: SourceMapData::PrecomputedColumns { entries },
280 file_path,
281 }
282 }
283
284 /// Creates an empty `SourceMap` that cannot resolve any offsets.
285 ///
286 /// Useful for token sources that don't have source text (e.g.
287 /// proc-macro token sources or test mocks).
288 pub fn empty() -> Self {
289 Self {
290 data: SourceMapData::PrecomputedColumns {
291 entries: Vec::new(),
292 },
293 file_path: None,
294 }
295 }
296
297 /// Returns the source text, if this is a source-text-mode `SourceMap`.
298 pub fn source(&self) -> Option<&'src str> {
299 self.data.source()
300 }
301
302 /// Returns the file path, if available.
303 pub fn file_path(&self) -> Option<&Path> {
304 self.file_path.as_deref()
305 }
306
307 /// Resolves a byte offset to a full [`SourcePosition`] (line, col_utf8,
308 /// col_utf16, byte_offset).
309 ///
310 /// Returns `None` if the offset cannot be resolved — for example, if
311 /// the byte offset is out of bounds (source-text mode) or if no
312 /// pre-computed entries cover the requested offset.
313 ///
314 /// # Source-text mode
315 ///
316 /// Uses binary search on `line_starts` to find the line, then counts
317 /// chars from the line start to compute columns.
318 ///
319 /// # Pre-computed columns mode
320 ///
321 /// Binary-searches the pre-computed entries for the largest byte offset
322 /// `<=` the requested offset (floor lookup). If the requested offset
323 /// falls between two entries, the earlier entry's position is returned
324 /// (this handles lookups for byte offsets mid-token, returning the
325 /// position of the nearest preceding entry).
326 pub fn resolve_offset(
327 &self,
328 byte_offset: u32,
329 ) -> Option<SourcePosition> {
330 self.data.resolve_offset(byte_offset)
331 }
332
333 /// Resolves a [`ByteSpan`] to a full [`SourceSpan`] with
334 /// line/column information and file path.
335 ///
336 /// Returns `None` if either endpoint of the span cannot be
337 /// resolved. Common scenarios where resolution fails:
338 ///
339 /// - **Empty `SourceMap`** (`SourceMap::empty()`): no entries
340 /// exist, so no offset can be resolved.
341 /// - **Out-of-bounds offset**: the byte offset exceeds the
342 /// source text length (source-text mode) or falls before the
343 /// first pre-computed entry (pre-computed columns mode).
344 /// - **Mid-UTF-8 offset**: the byte offset lands in the middle
345 /// of a multi-byte UTF-8 character (source-text mode only).
346 ///
347 /// For error reporting, the parser's internal `resolve_span()`
348 /// wrapper falls back to `SourceSpan::zero()` when this method
349 /// returns `None`. For AST tooling that needs accurate
350 /// positions, callers should handle the `None` case explicitly.
351 pub fn resolve_span(
352 &self,
353 span: ByteSpan,
354 ) -> Option<SourceSpan> {
355 let start = self.data.resolve_offset(span.start)?;
356 let end = self.data.resolve_offset(span.end)?;
357 Some(match &self.file_path {
358 Some(path) => {
359 SourceSpan::with_file(start, end, path.clone())
360 },
361 None => SourceSpan::new(start, end),
362 })
363 }
364
365 /// Returns the content of the line at the given 0-based line index,
366 /// stripped of any trailing line terminator (`\n`, `\r`, `\r\n`).
367 ///
368 /// Returns `None` if this is not a source-text-mode `SourceMap`, or if
369 /// `line_index` is out of bounds.
370 ///
371 /// This uses the `line_starts` table built by
372 /// `compute_line_starts()`, which correctly recognizes bare
373 /// `\r` as a line terminator per the GraphQL spec. Code that needs to
374 /// extract line content should use this method rather than
375 /// [`str::lines()`], which does **not** handle bare `\r`.
376 ///
377 /// Note: `graphql_parse_error::get_line()` provides similar
378 /// functionality via a linear scan (no pre-computed table).
379 /// Both must use the same line-terminator semantics.
380 pub fn get_line(&self, line_index: usize) -> Option<&'src str> {
381 match &self.data {
382 SourceMapData::SourceText { source, line_starts } => {
383 if line_index >= line_starts.len() {
384 return None;
385 }
386 let start = line_starts[line_index] as usize;
387 let end = if line_index + 1 < line_starts.len() {
388 line_starts[line_index + 1] as usize
389 } else {
390 source.len()
391 };
392 let line = &source[start..end];
393 // Strip trailing line terminator(s)
394 let line = line.strip_suffix("\r\n")
395 .or_else(|| line.strip_suffix('\n'))
396 .or_else(|| line.strip_suffix('\r'))
397 .unwrap_or(line);
398 Some(line)
399 },
400 SourceMapData::PrecomputedColumns { .. } => None,
401 }
402 }
403
404 /// Returns the number of lines in the source text.
405 ///
406 /// In source-text mode, this is the number of line-start entries
407 /// computed during construction. In pre-computed columns mode, this
408 /// is derived from the highest line number seen in the entries
409 /// (plus one). Returns 0 if no entries have been inserted.
410 pub fn line_count(&self) -> usize {
411 match &self.data {
412 SourceMapData::SourceText { line_starts, .. } => {
413 line_starts.len()
414 },
415 SourceMapData::PrecomputedColumns { entries } => {
416 entries.last()
417 .map(|(_, pos)| pos.line() + 1)
418 .unwrap_or(0)
419 },
420 }
421 }
422
423 // ── Line-start computation ─────────────────────────────
424
425 /// Scans source text and returns the byte offset of the start of each
426 /// line.
427 ///
428 /// Line terminators: `\n`, `\r`, `\r\n` (the pair counts as one).
429 fn compute_line_starts(source: &str) -> Vec<u32> {
430 let bytes = source.as_bytes();
431 let len = bytes.len();
432
433 // Pre-allocate: ~40 chars per line as a rough heuristic.
434 let mut line_starts = Vec::with_capacity(1 + len / 40);
435 line_starts.push(0); // First line always starts at byte 0
436
437 // SIMD-accelerated scan: jump directly to the next
438 // `\n` or `\r` instead of checking every byte. Most
439 // bytes in a GraphQL document are not line terminators,
440 // so this skips large stretches at 16–32 bytes/cycle.
441 let mut i = 0;
442 while let Some(offset) = memchr::memchr2(b'\n', b'\r', &bytes[i..]) {
443 i += offset;
444 match bytes[i] {
445 b'\n' => {
446 line_starts.push((i + 1) as u32);
447 i += 1;
448 },
449 b'\r' => {
450 // \r\n is a single line terminator
451 if i + 1 < len && bytes[i + 1] == b'\n' {
452 line_starts.push((i + 2) as u32);
453 i += 2;
454 } else {
455 line_starts.push((i + 1) as u32);
456 i += 1;
457 }
458 },
459 _ => unreachable!(),
460 }
461 }
462
463 line_starts
464 }
465}