// libgraphql_parser/source_map.rs
1use crate::ByteSpan;
2use crate::SourceSpan;
3use crate::SourcePosition;
4use std::path::Path;
5use std::path::PathBuf;
6
/// Internal storage mode for position resolution.
///
/// Which variant is used determines how [`SourceMapData::resolve_offset`]
/// turns a byte offset into a `SourcePosition`.
#[derive(Clone)]
enum SourceMapData<'src> {
    /// Source-text mode: positions are resolved on demand by binary-searching
    /// `line_starts` and counting chars from the line start offset.
    SourceText {
        /// The borrowed source text that byte offsets are resolved against.
        source: &'src str,
        /// Byte offset of the start of each line, in ascending order.
        /// Built by `SourceMap::compute_line_starts`, whose first entry is
        /// always 0 (the first line starts at byte 0).
        line_starts: Vec<u32>,
    },

    /// Pre-computed columns mode: positions were pushed by the token source
    /// during lexing. Lookups binary-search the sorted offset table.
    PrecomputedColumns {
        /// Sorted by byte offset (first element of each tuple). Entries are
        /// inserted in lexing order, which is monotonically increasing.
        entries: Vec<(u32, SourcePosition)>,
    },
}
25
26impl<'src> SourceMapData<'src> {
27    /// Resolves a byte offset to a [`SourcePosition`], dispatching to the
28    /// appropriate mode-specific implementation.
29    ///
30    /// Returns `None` if the offset cannot be resolved (e.g., offset out of
31    /// bounds, or no pre-computed entries available).
32    fn resolve_offset(
33        &self,
34        byte_offset: u32,
35    ) -> Option<SourcePosition> {
36        match self {
37            Self::SourceText {
38                source,
39                line_starts,
40            } => {
41                let offset = byte_offset as usize;
42
43                if offset > source.len() {
44                    return None;
45                }
46
47                // partition_point returns the first index where
48                // line_starts[i] > byte_offset, so the line index is
49                // one less.
50                let line_index =
51                    line_starts.partition_point(|&ls| ls <= byte_offset);
52                let line =
53                    if line_index > 0 { line_index - 1 } else { 0 };
54                let line_start = line_starts[line] as usize;
55
56                // Guard against byte offsets that land in the
57                // middle of a multibyte UTF-8 sequence. In normal
58                // use, offsets come from the lexer and always land
59                // on char boundaries, but manually-constructed
60                // ByteSpans could violate this.
61                if !source.is_char_boundary(offset) {
62                    return None;
63                }
64
65                // Count Unicode scalar values and UTF-16 code units
66                // from line start to the target byte offset.
67                //
68                // TODO: `col_utf8` in SourcePosition counts Unicode
69                // scalar values (what `str::chars()` yields), NOT
70                // UTF-8 bytes. The name is inherited from
71                // SourcePosition and is misleading — consider renaming
72                // to `col_char` or `col_scalar` in a future cleanup.
73                let line_slice = &source[line_start..offset];
74                let (col_utf8, col_utf16) = if line_slice.is_ascii() {
75                    // Fast path: all ASCII — each byte is exactly
76                    // one Unicode scalar value and one UTF-16 code
77                    // unit. `str::is_ascii()` uses SSE2
78                    // `pmovmskb` on x86_64, checking 64 bytes per
79                    // iteration, so the branch is very cheap for
80                    // the common case of ASCII-only GraphQL source.
81                    let len = line_slice.len();
82                    (len, len)
83                } else {
84                    // Slow path: multibyte UTF-8 chars present —
85                    // walk char-by-char to count scalars and
86                    // UTF-16 code units.
87                    let mut utf8: usize = 0;
88                    let mut utf16: usize = 0;
89                    for ch in line_slice.chars() {
90                        utf8 += 1;
91                        utf16 += ch.len_utf16();
92                    }
93                    (utf8, utf16)
94                };
95
96                Some(SourcePosition::new(
97                    line,
98                    col_utf8,
99                    Some(col_utf16),
100                    offset,
101                ))
102            },
103            Self::PrecomputedColumns { entries } => {
104                if entries.is_empty() {
105                    return None;
106                }
107
108                // Floor lookup: find the largest entry offset <=
109                // byte_offset.
110                let idx = entries
111                    .partition_point(|&(off, _)| off <= byte_offset);
112
113                if idx > 0 {
114                    Some(entries[idx - 1].1)
115                } else {
116                    None
117                }
118            },
119        }
120    }
121
122    /// Returns the source text, if this is source-text-mode data.
123    fn source(&self) -> Option<&'src str> {
124        match self {
125            Self::SourceText { source, .. } => Some(source),
126            Self::PrecomputedColumns { .. } => None,
127        }
128    }
129}
130
/// Provides utilities for mapping [`ByteSpan`]s to [`SourceSpan`]s,
/// byte offsets to [`SourcePosition`]s, and extracting content from the
/// original source text.
///
/// The `'src` lifetime ties the `SourceMap` to the source text it was built
/// from.
///
/// `SourceMap` is a key part of what makes `libgraphql-parser` fast.
/// The lexer and parser operate exclusively on compact `u32` byte
/// offsets (stored in [`ByteSpan`]s), deferring line/column
/// computation until it is actually needed — typically only for
/// error formatting or tooling & IDE features. `SourceMap` is the mechanism
/// that makes this deferred resolution possible, translating raw
/// byte offsets back to human-readable [`SourcePosition`]s on
/// demand.
///
/// Construction of `SourceMap`s is typically owned by a
/// [`GraphQLTokenSource`](crate::token::GraphQLTokenSource) and uses one of
/// two data-fill modes: [`SourceMap::new_with_source`] or
/// [`SourceMap::new_precomputed`].
///
/// ### Construction with source text ([`SourceMap::new_with_source`])
///
/// An O(source len) (SIMD-accelerated) pre-pass scans the source string for
/// line terminators (`\n`, `\r`, `\r\n`) and records the byte offset of each
/// line start. Individual position lookups via [`SourceMap::resolve_offset`]
/// later are then O(log n) binary searches on the line-start table, plus a
/// short char-counting walk from line start to target byte offset to compute
/// UTF-8 and UTF-16 column values.
///
/// This mode is used by
/// [`StrGraphQLTokenSource`](crate::token::StrGraphQLTokenSource),
/// which has direct access to the source string. It is memory-efficient
/// (only one `u32` per line) and avoids as much per-token bookkeeping as
/// possible during lexing — the lexer only tracks a single `curr_byte_offset`
/// and defers all line/column computation.
///
/// ### Pre-Computed Columns Mode ([`SourceMap::new_precomputed`])
///
/// Some token sources do not have access to the underlying source text at
/// resolution time. For example,
/// [`libgraphql_macros::RustMacroGraphQLTokenSource`](https://docs.rs/libgraphql-macros)
/// produces tokens from a `proc_macro2::TokenStream`. Each `proc_macro2::Span`
/// carries line/column information at the time the token is produced, but there
/// is no contiguous source `&str` to scan after the fact. In this mode, the
/// token source collects `(byte_offset, SourcePosition)` entries during lexing
/// and then constructs the `SourceMap` at the end by passing the entries to
/// [`new_precomputed()`](Self::new_precomputed). Later, lookups will
/// binary-search that entries table.
///
/// This mode uses more memory (one full `SourcePosition` per inserted
/// offset, rather than one `u32` per line), but lookups are O(log n) with
/// no char-counting walk — just a binary search and a direct return.
///
/// When a `SourceMap` is constructed in pre-computed mode, the `'src` lifetime
/// is typically `'static` since no source text is borrowed.
///
/// # UTF-16 Column Recovery
///
/// In source-text mode, UTF-16 columns are computed on demand by iterating
/// chars from the line start to the target byte offset and summing
/// [`char::len_utf16()`]. In pre-computed columns mode, UTF-16 columns are
/// whatever the token source provided (or `None` if the token source cannot
/// compute them).
#[derive(Clone)]
pub struct SourceMap<'src> {
    /// The resolution data — either source-text-backed or pre-computed.
    data: SourceMapData<'src>,

    /// Optional file path for the source text, surfaced in resolved
    /// [`SourceSpan`]s and by [`SourceMap::file_path`].
    file_path: Option<PathBuf>,
}
203
204impl<'src> std::fmt::Debug for SourceMap<'src> {
205    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
206        let mode = match &self.data {
207            SourceMapData::SourceText { line_starts, .. } => {
208                format!("SourceText({} lines)", line_starts.len())
209            },
210            SourceMapData::PrecomputedColumns { entries } => {
211                format!("PrecomputedColumns({} entries)", entries.len())
212            },
213        };
214        f.debug_struct("SourceMap")
215            .field("mode", &mode)
216            .field("file_path", &self.file_path)
217            .finish()
218    }
219}
220
221impl<'src> SourceMap<'src> {
222    /// Builds a `SourceMap` in source-text mode by scanning `source` for
223    /// line terminators.
224    ///
225    /// This is an O(n) pre-pass that identifies all line start byte offsets.
226    /// Line terminators recognized: `\n`, `\r`, `\r\n` (the pair counts as
227    /// one terminator).
228    pub fn new_with_source(
229        source: &'src str,
230        file_path: Option<PathBuf>,
231    ) -> Self {
232        let line_starts = Self::compute_line_starts(source);
233        Self {
234            data: SourceMapData::SourceText {
235                source,
236                line_starts,
237            },
238            file_path,
239        }
240    }
241
242    /// Creates a `SourceMap` in pre-computed columns mode.
243    ///
244    /// The `entries` parameter contains `(byte_offset, SourcePosition)`
245    /// pairs that were collected during lexing. Entries must be sorted
246    /// by byte offset in monotonically increasing order (which is
247    /// naturally the case when collected during a left-to-right lex
248    /// pass).
249    ///
250    /// This mode is intended for token sources that know line/column
251    /// information at lex time but do not have access to the underlying
252    /// source text afterward.
253    ///
254    /// See the [type-level documentation](Self) for a detailed comparison
255    /// of the two modes.
256    ///
257    /// # Example
258    ///
259    /// ```rust
260    /// # use libgraphql_parser::SourceMap;
261    /// # use libgraphql_parser::SourcePosition;
262    /// # let byte_offset = 0;
263    /// # let position = SourcePosition::new(0, 0, None, 0);
264    /// // During lexing, collect entries into a Vec:
265    /// let mut entries = Vec::new();
266    /// entries.push((byte_offset, position));
267    /// // After lexing, build the SourceMap:
268    /// let source_map = SourceMap::new_precomputed(entries, None);
269    /// ```
270    pub fn new_precomputed(
271        entries: Vec<(u32, SourcePosition)>,
272        file_path: Option<PathBuf>,
273    ) -> Self {
274        debug_assert!(
275            entries.windows(2).all(|w| w[0].0 <= w[1].0),
276            "new_precomputed entries must be sorted by byte offset",
277        );
278        Self {
279            data: SourceMapData::PrecomputedColumns { entries },
280            file_path,
281        }
282    }
283
284    /// Creates an empty `SourceMap` that cannot resolve any offsets.
285    ///
286    /// Useful for token sources that don't have source text (e.g.
287    /// proc-macro token sources or test mocks).
288    pub fn empty() -> Self {
289        Self {
290            data: SourceMapData::PrecomputedColumns {
291                entries: Vec::new(),
292            },
293            file_path: None,
294        }
295    }
296
297    /// Returns the source text, if this is a source-text-mode `SourceMap`.
298    pub fn source(&self) -> Option<&'src str> {
299        self.data.source()
300    }
301
302    /// Returns the file path, if available.
303    pub fn file_path(&self) -> Option<&Path> {
304        self.file_path.as_deref()
305    }
306
307    /// Resolves a byte offset to a full [`SourcePosition`] (line, col_utf8,
308    /// col_utf16, byte_offset).
309    ///
310    /// Returns `None` if the offset cannot be resolved — for example, if
311    /// the byte offset is out of bounds (source-text mode) or if no
312    /// pre-computed entries cover the requested offset.
313    ///
314    /// # Source-text mode
315    ///
316    /// Uses binary search on `line_starts` to find the line, then counts
317    /// chars from the line start to compute columns.
318    ///
319    /// # Pre-computed columns mode
320    ///
321    /// Binary-searches the pre-computed entries for the largest byte offset
322    /// `<=` the requested offset (floor lookup). If the requested offset
323    /// falls between two entries, the earlier entry's position is returned
324    /// (this handles lookups for byte offsets mid-token, returning the
325    /// position of the nearest preceding entry).
326    pub fn resolve_offset(
327        &self,
328        byte_offset: u32,
329    ) -> Option<SourcePosition> {
330        self.data.resolve_offset(byte_offset)
331    }
332
333    /// Resolves a [`ByteSpan`] to a full [`SourceSpan`] with
334    /// line/column information and file path.
335    ///
336    /// Returns `None` if either endpoint of the span cannot be
337    /// resolved. Common scenarios where resolution fails:
338    ///
339    /// - **Empty `SourceMap`** (`SourceMap::empty()`): no entries
340    ///   exist, so no offset can be resolved.
341    /// - **Out-of-bounds offset**: the byte offset exceeds the
342    ///   source text length (source-text mode) or falls before the
343    ///   first pre-computed entry (pre-computed columns mode).
344    /// - **Mid-UTF-8 offset**: the byte offset lands in the middle
345    ///   of a multi-byte UTF-8 character (source-text mode only).
346    ///
347    /// For error reporting, the parser's internal `resolve_span()`
348    /// wrapper falls back to `SourceSpan::zero()` when this method
349    /// returns `None`. For AST tooling that needs accurate
350    /// positions, callers should handle the `None` case explicitly.
351    pub fn resolve_span(
352        &self,
353        span: ByteSpan,
354    ) -> Option<SourceSpan> {
355        let start = self.data.resolve_offset(span.start)?;
356        let end = self.data.resolve_offset(span.end)?;
357        Some(match &self.file_path {
358            Some(path) => {
359                SourceSpan::with_file(start, end, path.clone())
360            },
361            None => SourceSpan::new(start, end),
362        })
363    }
364
365    /// Returns the content of the line at the given 0-based line index,
366    /// stripped of any trailing line terminator (`\n`, `\r`, `\r\n`).
367    ///
368    /// Returns `None` if this is not a source-text-mode `SourceMap`, or if
369    /// `line_index` is out of bounds.
370    ///
371    /// This uses the `line_starts` table built by
372    /// `compute_line_starts()`, which correctly recognizes bare
373    /// `\r` as a line terminator per the GraphQL spec. Code that needs to
374    /// extract line content should use this method rather than
375    /// [`str::lines()`], which does **not** handle bare `\r`.
376    ///
377    /// Note: `graphql_parse_error::get_line()` provides similar
378    /// functionality via a linear scan (no pre-computed table).
379    /// Both must use the same line-terminator semantics.
380    pub fn get_line(&self, line_index: usize) -> Option<&'src str> {
381        match &self.data {
382            SourceMapData::SourceText { source, line_starts } => {
383                if line_index >= line_starts.len() {
384                    return None;
385                }
386                let start = line_starts[line_index] as usize;
387                let end = if line_index + 1 < line_starts.len() {
388                    line_starts[line_index + 1] as usize
389                } else {
390                    source.len()
391                };
392                let line = &source[start..end];
393                // Strip trailing line terminator(s)
394                let line = line.strip_suffix("\r\n")
395                    .or_else(|| line.strip_suffix('\n'))
396                    .or_else(|| line.strip_suffix('\r'))
397                    .unwrap_or(line);
398                Some(line)
399            },
400            SourceMapData::PrecomputedColumns { .. } => None,
401        }
402    }
403
404    /// Returns the number of lines in the source text.
405    ///
406    /// In source-text mode, this is the number of line-start entries
407    /// computed during construction. In pre-computed columns mode, this
408    /// is derived from the highest line number seen in the entries
409    /// (plus one). Returns 0 if no entries have been inserted.
410    pub fn line_count(&self) -> usize {
411        match &self.data {
412            SourceMapData::SourceText { line_starts, .. } => {
413                line_starts.len()
414            },
415            SourceMapData::PrecomputedColumns { entries } => {
416                entries.last()
417                    .map(|(_, pos)| pos.line() + 1)
418                    .unwrap_or(0)
419            },
420        }
421    }
422
423    // ── Line-start computation ─────────────────────────────
424
425    /// Scans source text and returns the byte offset of the start of each
426    /// line.
427    ///
428    /// Line terminators: `\n`, `\r`, `\r\n` (the pair counts as one).
429    fn compute_line_starts(source: &str) -> Vec<u32> {
430        let bytes = source.as_bytes();
431        let len = bytes.len();
432
433        // Pre-allocate: ~40 chars per line as a rough heuristic.
434        let mut line_starts = Vec::with_capacity(1 + len / 40);
435        line_starts.push(0); // First line always starts at byte 0
436
437        // SIMD-accelerated scan: jump directly to the next
438        // `\n` or `\r` instead of checking every byte. Most
439        // bytes in a GraphQL document are not line terminators,
440        // so this skips large stretches at 16–32 bytes/cycle.
441        let mut i = 0;
442        while let Some(offset) = memchr::memchr2(b'\n', b'\r', &bytes[i..]) {
443            i += offset;
444            match bytes[i] {
445                b'\n' => {
446                    line_starts.push((i + 1) as u32);
447                    i += 1;
448                },
449                b'\r' => {
450                    // \r\n is a single line terminator
451                    if i + 1 < len && bytes[i + 1] == b'\n' {
452                        line_starts.push((i + 2) as u32);
453                        i += 2;
454                    } else {
455                        line_starts.push((i + 1) as u32);
456                        i += 1;
457                    }
458                },
459                _ => unreachable!(),
460            }
461        }
462
463        line_starts
464    }
465}