//! libgraphql-parser 0.0.5
//!
//! A blazing fast, error-focused, lossless GraphQL parser for schema,
//! executable, and mixed documents.
use crate::ByteSpan;
use crate::SourceSpan;
use crate::SourcePosition;
use std::path::Path;
use std::path::PathBuf;

/// Internal storage mode for position resolution.
///
/// Exactly one mode is active for the lifetime of a `SourceMap`; the two
/// modes trade memory for lookup work in opposite directions (see the
/// `SourceMap` type-level docs for the full comparison).
#[derive(Clone)]
enum SourceMapData<'src> {
    /// Source-text mode: positions are resolved on demand by binary-searching
    /// `line_starts` and counting chars from the line start offset.
    SourceText {
        // Borrowed source text; `'src` ties this data to its origin.
        source: &'src str,
        // Byte offset of the start of each line, ascending. Built by
        // `SourceMap::compute_line_starts`, which always pushes offset 0
        // first, so the table is never empty.
        line_starts: Vec<u32>,
    },

    /// Pre-computed columns mode: positions were pushed by the token source
    /// during lexing. Lookups binary-search the sorted offset table.
    PrecomputedColumns {
        /// Sorted by byte offset (first element of each tuple). Entries are
        /// inserted in lexing order, which is monotonically increasing.
        entries: Vec<(u32, SourcePosition)>,
    },
}

impl<'src> SourceMapData<'src> {
    /// Maps a byte offset to a [`SourcePosition`] using whichever
    /// resolution strategy this data was built with.
    ///
    /// Yields `None` when the offset is unresolvable: past the end of
    /// the source text, landing mid-way through a multibyte UTF-8
    /// sequence, or preceding the first pre-computed entry (including
    /// the case of an empty entry table).
    fn resolve_offset(
        &self,
        byte_offset: u32,
    ) -> Option<SourcePosition> {
        match self {
            Self::SourceText {
                source,
                line_starts,
            } => {
                let offset = byte_offset as usize;

                // Reject offsets past the end of the source, as well as
                // offsets that split a multibyte UTF-8 sequence. Offsets
                // produced by the lexer always sit on char boundaries;
                // only hand-built ByteSpans can trip the second check.
                if offset > source.len()
                    || !source.is_char_boundary(offset)
                {
                    return None;
                }

                // `partition_point` yields the first index whose line
                // start exceeds `byte_offset`; the containing line is
                // the one just before it. Because `line_starts[0] == 0`,
                // the result is always >= 1, so the saturating_sub is
                // purely defensive.
                let line = line_starts
                    .partition_point(|&ls| ls <= byte_offset)
                    .saturating_sub(1);
                let line_start = line_starts[line] as usize;

                // Columns count Unicode scalar values (`col_utf8` — the
                // name is historical; it is NOT a byte count) and UTF-16
                // code units from the line start to the target offset.
                let prefix = &source[line_start..offset];
                let (col_utf8, col_utf16) = if prefix.is_ascii() {
                    // ASCII fast path: one byte is exactly one scalar
                    // and one UTF-16 code unit, and `is_ascii` is a
                    // vectorized bulk check — cheap for the common
                    // all-ASCII GraphQL source.
                    (prefix.len(), prefix.len())
                } else {
                    // General path: walk the chars once, tallying
                    // scalars and UTF-16 code units together.
                    prefix.chars().fold(
                        (0usize, 0usize),
                        |(scalars, units), ch| {
                            (scalars + 1, units + ch.len_utf16())
                        },
                    )
                };

                Some(SourcePosition::new(
                    line,
                    col_utf8,
                    Some(col_utf16),
                    offset,
                ))
            },
            Self::PrecomputedColumns { entries } => {
                // Floor lookup: index of the last entry whose offset is
                // <= `byte_offset`. `checked_sub` converts "no such
                // entry" (including an empty table) into `None`.
                entries
                    .partition_point(|&(off, _)| off <= byte_offset)
                    .checked_sub(1)
                    .map(|i| entries[i].1)
            },
        }
    }

    /// The borrowed source text, when operating in source-text mode.
    fn source(&self) -> Option<&'src str> {
        if let Self::SourceText { source, .. } = self {
            Some(source)
        } else {
            None
        }
    }
}

/// Provides utilities for mapping [`ByteSpan`]s to [`SourceSpan`]s,
/// byte offsets to [`SourcePosition`]s, and extracting content from the
/// original source text.
///
/// The `'src` lifetime ties the `SourceMap` to the source text it was built
/// from.
///
/// `SourceMap` is a key part of what makes `libgraphql-parser` fast.
/// The lexer and parser operate exclusively on compact `u32` byte
/// offsets (stored in [`ByteSpan`]s), deferring line/column
/// computation until it is actually needed — typically only for
/// error formatting or tooling & IDE features. `SourceMap` is the mechanism
/// that makes this deferred resolution possible, translating raw
/// byte offsets back to human-readable [`SourcePosition`]s on
/// demand.
///
/// Construction of `SourceMap`s is typically owned by a
/// [`GraphQLTokenSource`](crate::token::GraphQLTokenSource) and uses one of
/// two construction modes: [`SourceMap::new_with_source`] or
/// [`SourceMap::new_precomputed`].
///
/// ### Construction with source text ([`SourceMap::new_with_source`])
///
/// An O(source length) (SIMD-accelerated) pre-pass scans the source string
/// for line terminators (`\n`, `\r`, `\r\n`) and records the byte offset of
/// each line start. Individual position lookups via
/// [`SourceMap::resolve_offset`] later are then O(log n) binary searches on
/// the line-start table, plus a short char-counting walk from line start to
/// target byte offset to compute UTF-8 and UTF-16 column values.
///
/// This mode is used by
/// [`StrGraphQLTokenSource`](crate::token::StrGraphQLTokenSource),
/// which has direct access to the source string. It is memory-efficient
/// (only one `u32` per line) and avoids as much per-token bookkeeping as
/// possible during lexing — the lexer only tracks a single `curr_byte_offset`
/// and defers all line/column computation.
///
/// ### Pre-Computed Columns Mode ([`SourceMap::new_precomputed`])
///
/// Some token sources do not have access to the underlying source text at
/// resolution time. For example,
/// [`libgraphql_macros::RustMacroGraphQLTokenSource`](https://docs.rs/libgraphql-macros)
/// produces tokens from a `proc_macro2::TokenStream`. Each `proc_macro2::Span`
/// carries line/column information at the time the token is produced, but there
/// is no contiguous source `&str` to scan after the fact. In this mode, the
/// token source collects `(byte_offset, SourcePosition)` entries during lexing
/// and then constructs the `SourceMap` at the end by passing the entries to
/// [`new_precomputed()`](Self::new_precomputed). Later, lookups will
/// binary-search that entries table.
///
/// This mode uses more memory (one full `SourcePosition` per inserted
/// offset, rather than one `u32` per line), but lookups are O(log n) with
/// no char-counting walk — just a binary search and a direct return.
///
/// When a `SourceMap` is constructed in pre-computed mode, the `'src` lifetime
/// is typically `'static` since no source text is borrowed.
///
/// # UTF-16 Column Recovery
///
/// In source-text mode, UTF-16 columns are computed on demand by iterating
/// chars from the line start to the target byte offset and summing
/// [`char::len_utf16()`]. In pre-computed columns mode, UTF-16 columns are
/// whatever the token source provided (or `None` if the token source cannot
/// compute them).
#[derive(Clone)]
pub struct SourceMap<'src> {
    /// The resolution data — either source-text-backed or pre-computed.
    data: SourceMapData<'src>,

    /// Optional file path for the source text.
    file_path: Option<PathBuf>,
}

impl<'src> std::fmt::Debug for SourceMap<'src> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Summarize the resolution mode instead of dumping the (possibly
        // very large) line-start or entry tables.
        let mode = match &self.data {
            SourceMapData::SourceText { line_starts, .. } =>
                format!("SourceText({} lines)", line_starts.len()),
            SourceMapData::PrecomputedColumns { entries } =>
                format!("PrecomputedColumns({} entries)", entries.len()),
        };
        f.debug_struct("SourceMap")
            .field("mode", &mode)
            .field("file_path", &self.file_path)
            .finish()
    }
}

impl<'src> SourceMap<'src> {
    /// Constructs a source-text-mode `SourceMap` from `source`.
    ///
    /// Runs a single O(n) pre-pass that records the byte offset of every
    /// line start. Recognized line terminators: `\n`, `\r`, and the
    /// `\r\n` pair (which counts as one terminator).
    pub fn new_with_source(
        source: &'src str,
        file_path: Option<PathBuf>,
    ) -> Self {
        Self {
            data: SourceMapData::SourceText {
                line_starts: Self::compute_line_starts(source),
                source,
            },
            file_path,
        }
    }

    /// Constructs a `SourceMap` in pre-computed columns mode.
    ///
    /// `entries` holds `(byte_offset, SourcePosition)` pairs collected
    /// during lexing. They must be sorted by byte offset in monotonically
    /// increasing order — which they naturally are when gathered during a
    /// left-to-right lex pass.
    ///
    /// This mode exists for token sources that know line/column
    /// information at lex time but have no access to the underlying
    /// source text afterward.
    ///
    /// See the [type-level documentation](Self) for a detailed comparison
    /// of the two modes.
    ///
    /// # Example
    ///
    /// ```rust
    /// # use libgraphql_parser::SourceMap;
    /// # use libgraphql_parser::SourcePosition;
    /// # let byte_offset = 0;
    /// # let position = SourcePosition::new(0, 0, None, 0);
    /// // During lexing, collect entries into a Vec:
    /// let mut entries = Vec::new();
    /// entries.push((byte_offset, position));
    /// // After lexing, build the SourceMap:
    /// let source_map = SourceMap::new_precomputed(entries, None);
    /// ```
    pub fn new_precomputed(
        entries: Vec<(u32, SourcePosition)>,
        file_path: Option<PathBuf>,
    ) -> Self {
        // Debug-only sanity check: each adjacent pair must be in
        // non-decreasing byte-offset order.
        debug_assert!(
            entries
                .iter()
                .zip(entries.iter().skip(1))
                .all(|(a, b)| a.0 <= b.0),
            "new_precomputed entries must be sorted by byte offset",
        );
        Self {
            data: SourceMapData::PrecomputedColumns { entries },
            file_path,
        }
    }

    /// Creates an empty `SourceMap` that cannot resolve any offsets.
    ///
    /// Useful for token sources that don't have source text (e.g.
    /// proc-macro token sources or test mocks).
    pub fn empty() -> Self {
        Self {
            data: SourceMapData::PrecomputedColumns {
                entries: Vec::new(),
            },
            file_path: None,
        }
    }

    /// Returns the source text, if this is a source-text-mode `SourceMap`.
    pub fn source(&self) -> Option<&'src str> {
        self.data.source()
    }

    /// Returns the file path, if available.
    pub fn file_path(&self) -> Option<&Path> {
        self.file_path.as_deref()
    }

    /// Resolves a byte offset to a full [`SourcePosition`] (line, col_utf8,
    /// col_utf16, byte_offset).
    ///
    /// Returns `None` when the offset cannot be resolved — e.g. it is out
    /// of bounds (source-text mode), splits a multibyte UTF-8 sequence
    /// (source-text mode), or no pre-computed entry covers it.
    ///
    /// # Source-text mode
    ///
    /// Binary-searches `line_starts` for the containing line, then counts
    /// chars from the line start to derive column values.
    ///
    /// # Pre-computed columns mode
    ///
    /// Floor lookup: binary-searches the entry table for the largest byte
    /// offset `<=` the requested one. An offset falling between two
    /// entries (mid-token) resolves to the nearest preceding entry's
    /// position.
    pub fn resolve_offset(
        &self,
        byte_offset: u32,
    ) -> Option<SourcePosition> {
        self.data.resolve_offset(byte_offset)
    }

    /// Resolves a [`ByteSpan`] to a full [`SourceSpan`] carrying
    /// line/column information and the file path (when one is set).
    ///
    /// Returns `None` when either endpoint fails to resolve. Typical
    /// failure scenarios:
    ///
    /// - **Empty `SourceMap`** (`SourceMap::empty()`): no entries
    ///   exist, so no offset can be resolved.
    /// - **Out-of-bounds offset**: the byte offset exceeds the
    ///   source text length (source-text mode) or falls before the
    ///   first pre-computed entry (pre-computed columns mode).
    /// - **Mid-UTF-8 offset**: the byte offset lands in the middle
    ///   of a multi-byte UTF-8 character (source-text mode only).
    ///
    /// For error reporting, the parser's internal `resolve_span()`
    /// wrapper falls back to `SourceSpan::zero()` when this method
    /// returns `None`. AST tooling that needs accurate positions should
    /// handle the `None` case explicitly instead.
    pub fn resolve_span(
        &self,
        span: ByteSpan,
    ) -> Option<SourceSpan> {
        let start = self.resolve_offset(span.start)?;
        let end = self.resolve_offset(span.end)?;
        let resolved = match self.file_path.as_ref() {
            Some(path) => SourceSpan::with_file(start, end, path.clone()),
            None => SourceSpan::new(start, end),
        };
        Some(resolved)
    }

    /// Returns the content of the line at the given 0-based line index,
    /// with any trailing line terminator (`\n`, `\r`, `\r\n`) removed.
    ///
    /// Returns `None` if this is not a source-text-mode `SourceMap`, or
    /// if `line_index` is out of bounds.
    ///
    /// Built on the `line_starts` table from `compute_line_starts()`,
    /// which treats bare `\r` as a line terminator per the GraphQL spec.
    /// Prefer this method over [`str::lines()`] for line extraction —
    /// `str::lines()` does **not** handle bare `\r`.
    ///
    /// Note: `graphql_parse_error::get_line()` provides similar
    /// functionality via a linear scan (no pre-computed table).
    /// Both must use the same line-terminator semantics.
    pub fn get_line(&self, line_index: usize) -> Option<&'src str> {
        let (source, line_starts) = match &self.data {
            SourceMapData::SourceText { source, line_starts } => {
                (*source, line_starts)
            },
            SourceMapData::PrecomputedColumns { .. } => return None,
        };

        // Out-of-bounds indices fall out of the `get()?` here.
        let start = *line_starts.get(line_index)? as usize;

        // The line extends to the next line's start (terminator
        // included), or to end-of-source for the final line.
        let end = line_starts
            .get(line_index + 1)
            .map_or(source.len(), |&next| next as usize);

        let raw = &source[start..end];

        // Strip the trailing terminator; try the two-byte `\r\n` pair
        // first so it is never half-stripped.
        let stripped = raw
            .strip_suffix("\r\n")
            .or_else(|| raw.strip_suffix('\n'))
            .or_else(|| raw.strip_suffix('\r'))
            .unwrap_or(raw);
        Some(stripped)
    }

    /// Returns the number of lines in the source text.
    ///
    /// Source-text mode: the number of line-start entries recorded at
    /// construction. Pre-computed columns mode: the highest line number
    /// among the entries plus one, or 0 when no entries were inserted.
    pub fn line_count(&self) -> usize {
        match &self.data {
            SourceMapData::SourceText { line_starts, .. } => {
                line_starts.len()
            },
            SourceMapData::PrecomputedColumns { entries } => {
                // Entries are sorted by offset in lexing order, so the
                // last one carries the highest line number.
                match entries.last() {
                    Some((_, pos)) => pos.line() + 1,
                    None => 0,
                }
            },
        }
    }

    // ── Line-start computation ─────────────────────────────

    /// Scans source text and returns the byte offset of the start of each
    /// line.
    ///
    /// Line terminators: `\n`, `\r`, `\r\n` (the pair counts as one).
    /// The returned table always contains at least the entry `0` for the
    /// first line.
    fn compute_line_starts(source: &str) -> Vec<u32> {
        let bytes = source.as_bytes();

        // Pre-allocate: ~40 chars per line as a rough heuristic.
        let mut line_starts = Vec::with_capacity(1 + bytes.len() / 40);
        line_starts.push(0); // First line always starts at byte 0.

        // SIMD-accelerated scan: `memchr2` jumps straight to the next
        // `\n` or `\r` rather than inspecting every byte. Most bytes in
        // a GraphQL document are not line terminators, so large
        // stretches are skipped per iteration.
        let mut cursor = 0usize;
        while let Some(found) =
            memchr::memchr2(b'\n', b'\r', &bytes[cursor..])
        {
            let term = cursor + found;
            // A `\r` immediately followed by `\n` is one two-byte
            // terminator; any lone `\r` or `\n` is a one-byte
            // terminator. (`bytes.get` makes the lookahead safe at
            // end-of-input.)
            let next_line_start = if bytes[term] == b'\r'
                && bytes.get(term + 1) == Some(&b'\n')
            {
                term + 2
            } else {
                term + 1
            };
            line_starts.push(next_line_start as u32);
            cursor = next_line_start;
        }

        line_starts
    }
}