Skip to main content

bibtex_parser/parser/
mod.rs

1//! BibTeX parser implementation using winnow
2//!
3//! This module provides both high-level and low-level APIs for parsing BibTeX files.
4//! Most users should use the high-level `Library` API, but the low-level API is available
5//! for advanced use cases that require access to raw parsed items.
6//!
7//! # Low-level API Example
8//!
9//! ```
10//! use bibtex_parser::parser::{parse_bibtex, ParsedItem};
11//!
12//! let input = r#"
13//!     @string{ieee = "IEEE"}
14//!     @preamble{"Test preamble"}
15//!     % Line comment
16//!     @article{test2024,
17//!         author = "John Doe",
18//!         title = ieee # " Article",
19//!         year = 2024
20//!     }
21//! "#;
22//!
23//! let items = parse_bibtex(input)?;
24//!
25//! for item in items {
26//!     match item {
27//!         ParsedItem::Entry(entry) => {
28//!             println!("Found entry: {}", entry.key());
29//!             // Variables are not expanded yet - title contains reference to 'ieee'
30//!         },
31//!         ParsedItem::String(name, value) => {
32//!             println!("String definition: {} = {:?}", name, value);
33//!         },
34//!         ParsedItem::Preamble(value) => {
35//!             println!("Preamble: {:?}", value);
36//!         },
37//!         ParsedItem::Comment(text) => {
38//!             println!("Comment: {}", text.trim());
39//!         },
40//!     }
41//! }
42//! # Ok::<(), bibtex_parser::Error>(())
43//! ```
44
45pub mod delimiter;
46pub mod entry;
47pub mod lexer;
48pub mod simd;
49pub mod utils;
50pub mod value;
51
52use crate::{Error, Result, SourceMap, SourceSpan};
53use winnow::ascii::multispace0;
54use winnow::prelude::*;
55
56pub use entry::parse_entry;
57
58/// Internal parser result type
59pub type PResult<'a, O> = winnow::PResult<O, winnow::error::ContextError>;
60
61#[cold]
62#[inline(never)]
63pub(crate) fn backtrack_err() -> winnow::error::ErrMode<winnow::error::ContextError> {
64    winnow::error::ErrMode::Backtrack(winnow::error::ContextError::default())
65}
66
67#[cold]
68#[inline(never)]
69pub(crate) fn backtrack<O>() -> PResult<'static, O> {
70    Err(backtrack_err())
71}
72
73/// Parse a BibTeX file into raw items without expansion or processing
74///
75/// This is a low-level API that returns the raw parsed items before
76/// string variable expansion or other processing. Most users should
77/// use `Library::parse()` instead.
78///
79/// The returned items preserve the original structure:
80/// - String variables are not expanded
81/// - Concatenations are preserved as `Value::Concat`
82/// - Comments are included (both `%` line comments and `@comment{}`)
83/// - All items are returned in parse order
84///
85/// # Performance
86///
87/// This function uses the same zero-copy value parser as the high-level API,
88/// but returns raw items without string expansion or library indexing.
89///
90/// # Example
91///
92/// ```
93/// use bibtex_parser::parser::{parse_bibtex, ParsedItem};
94/// use bibtex_parser::Value;
95///
96/// let input = r#"
97///     @string{name = "John Doe"}
98///     @article{test,
99///         author = name,
100///         title = "Part 1" # " and " # "Part 2"
101///     }
102/// "#;
103///
104/// let items = parse_bibtex(input)?;
105///
106/// // Find the entry
107/// let entry = items.iter().find_map(|item| {
108///     if let ParsedItem::Entry(e) = item { Some(e) } else { None }
109/// }).unwrap();
110///
111/// // Author field contains unexpanded variable reference
112/// let author_field = entry.fields.iter()
113///     .find(|f| f.name == "author").unwrap();
114/// match &author_field.value {
115///     Value::Variable(var) => println!("Variable reference: {}", var),
116///     _ => {}
117/// }
118///
119/// // Title field contains concatenation structure
120/// let title_field = entry.fields.iter()
121///     .find(|f| f.name == "title").unwrap();
122/// match &title_field.value {
123///     Value::Concat(parts) => println!("Concatenation with {} parts", parts.len()),
124///     _ => {}
125/// }
126/// # Ok::<(), bibtex_parser::Error>(())
127/// ```
128#[inline]
129pub fn parse_bibtex(input: &str) -> Result<Vec<ParsedItem<'_>>> {
130    let mut items = Vec::new();
131    parse_bibtex_stream(input, |item| {
132        items.push(item);
133        Ok(())
134    })?;
135    Ok(items)
136}
137
138/// Parse a BibTeX file and stream raw items to a callback.
139///
140/// This avoids allocating an intermediate `Vec<ParsedItem>` when the caller
141/// can process items incrementally.
142#[inline]
143pub(crate) fn parse_bibtex_stream<'a, F>(input: &'a str, mut on_item: F) -> Result<()>
144where
145    F: FnMut(ParsedItem<'a>) -> Result<()>,
146{
147    let mut remaining = input;
148
149    loop {
150        // Skip ASCII whitespace without Unicode trimming overhead.
151        lexer::skip_whitespace(&mut remaining);
152        if remaining.is_empty() {
153            break;
154        }
155
156        // Try to parse an item (including comments)
157        match parse_item(&mut remaining) {
158            Ok(item) => on_item(item)?,
159            Err(e) => {
160                // Calculate line/column for error
161                let consumed = input.len() - remaining.len();
162                let (line, column) = calculate_position(input, consumed);
163
164                return Err(Error::ParseError {
165                    line,
166                    column,
167                    message: format!("Failed to parse entry: {e}"),
168                    snippet: Some(get_snippet(remaining, 40)),
169                });
170            }
171        }
172    }
173
174    Ok(())
175}
176
177/// Parse a BibTeX file and stream raw items with source spans.
178#[inline]
179pub(crate) fn parse_bibtex_stream_with_spans<'a, F>(input: &'a str, mut on_item: F) -> Result<()>
180where
181    F: FnMut(ParsedItem<'a>, SourceSpan, &'a str) -> Result<()>,
182{
183    let source_map = SourceMap::anonymous(input);
184    let mut remaining = input;
185
186    loop {
187        lexer::skip_whitespace(&mut remaining);
188        if remaining.is_empty() {
189            break;
190        }
191
192        let start = input.len() - remaining.len();
193        let before_item = remaining;
194        match parse_item(&mut remaining) {
195            Ok(item) => {
196                let end = input.len() - remaining.len();
197                let span = source_map.span(start, end);
198                on_item(item, span, &input[start..end])?;
199            }
200            Err(e) => {
201                let (line, column) = calculate_position(input, start);
202
203                return Err(Error::ParseError {
204                    line,
205                    column,
206                    message: format!("Failed to parse entry: {e}"),
207                    snippet: Some(get_snippet(before_item, 40)),
208                });
209            }
210        }
211    }
212
213    Ok(())
214}
215
216#[inline]
217pub(crate) fn parse_bibtex_stream_with_entry_locations<'a, F>(
218    input: &'a str,
219    mut on_item: F,
220) -> Result<()>
221where
222    F: FnMut(LocatedParsedItem<'a>, usize, usize, &'a str) -> Result<()>,
223{
224    let mut remaining = input;
225
226    loop {
227        lexer::skip_whitespace(&mut remaining);
228        if remaining.is_empty() {
229            break;
230        }
231
232        let start = input.len() - remaining.len();
233        let before_item = remaining;
234        match parse_item_with_entry_locations(&mut remaining, start) {
235            Ok(item) => {
236                let end = input.len() - remaining.len();
237                on_item(item, start, end, &input[start..end])?;
238            }
239            Err(e) => {
240                let (line, column) = calculate_position(input, start);
241
242                return Err(Error::ParseError {
243                    line,
244                    column,
245                    message: format!("Failed to parse entry: {e}"),
246                    snippet: Some(get_snippet(before_item, 40)),
247                });
248            }
249        }
250    }
251
252    Ok(())
253}
254
255/// A raw parsed item from a BibTeX file before processing
256///
257/// This represents the different types of items that can appear in a BibTeX file,
258/// returned by the low-level `parse_bibtex()` function. These items are in their
259/// raw parsed form:
260///
261/// - String variables are not yet expanded
262/// - Field values preserve concatenation structure
263/// - Comments are preserved exactly as found
264/// - All items maintain their original order
265///
266/// # Examples
267///
268/// ```
269/// use bibtex_parser::parser::{parse_bibtex, ParsedItem};
270///
271/// let input = "@string{name = \"John\"}\n@article{key, author = name}";
272/// let items = parse_bibtex(input)?;
273///
274/// match &items[0] {
275///     ParsedItem::String(var_name, value) => {
276///         println!("String variable: {} = {:?}", var_name, value);
277///     },
278///     _ => {}
279/// }
280///
281/// match &items[1] {
282///     ParsedItem::Entry(entry) => {
283///         // The author field contains a variable reference, not the expanded value
284///         println!("Entry key: {}", entry.key());
285///     },
286///     _ => {}
287/// }
288/// # Ok::<(), bibtex_parser::Error>(())
289/// ```
290#[derive(Debug, Clone, PartialEq)]
291pub enum ParsedItem<'a> {
292    /// A bibliography entry (article, book, inproceedings, etc.)
293    ///
294    /// Contains the entry in its raw parsed form with field values that may
295    /// reference string variables or contain concatenations.
296    Entry(crate::Entry<'a>),
297
298    /// A string definition (`@string{name = value}`)
299    ///
300    /// Contains the variable name and its value. The value itself may contain
301    /// references to other string variables or concatenations.
302    String(&'a str, crate::Value<'a>),
303
304    /// A preamble (`@preamble{value}`)
305    ///
306    /// Contains the preamble value, which may reference string variables
307    /// or contain concatenations.
308    Preamble(crate::Value<'a>),
309
310    /// A comment (both `% line comment` and `@comment{...}`)
311    ///
312    /// Contains the raw comment text exactly as it appears in the source,
313    /// including any whitespace and formatting.
314    Comment(&'a str),
315}
316
317pub(crate) enum LocatedParsedItem<'a> {
318    Entry(entry::LocatedEntry<'a>),
319    String(&'a str, crate::Value<'a>),
320    Preamble(crate::Value<'a>),
321    Comment(&'a str),
322}
323
324/// Parse a single item (entry, string, preamble, or comment) with optimized delimiter search
325#[inline]
326pub(crate) fn parse_item<'a>(input: &mut &'a str) -> PResult<'a, ParsedItem<'a>> {
327    // Use optimized delimiter search to find @ or handle as comment
328    let bytes = input.as_bytes();
329
330    // Fast path: if we don't start with @, check if this is a comment
331    if !bytes.is_empty() && bytes[0] != b'@' {
332        // Look for the next @ to treat everything before it as a comment
333        if let Some(at_pos) = delimiter::find_byte(bytes, b'@', 0) {
334            let comment = &input[..at_pos];
335            *input = &input[at_pos..];
336            return Ok(ParsedItem::Comment(comment));
337        }
338        // No @ found, entire remaining input is a comment
339        let comment = *input;
340        *input = "";
341        return Ok(ParsedItem::Comment(comment));
342    }
343
344    // We have an @ at the start. For regular entries, avoid checking all
345    // special keywords and dispatch directly based on the first letter.
346    let second = bytes.get(1).copied().unwrap_or_default();
347    match second | 0x20 {
348        b's' if starts_with_keyword(bytes, b"string") => {
349            parse_string(input).map(|(k, v)| ParsedItem::String(k, v))
350        }
351        b'p' if starts_with_keyword(bytes, b"preamble") => {
352            parse_preamble(input).map(ParsedItem::Preamble)
353        }
354        b'c' if starts_with_keyword(bytes, b"comment") => {
355            parse_comment(input).map(ParsedItem::Comment)
356        }
357        _ => entry::parse_entry_at(input).map(ParsedItem::Entry),
358    }
359}
360
361#[inline]
362fn parse_item_with_entry_locations<'a>(
363    input: &mut &'a str,
364    absolute_start: usize,
365) -> PResult<'a, LocatedParsedItem<'a>> {
366    let bytes = input.as_bytes();
367
368    if !bytes.is_empty() && bytes[0] != b'@' {
369        if let Some(at_pos) = delimiter::find_byte(bytes, b'@', 0) {
370            let comment = &input[..at_pos];
371            *input = &input[at_pos..];
372            return Ok(LocatedParsedItem::Comment(comment));
373        }
374        let comment = *input;
375        *input = "";
376        return Ok(LocatedParsedItem::Comment(comment));
377    }
378
379    let second = bytes.get(1).copied().unwrap_or_default();
380    match second | 0x20 {
381        b's' if starts_with_keyword(bytes, b"string") => {
382            parse_string(input).map(|(k, v)| LocatedParsedItem::String(k, v))
383        }
384        b'p' if starts_with_keyword(bytes, b"preamble") => {
385            parse_preamble(input).map(LocatedParsedItem::Preamble)
386        }
387        b'c' if starts_with_keyword(bytes, b"comment") => {
388            parse_comment(input).map(LocatedParsedItem::Comment)
389        }
390        _ => entry::parse_entry_at_with_locations(input, absolute_start)
391            .map(LocatedParsedItem::Entry),
392    }
393}
394
395#[inline(never)]
396fn starts_with_keyword(input: &[u8], keyword: &[u8]) -> bool {
397    if input.first() != Some(&b'@') || input.len() < keyword.len() + 1 {
398        return false;
399    }
400
401    for (offset, &expected) in keyword.iter().enumerate() {
402        if (input[offset + 1] | 0x20) != expected {
403            return false;
404        }
405    }
406
407    if input.len() == keyword.len() + 1 {
408        return true;
409    }
410
411    !is_identifier_char(input[keyword.len() + 1])
412}
413
/// Report whether `byte` is an ASCII letter, an ASCII digit, or one of
/// `_`, `-`, `:`, `.`.
#[inline]
const fn is_identifier_char(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b':' | b'.')
}
421
422/// Parse a @string definition
423fn parse_string<'a>(input: &mut &'a str) -> PResult<'a, (&'a str, crate::Value<'a>)> {
424    use winnow::combinator::{alt, delimited, preceded};
425
426    preceded(
427        (multispace0, '@', utils::tag_no_case("string"), multispace0),
428        alt((
429            delimited('{', parse_string_content, '}'),
430            delimited('(', parse_string_content, ')'),
431        )),
432    )
433    .parse_next(input)
434}
435
436/// Parse the content of a @string definition
437fn parse_string_content<'a>(input: &mut &'a str) -> PResult<'a, (&'a str, crate::Value<'a>)> {
438    use winnow::combinator::separated_pair;
439
440    separated_pair(
441        utils::ws(lexer::identifier),
442        utils::ws('='),
443        utils::ws(value::parse_value),
444    )
445    .parse_next(input)
446}
447
448/// Parse a @preamble
449fn parse_preamble<'a>(input: &mut &'a str) -> PResult<'a, crate::Value<'a>> {
450    use winnow::combinator::{alt, delimited, preceded};
451
452    preceded(
453        (
454            multispace0,
455            '@',
456            utils::tag_no_case("preamble"),
457            multispace0,
458        ),
459        alt((
460            delimited('{', parse_preamble_value, '}'),
461            delimited('(', parse_preamble_value, ')'),
462        )),
463    )
464    .parse_next(input)
465}
466
467/// Helper function to parse preamble value
468fn parse_preamble_value<'a>(input: &mut &'a str) -> PResult<'a, crate::Value<'a>> {
469    utils::ws(value::parse_value).parse_next(input)
470}
471
472/// Parse a comment (different formats)
473fn parse_comment<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
474    use winnow::ascii::till_line_ending;
475    use winnow::combinator::{alt, delimited, preceded};
476    use winnow::token::take_until;
477
478    alt((
479        // @comment{...}
480        preceded(
481            (multispace0, '@', utils::tag_no_case("comment"), multispace0),
482            alt((
483                delimited('{', lexer::balanced_braces, '}'),
484                delimited('(', lexer::balanced_parentheses, ')'),
485            )),
486        ),
487        // % line comment
488        preceded('%', till_line_ending),
489        // Any text before @ is considered a comment
490        take_until(1.., "@").verify(|s: &str| !s.trim().is_empty()),
491    ))
492    .parse_next(input)
493}
494
/// Convert a byte offset into a 1-based `(line, column)` pair.
///
/// Only characters that start before `pos` are counted. Each `'\n'` starts a
/// new line and resets the column to 1; every other character advances the
/// column by one, so columns count characters rather than bytes.
fn calculate_position(input: &str, pos: usize) -> (usize, usize) {
    input
        .char_indices()
        .take_while(|&(offset, _)| offset < pos)
        .fold((1, 1), |(line, column), (_, ch)| {
            if ch == '\n' {
                (line + 1, 1)
            } else {
                (line, column + 1)
            }
        })
}
514
/// Build a short excerpt of `input` for use in error messages.
///
/// Returns at most `max_len` characters from the start of `input`. When
/// `input` contains more characters than that, `...` is appended to show
/// that the excerpt was cut short.
fn get_snippet(input: &str, max_len: usize) -> String {
    let mut chars = input.chars();
    let mut snippet: String = chars.by_ref().take(max_len).collect();

    // Decide truncation by whether any characters are left over. Comparing
    // the byte length against `max_len` would wrongly flag multi-byte text
    // that fits within the character limit.
    if chars.next().is_some() {
        snippet.push_str("...");
    }

    snippet
}