Skip to main content

bibtex_parser/parser/
mod.rs

1//! BibTeX parser implementation using winnow
2//!
3//! This module provides both high-level and low-level APIs for parsing BibTeX files.
4//! Most users should use the high-level `Library` API, but the low-level API is available
5//! for advanced use cases that require access to raw parsed items.
6//!
7//! # Low-level API Example
8//!
9//! ```
10//! use bibtex_parser::parser::{parse_bibtex, ParsedItem};
11//!
12//! let input = r#"
13//!     @string{ieee = "IEEE"}
14//!     @preamble{"Test preamble"}
15//!     % Line comment
16//!     @article{test2024,
17//!         author = "John Doe",
18//!         title = ieee # " Article",
19//!         year = 2024
20//!     }
21//! "#;
22//!
23//! let items = parse_bibtex(input)?;
24//!
25//! for item in items {
26//!     match item {
27//!         ParsedItem::Entry(entry) => {
28//!             println!("Found entry: {}", entry.key());
29//!             // Variables are not expanded yet - title contains reference to 'ieee'
30//!         },
31//!         ParsedItem::String(name, value) => {
32//!             println!("String definition: {} = {:?}", name, value);
33//!         },
34//!         ParsedItem::Preamble(value) => {
35//!             println!("Preamble: {:?}", value);
36//!         },
37//!         ParsedItem::Comment(text) => {
38//!             println!("Comment: {}", text.trim());
39//!         },
40//!     }
41//! }
42//! # Ok::<(), bibtex_parser::Error>(())
43//! ```
44
45pub mod delimiter;
46pub mod entry;
47pub mod lexer;
48pub mod simd;
49pub mod utils;
50pub mod value;
51
52use crate::{Error, Result, SourceMap, SourceSpan};
53use winnow::ascii::multispace0;
54use winnow::prelude::*;
55
56pub use entry::parse_entry;
57
/// Internal parser result type.
///
/// Shorthand for `winnow::PResult` with the error type fixed to
/// `winnow::error::ContextError`. The `'a` parameter does not appear on the
/// right-hand side of the alias; parsers in this file pass the input lifetime
/// (or `'static`) there.
pub type PResult<'a, O> = winnow::PResult<O, winnow::error::ContextError>;
60
61#[cold]
62#[inline(never)]
63pub(crate) fn backtrack_err() -> winnow::error::ErrMode<winnow::error::ContextError> {
64    winnow::error::ErrMode::Backtrack(winnow::error::ContextError::default())
65}
66
67#[cold]
68#[inline(never)]
69pub(crate) fn backtrack<O>() -> PResult<'static, O> {
70    Err(backtrack_err())
71}
72
73/// Parse a BibTeX file into raw items without expansion or processing
74///
75/// This is a low-level API that returns the raw parsed items before
76/// string variable expansion or other processing. Most users should
77/// use `Library::parse()` instead.
78///
79/// The returned items preserve the original structure:
80/// - String variables are not expanded
81/// - Concatenations are preserved as `Value::Concat`
82/// - Comments are included (both `%` line comments and `@comment{}`)
83/// - All items are returned in parse order
84///
85/// # Performance
86///
87/// This function uses the same zero-copy value parser as the high-level API,
88/// but returns raw items without string expansion or library indexing.
89///
90/// # Example
91///
92/// ```
93/// use bibtex_parser::parser::{parse_bibtex, ParsedItem};
94/// use bibtex_parser::Value;
95///
96/// let input = r#"
97///     @string{name = "John Doe"}
98///     @article{test,
99///         author = name,
100///         title = "Part 1" # " and " # "Part 2"
101///     }
102/// "#;
103///
104/// let items = parse_bibtex(input)?;
105///
106/// // Find the entry
107/// let entry = items.iter().find_map(|item| {
108///     if let ParsedItem::Entry(e) = item { Some(e) } else { None }
109/// }).unwrap();
110///
111/// // Author field contains unexpanded variable reference
112/// let author_field = entry.fields.iter()
113///     .find(|f| f.name == "author").unwrap();
114/// match &author_field.value {
115///     Value::Variable(var) => println!("Variable reference: {}", var),
116///     _ => {}
117/// }
118///
119/// // Title field contains concatenation structure
120/// let title_field = entry.fields.iter()
121///     .find(|f| f.name == "title").unwrap();
122/// match &title_field.value {
123///     Value::Concat(parts) => println!("Concatenation with {} parts", parts.len()),
124///     _ => {}
125/// }
126/// # Ok::<(), bibtex_parser::Error>(())
127/// ```
128#[inline]
129pub fn parse_bibtex(input: &str) -> Result<Vec<ParsedItem<'_>>> {
130    let mut items = Vec::new();
131    parse_bibtex_stream(input, |item| {
132        items.push(item);
133        Ok(())
134    })?;
135    Ok(items)
136}
137
138/// Parse a BibTeX file and stream raw items to a callback.
139///
140/// This avoids allocating an intermediate `Vec<ParsedItem>` when the caller
141/// can process items incrementally.
142#[inline]
143pub(crate) fn parse_bibtex_stream<'a, F>(input: &'a str, mut on_item: F) -> Result<()>
144where
145    F: FnMut(ParsedItem<'a>) -> Result<()>,
146{
147    let mut remaining = input;
148
149    loop {
150        // Skip ASCII whitespace without Unicode trimming overhead.
151        lexer::skip_whitespace(&mut remaining);
152        if remaining.is_empty() {
153            break;
154        }
155
156        // Try to parse an item (including comments)
157        match parse_item(&mut remaining) {
158            Ok(item) => on_item(item)?,
159            Err(e) => {
160                // Calculate line/column for error
161                let consumed = input.len() - remaining.len();
162                let (line, column) = calculate_position(input, consumed);
163
164                return Err(Error::ParseError {
165                    line,
166                    column,
167                    message: format!("Failed to parse entry: {e}"),
168                    snippet: Some(get_snippet(remaining, 40)),
169                });
170            }
171        }
172    }
173
174    Ok(())
175}
176
177/// Parse a BibTeX file and stream raw items with source spans.
178#[inline]
179pub(crate) fn parse_bibtex_stream_with_spans<'a, F>(input: &'a str, mut on_item: F) -> Result<()>
180where
181    F: FnMut(ParsedItem<'a>, SourceSpan, &'a str) -> Result<()>,
182{
183    let source_map = SourceMap::anonymous(input);
184    let mut remaining = input;
185
186    loop {
187        lexer::skip_whitespace(&mut remaining);
188        if remaining.is_empty() {
189            break;
190        }
191
192        let start = input.len() - remaining.len();
193        let before_item = remaining;
194        match parse_item(&mut remaining) {
195            Ok(item) => {
196                let end = input.len() - remaining.len();
197                let span = source_map.span(start, end);
198                on_item(item, span, &input[start..end])?;
199            }
200            Err(e) => {
201                let (line, column) = calculate_position(input, start);
202
203                return Err(Error::ParseError {
204                    line,
205                    column,
206                    message: format!("Failed to parse entry: {e}"),
207                    snippet: Some(get_snippet(before_item, 40)),
208                });
209            }
210        }
211    }
212
213    Ok(())
214}
215
/// A raw parsed item from a BibTeX file before processing
///
/// This represents the different kinds of items that can appear in a BibTeX
/// file, as returned by the low-level `parse_bibtex()` function. Items are in
/// their raw parsed form:
///
/// - String variables are not yet expanded
/// - Field values preserve concatenation structure
/// - Comments are preserved as borrowed slices of the input
/// - All items maintain their original order
///
/// All borrowed data (`&'a str` and the `'a` inside `Entry`/`Value`) uses the
/// lifetime of the input string handed to the parser.
///
/// # Examples
///
/// ```
/// use bibtex_parser::parser::{parse_bibtex, ParsedItem};
///
/// let input = "@string{name = \"John\"}\n@article{key, author = name}";
/// let items = parse_bibtex(input)?;
///
/// match &items[0] {
///     ParsedItem::String(var_name, value) => {
///         println!("String variable: {} = {:?}", var_name, value);
///     },
///     _ => {}
/// }
///
/// match &items[1] {
///     ParsedItem::Entry(entry) => {
///         // The author field contains a variable reference, not the expanded value
///         println!("Entry key: {}", entry.key());
///     },
///     _ => {}
/// }
/// # Ok::<(), bibtex_parser::Error>(())
/// ```
#[derive(Debug, Clone, PartialEq)]
pub enum ParsedItem<'a> {
    /// A bibliography entry (article, book, inproceedings, etc.)
    ///
    /// Contains the entry in its raw parsed form with field values that may
    /// reference string variables or contain concatenations.
    Entry(crate::Entry<'a>),

    /// A string definition (`@string{name = value}` or `@string(name = value)`)
    ///
    /// Contains the variable name and its value. The value itself may contain
    /// references to other string variables or concatenations.
    String(&'a str, crate::Value<'a>),

    /// A preamble (`@preamble{value}` or `@preamble(value)`)
    ///
    /// Contains the preamble value, which may reference string variables
    /// or contain concatenations.
    Preamble(crate::Value<'a>),

    /// A comment (both `% line comment` and `@comment{...}`)
    ///
    /// Any input that does not start with `@` is reported as a comment: the
    /// slice runs from the current position up to (not including) the next
    /// `@`, or to the end of input when there is none, so trailing whitespace
    /// and line breaks are part of the text. For `@comment{...}` /
    /// `@comment(...)` the text is what the balanced-delimiter lexer yields
    /// between the outer delimiters (presumably the raw inner text — confirm
    /// in `lexer`).
    Comment(&'a str),
}
277
278/// Parse a single item (entry, string, preamble, or comment) with optimized delimiter search
279#[inline]
280pub(crate) fn parse_item<'a>(input: &mut &'a str) -> PResult<'a, ParsedItem<'a>> {
281    // Use optimized delimiter search to find @ or handle as comment
282    let bytes = input.as_bytes();
283
284    // Fast path: if we don't start with @, check if this is a comment
285    if !bytes.is_empty() && bytes[0] != b'@' {
286        // Look for the next @ to treat everything before it as a comment
287        if let Some(at_pos) = delimiter::find_byte(bytes, b'@', 0) {
288            let comment = &input[..at_pos];
289            *input = &input[at_pos..];
290            return Ok(ParsedItem::Comment(comment));
291        }
292        // No @ found, entire remaining input is a comment
293        let comment = *input;
294        *input = "";
295        return Ok(ParsedItem::Comment(comment));
296    }
297
298    // We have an @ at the start. For regular entries, avoid checking all
299    // special keywords and dispatch directly based on the first letter.
300    let second = bytes.get(1).copied().unwrap_or_default();
301    match second | 0x20 {
302        b's' if starts_with_keyword(bytes, b"string") => {
303            parse_string(input).map(|(k, v)| ParsedItem::String(k, v))
304        }
305        b'p' if starts_with_keyword(bytes, b"preamble") => {
306            parse_preamble(input).map(ParsedItem::Preamble)
307        }
308        b'c' if starts_with_keyword(bytes, b"comment") => {
309            parse_comment(input).map(ParsedItem::Comment)
310        }
311        _ => entry::parse_entry_at(input).map(ParsedItem::Entry),
312    }
313}
314
315#[inline(never)]
316fn starts_with_keyword(input: &[u8], keyword: &[u8]) -> bool {
317    if input.first() != Some(&b'@') || input.len() < keyword.len() + 1 {
318        return false;
319    }
320
321    for (offset, &expected) in keyword.iter().enumerate() {
322        if (input[offset + 1] | 0x20) != expected {
323            return false;
324        }
325    }
326
327    if input.len() == keyword.len() + 1 {
328        return true;
329    }
330
331    !is_identifier_char(input[keyword.len() + 1])
332}
333
/// Return `true` for bytes treated as identifier characters: ASCII letters,
/// ASCII digits, and the punctuation `_`, `-`, `:` and `.`.
#[inline]
const fn is_identifier_char(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b':' | b'.')
}
341
342/// Parse a @string definition
343fn parse_string<'a>(input: &mut &'a str) -> PResult<'a, (&'a str, crate::Value<'a>)> {
344    use winnow::combinator::{alt, delimited, preceded};
345
346    preceded(
347        (multispace0, '@', utils::tag_no_case("string"), multispace0),
348        alt((
349            delimited('{', parse_string_content, '}'),
350            delimited('(', parse_string_content, ')'),
351        )),
352    )
353    .parse_next(input)
354}
355
356/// Parse the content of a @string definition
357fn parse_string_content<'a>(input: &mut &'a str) -> PResult<'a, (&'a str, crate::Value<'a>)> {
358    use winnow::combinator::separated_pair;
359
360    separated_pair(
361        utils::ws(lexer::identifier),
362        utils::ws('='),
363        utils::ws(value::parse_value),
364    )
365    .parse_next(input)
366}
367
368/// Parse a @preamble
369fn parse_preamble<'a>(input: &mut &'a str) -> PResult<'a, crate::Value<'a>> {
370    use winnow::combinator::{alt, delimited, preceded};
371
372    preceded(
373        (
374            multispace0,
375            '@',
376            utils::tag_no_case("preamble"),
377            multispace0,
378        ),
379        alt((
380            delimited('{', parse_preamble_value, '}'),
381            delimited('(', parse_preamble_value, ')'),
382        )),
383    )
384    .parse_next(input)
385}
386
387/// Helper function to parse preamble value
388fn parse_preamble_value<'a>(input: &mut &'a str) -> PResult<'a, crate::Value<'a>> {
389    utils::ws(value::parse_value).parse_next(input)
390}
391
392/// Parse a comment (different formats)
393fn parse_comment<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
394    use winnow::ascii::till_line_ending;
395    use winnow::combinator::{alt, delimited, preceded};
396    use winnow::token::take_until;
397
398    alt((
399        // @comment{...}
400        preceded(
401            (multispace0, '@', utils::tag_no_case("comment"), multispace0),
402            alt((
403                delimited('{', lexer::balanced_braces, '}'),
404                delimited('(', lexer::balanced_parentheses, ')'),
405            )),
406        ),
407        // % line comment
408        preceded('%', till_line_ending),
409        // Any text before @ is considered a comment
410        take_until(1.., "@").verify(|s: &str| !s.trim().is_empty()),
411    ))
412    .parse_next(input)
413}
414
/// Convert a byte offset into a 1-based `(line, column)` pair.
///
/// Every character that starts before `pos` is counted: `'\n'` moves to the
/// next line and resets the column to 1, any other character advances the
/// column by one (columns count characters, not bytes). An offset past the
/// end of `input` yields the position just after the last character.
fn calculate_position(input: &str, pos: usize) -> (usize, usize) {
    input
        .char_indices()
        .take_while(|&(offset, _)| offset < pos)
        .fold((1, 1), |(line, column), (_, ch)| {
            if ch == '\n' {
                (line + 1, 1)
            } else {
                (line, column + 1)
            }
        })
}
434
/// Get a snippet of input for error messages.
///
/// Returns the first `max_len` characters of `input` (Unicode scalar values,
/// not bytes). `...` is appended only when the input actually contains more
/// than `max_len` characters, i.e. when something was cut off.
fn get_snippet(input: &str, max_len: usize) -> String {
    // The character at index `max_len`, if any, is the first one that does not
    // fit; its byte offset is a valid place to cut. Deciding by characters
    // rather than by byte length keeps short multi-byte input from being
    // marked as truncated.
    match input.char_indices().nth(max_len) {
        Some((cut, _)) => format!("{}...", &input[..cut]),
        None => input.to_owned(),
    }
}