bibtex_parser/parser/mod.rs
1//! BibTeX parser implementation using winnow
2//!
3//! This module provides both high-level and low-level APIs for parsing BibTeX files.
4//! Most users should use the high-level `Library` API, but the low-level API is available
5//! for advanced use cases that require access to raw parsed items.
6//!
7//! # Low-level API Example
8//!
9//! ```
10//! use bibtex_parser::parser::{parse_bibtex, ParsedItem};
11//!
12//! let input = r#"
13//! @string{ieee = "IEEE"}
14//! @preamble{"Test preamble"}
15//! % Line comment
16//! @article{test2024,
17//! author = "John Doe",
18//! title = ieee # " Article",
19//! year = 2024
20//! }
21//! "#;
22//!
23//! let items = parse_bibtex(input)?;
24//!
25//! for item in items {
26//! match item {
27//! ParsedItem::Entry(entry) => {
28//! println!("Found entry: {}", entry.key());
29//! // Variables are not expanded yet - title contains reference to 'ieee'
30//! },
31//! ParsedItem::String(name, value) => {
32//! println!("String definition: {} = {:?}", name, value);
33//! },
34//! ParsedItem::Preamble(value) => {
35//! println!("Preamble: {:?}", value);
36//! },
37//! ParsedItem::Comment(text) => {
38//! println!("Comment: {}", text.trim());
39//! },
40//! }
41//! }
42//! # Ok::<(), bibtex_parser::Error>(())
43//! ```
44
45pub mod delimiter;
46pub mod entry;
47pub mod lexer;
48pub mod simd;
49pub mod utils;
50pub mod value;
51
52use crate::{Error, Result, SourceMap, SourceSpan};
53use winnow::ascii::multispace0;
54use winnow::prelude::*;
55
56pub use entry::parse_entry;
57
58/// Internal parser result type
59pub type PResult<'a, O> = winnow::PResult<O, winnow::error::ContextError>;
60
61#[cold]
62#[inline(never)]
63pub(crate) fn backtrack_err() -> winnow::error::ErrMode<winnow::error::ContextError> {
64 winnow::error::ErrMode::Backtrack(winnow::error::ContextError::default())
65}
66
67#[cold]
68#[inline(never)]
69pub(crate) fn backtrack<O>() -> PResult<'static, O> {
70 Err(backtrack_err())
71}
72
73/// Parse a BibTeX file into raw items without expansion or processing
74///
75/// This is a low-level API that returns the raw parsed items before
76/// string variable expansion or other processing. Most users should
77/// use `Library::parse()` instead.
78///
79/// The returned items preserve the original structure:
80/// - String variables are not expanded
81/// - Concatenations are preserved as `Value::Concat`
82/// - Comments are included (both `%` line comments and `@comment{}`)
83/// - All items are returned in parse order
84///
85/// # Performance
86///
87/// This function uses the same zero-copy value parser as the high-level API,
88/// but returns raw items without string expansion or library indexing.
89///
90/// # Example
91///
92/// ```
93/// use bibtex_parser::parser::{parse_bibtex, ParsedItem};
94/// use bibtex_parser::Value;
95///
96/// let input = r#"
97/// @string{name = "John Doe"}
98/// @article{test,
99/// author = name,
100/// title = "Part 1" # " and " # "Part 2"
101/// }
102/// "#;
103///
104/// let items = parse_bibtex(input)?;
105///
106/// // Find the entry
107/// let entry = items.iter().find_map(|item| {
108/// if let ParsedItem::Entry(e) = item { Some(e) } else { None }
109/// }).unwrap();
110///
111/// // Author field contains unexpanded variable reference
112/// let author_field = entry.fields.iter()
113/// .find(|f| f.name == "author").unwrap();
114/// match &author_field.value {
115/// Value::Variable(var) => println!("Variable reference: {}", var),
116/// _ => {}
117/// }
118///
119/// // Title field contains concatenation structure
120/// let title_field = entry.fields.iter()
121/// .find(|f| f.name == "title").unwrap();
122/// match &title_field.value {
123/// Value::Concat(parts) => println!("Concatenation with {} parts", parts.len()),
124/// _ => {}
125/// }
126/// # Ok::<(), bibtex_parser::Error>(())
127/// ```
128#[inline]
129pub fn parse_bibtex(input: &str) -> Result<Vec<ParsedItem<'_>>> {
130 let mut items = Vec::new();
131 parse_bibtex_stream(input, |item| {
132 items.push(item);
133 Ok(())
134 })?;
135 Ok(items)
136}
137
138/// Parse a BibTeX file and stream raw items to a callback.
139///
140/// This avoids allocating an intermediate `Vec<ParsedItem>` when the caller
141/// can process items incrementally.
142#[inline]
143pub(crate) fn parse_bibtex_stream<'a, F>(input: &'a str, mut on_item: F) -> Result<()>
144where
145 F: FnMut(ParsedItem<'a>) -> Result<()>,
146{
147 let mut remaining = input;
148
149 loop {
150 // Skip ASCII whitespace without Unicode trimming overhead.
151 lexer::skip_whitespace(&mut remaining);
152 if remaining.is_empty() {
153 break;
154 }
155
156 // Try to parse an item (including comments)
157 match parse_item(&mut remaining) {
158 Ok(item) => on_item(item)?,
159 Err(e) => {
160 // Calculate line/column for error
161 let consumed = input.len() - remaining.len();
162 let (line, column) = calculate_position(input, consumed);
163
164 return Err(Error::ParseError {
165 line,
166 column,
167 message: format!("Failed to parse entry: {e}"),
168 snippet: Some(get_snippet(remaining, 40)),
169 });
170 }
171 }
172 }
173
174 Ok(())
175}
176
177/// Parse a BibTeX file and stream raw items with source spans.
178#[inline]
179pub(crate) fn parse_bibtex_stream_with_spans<'a, F>(input: &'a str, mut on_item: F) -> Result<()>
180where
181 F: FnMut(ParsedItem<'a>, SourceSpan, &'a str) -> Result<()>,
182{
183 let source_map = SourceMap::anonymous(input);
184 let mut remaining = input;
185
186 loop {
187 lexer::skip_whitespace(&mut remaining);
188 if remaining.is_empty() {
189 break;
190 }
191
192 let start = input.len() - remaining.len();
193 let before_item = remaining;
194 match parse_item(&mut remaining) {
195 Ok(item) => {
196 let end = input.len() - remaining.len();
197 let span = source_map.span(start, end);
198 on_item(item, span, &input[start..end])?;
199 }
200 Err(e) => {
201 let (line, column) = calculate_position(input, start);
202
203 return Err(Error::ParseError {
204 line,
205 column,
206 message: format!("Failed to parse entry: {e}"),
207 snippet: Some(get_snippet(before_item, 40)),
208 });
209 }
210 }
211 }
212
213 Ok(())
214}
215
216/// A raw parsed item from a BibTeX file before processing
217///
218/// This represents the different types of items that can appear in a BibTeX file,
219/// returned by the low-level `parse_bibtex()` function. These items are in their
220/// raw parsed form:
221///
222/// - String variables are not yet expanded
223/// - Field values preserve concatenation structure
224/// - Comments are preserved exactly as found
225/// - All items maintain their original order
226///
227/// # Examples
228///
229/// ```
230/// use bibtex_parser::parser::{parse_bibtex, ParsedItem};
231///
232/// let input = "@string{name = \"John\"}\n@article{key, author = name}";
233/// let items = parse_bibtex(input)?;
234///
235/// match &items[0] {
236/// ParsedItem::String(var_name, value) => {
237/// println!("String variable: {} = {:?}", var_name, value);
238/// },
239/// _ => {}
240/// }
241///
242/// match &items[1] {
243/// ParsedItem::Entry(entry) => {
244/// // The author field contains a variable reference, not the expanded value
245/// println!("Entry key: {}", entry.key());
246/// },
247/// _ => {}
248/// }
249/// # Ok::<(), bibtex_parser::Error>(())
250/// ```
251#[derive(Debug, Clone, PartialEq)]
252pub enum ParsedItem<'a> {
253 /// A bibliography entry (article, book, inproceedings, etc.)
254 ///
255 /// Contains the entry in its raw parsed form with field values that may
256 /// reference string variables or contain concatenations.
257 Entry(crate::Entry<'a>),
258
259 /// A string definition (`@string{name = value}`)
260 ///
261 /// Contains the variable name and its value. The value itself may contain
262 /// references to other string variables or concatenations.
263 String(&'a str, crate::Value<'a>),
264
265 /// A preamble (`@preamble{value}`)
266 ///
267 /// Contains the preamble value, which may reference string variables
268 /// or contain concatenations.
269 Preamble(crate::Value<'a>),
270
271 /// A comment (both `% line comment` and `@comment{...}`)
272 ///
273 /// Contains the raw comment text exactly as it appears in the source,
274 /// including any whitespace and formatting.
275 Comment(&'a str),
276}
277
278/// Parse a single item (entry, string, preamble, or comment) with optimized delimiter search
279#[inline]
280pub(crate) fn parse_item<'a>(input: &mut &'a str) -> PResult<'a, ParsedItem<'a>> {
281 // Use optimized delimiter search to find @ or handle as comment
282 let bytes = input.as_bytes();
283
284 // Fast path: if we don't start with @, check if this is a comment
285 if !bytes.is_empty() && bytes[0] != b'@' {
286 // Look for the next @ to treat everything before it as a comment
287 if let Some(at_pos) = delimiter::find_byte(bytes, b'@', 0) {
288 let comment = &input[..at_pos];
289 *input = &input[at_pos..];
290 return Ok(ParsedItem::Comment(comment));
291 }
292 // No @ found, entire remaining input is a comment
293 let comment = *input;
294 *input = "";
295 return Ok(ParsedItem::Comment(comment));
296 }
297
298 // We have an @ at the start. For regular entries, avoid checking all
299 // special keywords and dispatch directly based on the first letter.
300 let second = bytes.get(1).copied().unwrap_or_default();
301 match second | 0x20 {
302 b's' if starts_with_keyword(bytes, b"string") => {
303 parse_string(input).map(|(k, v)| ParsedItem::String(k, v))
304 }
305 b'p' if starts_with_keyword(bytes, b"preamble") => {
306 parse_preamble(input).map(ParsedItem::Preamble)
307 }
308 b'c' if starts_with_keyword(bytes, b"comment") => {
309 parse_comment(input).map(ParsedItem::Comment)
310 }
311 _ => entry::parse_entry_at(input).map(ParsedItem::Entry),
312 }
313}
314
315#[inline(never)]
316fn starts_with_keyword(input: &[u8], keyword: &[u8]) -> bool {
317 if input.first() != Some(&b'@') || input.len() < keyword.len() + 1 {
318 return false;
319 }
320
321 for (offset, &expected) in keyword.iter().enumerate() {
322 if (input[offset + 1] | 0x20) != expected {
323 return false;
324 }
325 }
326
327 if input.len() == keyword.len() + 1 {
328 return true;
329 }
330
331 !is_identifier_char(input[keyword.len() + 1])
332}
333
/// Report whether `byte` can be part of an identifier: an ASCII letter or
/// digit, or one of `_`, `-`, `:`, `.`.
#[inline]
const fn is_identifier_char(byte: u8) -> bool {
    byte.is_ascii_alphanumeric() || matches!(byte, b'_' | b'-' | b':' | b'.')
}
341
342/// Parse a @string definition
343fn parse_string<'a>(input: &mut &'a str) -> PResult<'a, (&'a str, crate::Value<'a>)> {
344 use winnow::combinator::{alt, delimited, preceded};
345
346 preceded(
347 (multispace0, '@', utils::tag_no_case("string"), multispace0),
348 alt((
349 delimited('{', parse_string_content, '}'),
350 delimited('(', parse_string_content, ')'),
351 )),
352 )
353 .parse_next(input)
354}
355
356/// Parse the content of a @string definition
357fn parse_string_content<'a>(input: &mut &'a str) -> PResult<'a, (&'a str, crate::Value<'a>)> {
358 use winnow::combinator::separated_pair;
359
360 separated_pair(
361 utils::ws(lexer::identifier),
362 utils::ws('='),
363 utils::ws(value::parse_value),
364 )
365 .parse_next(input)
366}
367
368/// Parse a @preamble
369fn parse_preamble<'a>(input: &mut &'a str) -> PResult<'a, crate::Value<'a>> {
370 use winnow::combinator::{alt, delimited, preceded};
371
372 preceded(
373 (
374 multispace0,
375 '@',
376 utils::tag_no_case("preamble"),
377 multispace0,
378 ),
379 alt((
380 delimited('{', parse_preamble_value, '}'),
381 delimited('(', parse_preamble_value, ')'),
382 )),
383 )
384 .parse_next(input)
385}
386
387/// Helper function to parse preamble value
388fn parse_preamble_value<'a>(input: &mut &'a str) -> PResult<'a, crate::Value<'a>> {
389 utils::ws(value::parse_value).parse_next(input)
390}
391
392/// Parse a comment (different formats)
393fn parse_comment<'a>(input: &mut &'a str) -> PResult<'a, &'a str> {
394 use winnow::ascii::till_line_ending;
395 use winnow::combinator::{alt, delimited, preceded};
396 use winnow::token::take_until;
397
398 alt((
399 // @comment{...}
400 preceded(
401 (multispace0, '@', utils::tag_no_case("comment"), multispace0),
402 alt((
403 delimited('{', lexer::balanced_braces, '}'),
404 delimited('(', lexer::balanced_parentheses, ')'),
405 )),
406 ),
407 // % line comment
408 preceded('%', till_line_ending),
409 // Any text before @ is considered a comment
410 take_until(1.., "@").verify(|s: &str| !s.trim().is_empty()),
411 ))
412 .parse_next(input)
413}
414
/// Convert a byte offset into a 1-based `(line, column)` pair.
///
/// Every character that starts before byte offset `pos` is counted: a `'\n'`
/// moves to the next line and resets the column to 1, any other character
/// advances the column by one. Columns therefore count characters, not bytes.
/// An offset past the end of `input` yields the position just after the last
/// character.
fn calculate_position(input: &str, pos: usize) -> (usize, usize) {
    input
        .char_indices()
        .take_while(|&(offset, _)| offset < pos)
        .fold((1, 1), |(line, column), (_, ch)| {
            if ch == '\n' {
                (line + 1, 1)
            } else {
                (line, column + 1)
            }
        })
}
434
/// Build a short excerpt of `input` for use in error messages.
///
/// Returns at most `max_len` characters (Unicode scalar values) from the start
/// of `input`. The `...` suffix is appended only when characters were actually
/// cut off, so multi-byte text that fits within `max_len` characters is
/// returned unchanged rather than being marked as truncated because of its
/// byte length.
fn get_snippet(input: &str, max_len: usize) -> String {
    // The character at index `max_len`, if there is one, is the first one to
    // drop; its byte offset is a valid slice boundary.
    match input.char_indices().nth(max_len) {
        Some((cut, _)) => format!("{}...", &input[..cut]),
        None => input.to_owned(),
    }
}