Skip to main content

bibtex_parser/parser/
entry.rs

1//! Entry parsing for BibTeX
2
3use super::{lexer, value, PResult};
4use crate::model::{Entry, EntryType, Field};
5use crate::{EntryDelimiter, Value, ValueDelimiter};
6use std::borrow::Cow;
7
/// Initial capacity handed to `Vec::with_capacity` for the per-entry field vectors built by
/// `parse_fields` and `parse_fields_with_locations`.
// NOTE(review): the rationale for the specific value 17 is not visible in this file.
const DEFAULT_FIELD_CAPACITY: usize = 17;
9
/// A parsed [`Entry`] together with the source positions recorded while parsing it.
///
/// Every `(usize, usize)` pair is a `(start, end)` pair of offsets computed with
/// `source_offset`, i.e. relative to the `absolute_start` supplied by the caller.
#[derive(Debug, Clone)]
pub(crate) struct LocatedEntry<'a> {
    /// The parsed entry itself.
    pub(crate) entry: Entry<'a>,
    /// Offsets taken immediately before and after the entry-type identifier is lexed.
    pub(crate) entry_type: (usize, usize),
    /// Offsets taken immediately before and after the citation key is lexed.
    pub(crate) key: (usize, usize),
    /// Which opening delimiter (`{` or `(`) introduced the entry body.
    pub(crate) delimiter: EntryDelimiter,
    /// One location record per parsed field, in the same order as `entry.fields`.
    pub(crate) fields: Vec<LocatedField>,
}
18
/// Source positions recorded for a single field by `parse_fields_with_locations`.
///
/// Every pair is `(start, end)`, with offsets computed by `source_offset`.
#[derive(Debug, Clone, Copy)]
pub(crate) struct LocatedField {
    /// From the start of the field name to the end of the value, or to one past the
    /// separating comma when a comma follows the value.
    pub(crate) whole: (usize, usize),
    /// Offsets taken immediately before and after the field name is lexed.
    pub(crate) name: (usize, usize),
    /// From the first byte of the value to its end with trailing ASCII whitespace trimmed.
    pub(crate) value: (usize, usize),
    /// How the value was delimited, as classified by `value_delimiter_from_parse`.
    pub(crate) value_delimiter: ValueDelimiter,
}
26
/// Parse a bibliography entry.
///
/// Runs `lexer::skip_whitespace` on `input` first and then delegates to
/// [`parse_entry_at`], which requires the next byte to be `@`.
#[inline]
pub fn parse_entry<'a>(input: &mut &'a str) -> PResult<'a, Entry<'a>> {
    lexer::skip_whitespace(input);
    parse_entry_at(input)
}
33
34/// Parse a bibliography entry when `input` is already positioned at `@`.
35#[inline]
36pub fn parse_entry_at<'a>(input: &mut &'a str) -> PResult<'a, Entry<'a>> {
37    match input.as_bytes().first() {
38        Some(b'@') => {
39            *input = &input[1..];
40            parse_entry_content(input)
41        }
42        _ => super::backtrack(),
43    }
44}
45
46#[inline]
47pub(crate) fn parse_entry_at_with_locations<'a>(
48    input: &mut &'a str,
49    absolute_start: usize,
50) -> PResult<'a, LocatedEntry<'a>> {
51    let root = *input;
52    match input.as_bytes().first() {
53        Some(b'@') => {
54            *input = &input[1..];
55            parse_entry_content_with_locations(input, root, absolute_start)
56        }
57        _ => super::backtrack(),
58    }
59}
60
61#[inline]
62fn parse_entry_content<'a>(input: &mut &'a str) -> PResult<'a, Entry<'a>> {
63    let entry_type_str = lexer::identifier(input)?;
64    let entry_type = EntryType::parse(entry_type_str);
65
66    lexer::skip_whitespace(input);
67
68    let closing_delimiter = match input.as_bytes().first() {
69        Some(b'{') => b'}',
70        Some(b'(') => b')',
71        _ => return super::backtrack(),
72    };
73    *input = &input[1..];
74
75    parse_entry_body(input, entry_type, closing_delimiter)
76}
77
78#[inline]
79fn parse_entry_content_with_locations<'a>(
80    input: &mut &'a str,
81    root: &'a str,
82    absolute_start: usize,
83) -> PResult<'a, LocatedEntry<'a>> {
84    let entry_type_start = source_offset(root, input, absolute_start);
85    let entry_type_str = lexer::identifier(input)?;
86    let entry_type_end = source_offset(root, input, absolute_start);
87    let entry_type = EntryType::parse(entry_type_str);
88
89    lexer::skip_whitespace(input);
90
91    let opening = match input.as_bytes().first() {
92        Some(b'{') => b'{',
93        Some(b'(') => b'(',
94        _ => return super::backtrack(),
95    };
96    let (delimiter, closing_delimiter) = match opening {
97        b'{' => (EntryDelimiter::Braces, b'}'),
98        b'(' => (EntryDelimiter::Parentheses, b')'),
99        _ => unreachable!(),
100    };
101    *input = &input[1..];
102
103    parse_entry_body_with_locations(
104        input,
105        root,
106        absolute_start,
107        entry_type,
108        (entry_type_start, entry_type_end),
109        delimiter,
110        closing_delimiter,
111    )
112}
113
114/// Parse the body of an entry (key and fields)
115#[inline]
116fn parse_entry_body<'a>(
117    input: &mut &'a str,
118    entry_type: EntryType<'a>,
119    closing_delimiter: u8,
120) -> PResult<'a, Entry<'a>> {
121    lexer::skip_whitespace(input);
122    let key = lexer::identifier(input)?;
123
124    lexer::skip_whitespace(input);
125    expect_byte(input, b',')?;
126
127    let fields = parse_fields(input, closing_delimiter)?;
128    expect_byte(input, closing_delimiter)?;
129
130    Ok(Entry {
131        ty: entry_type,
132        key: Cow::Borrowed(key),
133        fields,
134    })
135}
136
137#[inline]
138fn parse_entry_body_with_locations<'a>(
139    input: &mut &'a str,
140    root: &'a str,
141    absolute_start: usize,
142    entry_type: EntryType<'a>,
143    entry_type_location: (usize, usize),
144    delimiter: EntryDelimiter,
145    closing_delimiter: u8,
146) -> PResult<'a, LocatedEntry<'a>> {
147    lexer::skip_whitespace(input);
148    let key_start = source_offset(root, input, absolute_start);
149    let key = lexer::identifier(input)?;
150    let key_end = source_offset(root, input, absolute_start);
151
152    lexer::skip_whitespace(input);
153    expect_byte(input, b',')?;
154
155    let (fields, field_locations) =
156        parse_fields_with_locations(input, root, absolute_start, closing_delimiter)?;
157    expect_byte(input, closing_delimiter)?;
158
159    Ok(LocatedEntry {
160        entry: Entry {
161            ty: entry_type,
162            key: Cow::Borrowed(key),
163            fields,
164        },
165        entry_type: entry_type_location,
166        key: (key_start, key_end),
167        delimiter,
168        fields: field_locations,
169    })
170}
171
172#[inline]
173fn expect_byte<'a>(input: &mut &'a str, byte: u8) -> PResult<'a, ()> {
174    match input.as_bytes().first() {
175        Some(&b) if b == byte => {
176            *input = &input[1..];
177            Ok(())
178        }
179        _ => super::backtrack(),
180    }
181}
182
183/// Parse all fields in an entry.
184#[inline]
185fn parse_fields<'a>(input: &mut &'a str, closing_delimiter: u8) -> PResult<'a, Vec<Field<'a>>> {
186    let mut fields = Vec::with_capacity(DEFAULT_FIELD_CAPACITY);
187
188    while let Some(first) = lexer::skip_whitespace_peek(input) {
189        if first == closing_delimiter {
190            break;
191        }
192
193        let name = lexer::field_name(input)?;
194        lexer::skip_whitespace(input);
195        expect_byte(input, b'=')?;
196        lexer::skip_whitespace(input);
197        let value = value::parse_value_field(input)?;
198
199        fields.push(Field {
200            name: Cow::Borrowed(name),
201            value,
202        });
203
204        match input.as_bytes().first() {
205            Some(b',') => {
206                *input = &input[1..];
207            }
208            Some(&b) if b == closing_delimiter => {}
209            _ => return super::backtrack(),
210        }
211    }
212
213    let max_reasonable_capacity = (fields.len() * 2).max(8);
214    if fields.capacity() > max_reasonable_capacity {
215        fields.shrink_to_fit();
216    }
217
218    Ok(fields)
219}
220
221#[inline]
222fn parse_fields_with_locations<'a>(
223    input: &mut &'a str,
224    root: &'a str,
225    absolute_start: usize,
226    closing_delimiter: u8,
227) -> PResult<'a, (Vec<Field<'a>>, Vec<LocatedField>)> {
228    let mut fields = Vec::with_capacity(DEFAULT_FIELD_CAPACITY);
229    let mut locations = Vec::with_capacity(DEFAULT_FIELD_CAPACITY);
230    let root_bytes = root.as_bytes();
231
232    while let Some(first) = lexer::skip_whitespace_peek(input) {
233        if first == closing_delimiter {
234            break;
235        }
236
237        let field_start = source_offset(root, input, absolute_start);
238        let name_start = field_start;
239        let name = lexer::field_name(input)?;
240        let name_end = source_offset(root, input, absolute_start);
241
242        lexer::skip_whitespace(input);
243        expect_byte(input, b'=')?;
244        lexer::skip_whitespace(input);
245
246        let value_start = source_offset(root, input, absolute_start);
247        let parsed_value = value::parse_value_field(input)?;
248        let value_boundary = source_offset(root, input, absolute_start);
249        let value_end = trim_ascii_whitespace_end_absolute(
250            root_bytes,
251            absolute_start,
252            value_start,
253            value_boundary,
254        );
255        let value_delimiter = value_delimiter_from_parse(
256            &parsed_value,
257            root_bytes,
258            absolute_start,
259            value_start,
260            value_end,
261        );
262
263        let mut whole_end = value_end;
264        match input.as_bytes().first() {
265            Some(b',') => {
266                whole_end = source_offset(root, input, absolute_start) + 1;
267                *input = &input[1..];
268            }
269            Some(&b) if b == closing_delimiter => {}
270            _ => return super::backtrack(),
271        }
272
273        fields.push(Field {
274            name: Cow::Borrowed(name),
275            value: parsed_value,
276        });
277        locations.push(LocatedField {
278            whole: (field_start, whole_end),
279            name: (name_start, name_end),
280            value: (value_start, value_end),
281            value_delimiter,
282        });
283    }
284
285    let max_reasonable_capacity = (fields.len() * 2).max(8);
286    if fields.capacity() > max_reasonable_capacity {
287        fields.shrink_to_fit();
288    }
289    if locations.capacity() > max_reasonable_capacity {
290        locations.shrink_to_fit();
291    }
292
293    Ok((fields, locations))
294}
295
/// Offset of the current parse position, derived from how much of `root` is left.
///
/// Computes `absolute_start + root.len() - input.len()`. When `input` is the unconsumed
/// tail of `root`, this is `absolute_start` plus the number of bytes consumed so far.
#[inline]
const fn source_offset(root: &str, input: &str, absolute_start: usize) -> usize {
    let end_of_root = absolute_start + root.len();
    end_of_root - input.len()
}
300
/// Move `end` backwards over trailing ASCII whitespace, never going below `start`.
///
/// `start` and `end` are absolute offsets; `bytes[0]` corresponds to offset
/// `absolute_start`. The return value is again an absolute offset.
///
/// If the requested range cannot be resolved inside `bytes` (an offset lies before
/// `absolute_start`, `end` lies beyond the slice, or `start > end`), `end` is returned
/// unchanged instead of panicking on unchecked arithmetic or indexing.
#[inline]
fn trim_ascii_whitespace_end_absolute(
    bytes: &[u8],
    absolute_start: usize,
    start: usize,
    end: usize,
) -> usize {
    // Translate both offsets into `bytes` and borrow the window, all with checked steps.
    let window = start
        .checked_sub(absolute_start)
        .zip(end.checked_sub(absolute_start))
        .and_then(|(lo, hi)| bytes.get(lo..hi));

    match window {
        Some(window) => {
            // Number of leading bytes to keep: one past the last non-whitespace byte,
            // or zero when the window is empty or entirely whitespace.
            let kept = window
                .iter()
                .rposition(|b| !b.is_ascii_whitespace())
                .map_or(0, |last| last + 1);
            start + kept
        }
        None => end,
    }
}
315
316#[inline]
317fn value_delimiter_from_parse(
318    value: &Value<'_>,
319    bytes: &[u8],
320    absolute_start: usize,
321    start: usize,
322    end: usize,
323) -> ValueDelimiter {
324    if matches!(value, Value::Concat(_)) {
325        return ValueDelimiter::Concatenation;
326    }
327
328    let start = start - absolute_start;
329    let end = end - absolute_start;
330    match bytes.get(start..end).and_then(|raw| raw.first()).copied() {
331        Some(b'{') => ValueDelimiter::Braces,
332        Some(b'"') => ValueDelimiter::Quotes,
333        _ => ValueDelimiter::Bare,
334    }
335}
336
#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::Value;
    use std::borrow::Cow;

    /// Shared expectations for the Einstein fixture, which is parsed once with brace
    /// delimiters and once with parentheses.
    fn assert_einstein_article(entry: &Entry<'_>) {
        assert_eq!(entry.ty, EntryType::Article);
        assert_eq!(entry.key, Cow::Borrowed("einstein1905"));
        assert_eq!(entry.fields.len(), 3);

        let author = &entry.fields[0];
        assert_eq!(author.name, "author");
        assert_eq!(
            author.value,
            Value::Literal(Cow::Borrowed("Albert Einstein"))
        );

        let title = &entry.fields[1];
        assert_eq!(title.name, "title");
        assert_eq!(
            title.value,
            Value::Literal(Cow::Borrowed("Zur Elektrodynamik bewegter Körper"))
        );

        let year = &entry.fields[2];
        assert_eq!(year.name, "year");
        assert_eq!(year.value, Value::Number(1905));
    }

    #[test]
    fn test_parse_simple_entry() {
        let mut source = r#"@article{einstein1905,
            author = "Albert Einstein",
            title = {Zur Elektrodynamik bewegter Körper},
            year = 1905
        }"#;

        let entry = parse_entry(&mut source).unwrap();
        assert_einstein_article(&entry);
    }

    #[test]
    fn test_parse_entry_with_concatenation() {
        let mut source = r#"@misc{test,
            author = name # " et al.",
            note = "See " # url
        }"#;

        let entry = parse_entry(&mut source).unwrap();
        assert_eq!(entry.ty, EntryType::Misc);
        assert_eq!(entry.key, Cow::Borrowed("test"));
        assert_eq!(entry.fields.len(), 2);

        // The first field must come back as a two-part concatenation.
        let parts = match &entry.fields[0].value {
            Value::Concat(parts) => parts,
            _ => panic!("Expected concatenated value"),
        };
        assert_eq!(parts.len(), 2);
        assert_eq!(parts[0], Value::Variable(Cow::Borrowed("name")));
        assert_eq!(parts[1], Value::Literal(Cow::Borrowed(" et al.")));
    }

    #[test]
    fn test_parse_entry_with_trailing_comma() {
        let mut source = r#"@book{knuth1984,
            author = "Donald Knuth",
            title = "The TeXbook",
            year = 1984,
        }"#;

        let entry = parse_entry(&mut source).unwrap();
        assert_eq!(entry.fields.len(), 3);
    }

    #[test]
    fn test_parse_entry_with_parentheses() {
        let mut source = r#"@article(einstein1905,
            author = "Albert Einstein",
            title = {Zur Elektrodynamik bewegter Körper},
            year = 1905
        )"#;

        let entry = parse_entry(&mut source).unwrap();
        assert_einstein_article(&entry);
    }

    #[test]
    fn test_parse_entry_mixed_delimiters() {
        // The entry is wrapped in parentheses while a field value still uses braces.
        let mut source = r#"@book(test2024,
            title = {A Title with {Nested} Braces},
            author = "John Doe"
        )"#;

        let entry = parse_entry(&mut source).unwrap();
        assert_eq!(entry.ty, EntryType::Book);
        assert_eq!(entry.key, Cow::Borrowed("test2024"));
        assert_eq!(entry.fields.len(), 2);

        let expected = [
            ("title", "A Title with {Nested} Braces"),
            ("author", "John Doe"),
        ];
        for (field, &(name, text)) in entry.fields.iter().zip(expected.iter()) {
            assert_eq!(field.name, name);
            assert_eq!(field.value, Value::Literal(Cow::Borrowed(text)));
        }
    }
}