bibtex-parser 0.3.1

BibTeX parser for Rust
The regression tests below double as end-to-end documentation: they exercise the strict, tolerant, raw-preserving, and streaming parse modes, document round-tripping, and large-input handling.
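
A minimal strict parse, shown as a sketch that mirrors the calls the tests below exercise:

use bibtex_parser::Parser;

let library = Parser::new()
    .parse(r#"@article{key, title = "Quick Start", year = 2026}"#)
    .unwrap();
assert_eq!(library.entries().len(), 1);
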
use bibtex_parser::{
    document_to_string, parse_bibtex, CorpusSource, DiagnosticCode, EntryType, ParseEvent,
    ParseFlow, ParseStatus, Parser, Value,
};

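// One fixture covering the common shapes: @string abbreviations, Unicode
// text, nested and escaped braces, `#` concatenation, a bare month
// abbreviation, and a 64-name author list.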
#[test]
fn regression_fixture_covers_common_bibtex_shapes() {
    let input = format!(
        r#"@string{{conf = "ToolConf"}}
@online{{unicode,
  author = "{}",
  title = "Café {{nested {{braces}}}} and escaped \{{ brace",
  date = "2026-05-13",
  url = "https://example.test",
  month = may
}}
@software{{concat,
  author = "Jane Doe and {{Literal Research Lab}}",
  title = "Part " # "One",
  journaltitle = conf,
  year = 2026
}}"#,
        author_list(64)
    );

    let library = Parser::new().parse(&input).unwrap();
    assert_eq!(library.entries().len(), 2);
    assert_eq!(library.entries()[0].ty, EntryType::Online);
    assert_eq!(library.entries()[1].ty, EntryType::Software);
    assert_eq!(library.entries()[0].authors().len(), 64);
    assert_eq!(
        library.entries()[0].date_parts().unwrap().unwrap().month,
        Some(5)
    );
    assert_eq!(
        library.entries()[0].url().as_deref(),
        Some("https://example.test")
    );

    let raw_items = parse_bibtex(&input).unwrap();
    let concat_entry = raw_items.iter().find_map(|item| {
        if let bibtex_parser::ParsedItem::Entry(entry) = item {
            (entry.key() == "concat").then_some(entry)
        } else {
            None
        }
    });
    let title = concat_entry
        .unwrap()
        .field("title")
        .expect("title field should exist");
    assert!(matches!(title.value, Value::Concat(_)));
}

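// Tolerant mode should skip the unparseable first block, recover the
// remaining two entries, and attach source information to every diagnostic.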
#[test]
fn malformed_recovery_regression_asserts_diagnostics() {
    let input = r#"@article{bad title = "missing comma"}
@article{partial, title = "Recovered"
@book{valid, title = "After"}"#;

    let document = Parser::new()
        .tolerant()
        .preserve_raw()
        .parse_document(input)
        .unwrap();

    assert_eq!(document.status(), ParseStatus::Partial);
    assert_eq!(
        document
            .entries()
            .iter()
            .map(|entry| entry.key())
            .collect::<Vec<_>>(),
        ["partial", "valid"]
    );
    assert_eq!(document.failed_blocks().len(), 1);
    assert!(document
        .diagnostics()
        .iter()
        .any(|diagnostic| diagnostic.code == DiagnosticCode::MISSING_FIELD_SEPARATOR));
    assert!(document
        .diagnostics()
        .iter()
        .all(|diagnostic| diagnostic.source.is_some()));
}

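// Strict, tolerant, raw-preserving, and streaming parses should all report
// the same (type, key, field count) triples for well-formed input.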
#[test]
fn parse_modes_agree_on_successful_entries() {
    let input = r#"% comment
@article{a, title = "A", year = 2024}
@book{b, title = {B}, year = 2025}"#;

    let strict = Parser::new().parse(input).unwrap();
    let tolerant = Parser::new().tolerant().parse_document(input).unwrap();
    let preserved = Parser::new().preserve_raw().parse_document(input).unwrap();
    let mut streamed = Vec::new();
    Parser::new()
        .parse_events(input, |event| {
            if let ParseEvent::Entry(entry) = event {
                streamed.push((entry.ty, entry.key.into_owned(), entry.fields.len()));
            }
            Ok(ParseFlow::Continue)
        })
        .unwrap();

    let strict_entries = strict
        .entries()
        .iter()
        .map(|entry| {
            (
                entry.ty.clone(),
                entry.key().to_string(),
                entry.fields().len(),
            )
        })
        .collect::<Vec<_>>();
    let tolerant_entries = tolerant
        .entries()
        .iter()
        .map(|entry| {
            (
                entry.ty.clone(),
                entry.key().to_string(),
                entry.fields.len(),
            )
        })
        .collect::<Vec<_>>();
    let preserved_entries = preserved
        .entries()
        .iter()
        .map(|entry| {
            (
                entry.ty.clone(),
                entry.key().to_string(),
                entry.fields.len(),
            )
        })
        .collect::<Vec<_>>();

    assert_eq!(strict_entries, tolerant_entries);
    assert_eq!(strict_entries, preserved_entries);
    assert_eq!(strict_entries, streamed);
}

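// Round trip: parse a complex fixture with raw preservation, serialize it,
// reparse, and check that every item count survives unchanged.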
#[test]
fn source_preserving_parse_write_parse_keeps_representative_counts() {
    let input = include_str!("fixtures/complex.bib");
    let document = Parser::new().preserve_raw().parse_document(input).unwrap();
    let output = document_to_string(&document).unwrap();
    let reparsed = Parser::new()
        .preserve_raw()
        .parse_document(&output)
        .unwrap();

    assert_eq!(reparsed.entries().len(), document.entries().len());
    assert_eq!(reparsed.strings().len(), document.strings().len());
    assert_eq!(reparsed.preambles().len(), document.preambles().len());
    assert_eq!(reparsed.comments().len(), document.comments().len());
}

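// A strict parse, a streaming parse, and a two-source corpus parse should
// all see the same 2,000 synthetic entries, with no duplicate keys.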
#[test]
fn practical_large_file_streaming_and_corpus_regression() {
    let input = synthetic_entries(2_000);
    let strict = Parser::new().parse(&input).unwrap();
    let mut streamed = 0usize;
    Parser::new()
        .parse_events(&input, |event| {
            if matches!(event, ParseEvent::Entry(_)) {
                streamed += 1;
            }
            Ok(ParseFlow::Continue)
        })
        .unwrap();

    let first_half = synthetic_entries(1_000);
    let second_half = synthetic_entries_from(1_000, 1_000);
    let sources = [
        CorpusSource::new("a.bib", &first_half),
        CorpusSource::new("b.bib", &second_half),
    ];
    let corpus = Parser::new().parse_sources(&sources).unwrap();

    assert_eq!(strict.entries().len(), 2_000);
    assert_eq!(streamed, 2_000);
    assert_eq!(corpus.entries().count(), 2_000);
    assert!(corpus.duplicate_keys().is_empty());
}

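// A 2,000-name author list should parse with every name individually
// addressable by position.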
#[test]
fn huge_author_list_regression() {
    let input = format!(
        "@article{{many, author = \"{}\", title = \"Many\", year = 2026}}",
        author_list(2_000)
    );
    let library = Parser::new().parse(&input).unwrap();

    let authors = library.entries()[0].authors();
    assert_eq!(authors.len(), 2_000);
    assert_eq!(authors[0].display_name(), "Given0 Family0");
    assert_eq!(authors[1_999].display_name(), "Given1999 Family1999");
}

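// Ignored by default; run with `cargo test -- --ignored` before a release.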
#[test]
#[ignore = "release stress: set aside for explicit pre-release scale verification"]
fn release_stress_hundred_thousand_entries() {
    let input = synthetic_entries(100_000);
    let library = Parser::new().parse(&input).unwrap();
    assert_eq!(library.entries().len(), 100_000);
}

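// Million-entry counterpart exercising the streaming API; likewise ignored
// by default.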
#[test]
#[ignore = "release stress: set aside for explicit pre-release scale verification"]
fn release_stress_million_entry_streaming() {
    let input = synthetic_entries(1_000_000);
    let mut entries = 0usize;
    Parser::new()
        .parse_events(&input, |event| {
            if matches!(event, ParseEvent::Entry(_)) {
                entries += 1;
            }
            Ok(ParseFlow::Continue)
        })
        .unwrap();
    assert_eq!(entries, 1_000_000);
}

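// Builds `count` minimal @article entries with sequential keys starting at 0.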
fn synthetic_entries(count: usize) -> String {
    synthetic_entries_from(0, count)
}

fn synthetic_entries_from(start: usize, count: usize) -> String {
    let mut input = String::with_capacity(count * 56);
    for index in start..start + count {
        input.push_str("@article{key");
        input.push_str(&index.to_string());
        input.push_str(", title = \"Title ");
        input.push_str(&index.to_string());
        input.push_str("\", year = 2026}\n");
    }
    input
}

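// Builds an "and"-separated author list: "Given0 Family0 and Given1 Family1 ...".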
fn author_list(count: usize) -> String {
    let mut authors = String::with_capacity(count * 24);
    for index in 0..count {
        if index > 0 {
            authors.push_str(" and ");
        }
        authors.push_str("Given");
        authors.push_str(&index.to_string());
        authors.push_str(" Family");
        authors.push_str(&index.to_string());
    }
    authors
}