Skip to main content

bibtex_parser/
corpus.rs

1//! Corpus-level parsed bibliography model.
2
3use crate::{
4    Diagnostic, ParseEvent, ParseStatus, ParsedDocument, ParsedEntry, ParsedSource, SourceId,
5    SourceSpan,
6};
7use std::borrow::Cow;
8use std::collections::BTreeMap;
9
/// A named piece of BibTeX text supplied as input to corpus parsing.
///
/// Both fields are borrowed string slices, so a `CorpusSource` is `Copy` and never owns
/// the text it refers to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct CorpusSource<'a> {
    /// Name shown for this source; may be a file path.
    pub name: &'a str,
    /// The BibTeX text itself.
    pub input: &'a str,
}
18
19/// Corpus-level streaming event.
20#[derive(Debug, Clone, PartialEq)]
21pub enum CorpusEvent<'a> {
22    /// A source is about to be parsed.
23    SourceStart(ParsedSource<'a>),
24    /// A source-order parse event from one source.
25    Event {
26        /// Corpus-wide source id.
27        source: SourceId,
28        /// Parsed source event.
29        event: Box<ParseEvent<'a>>,
30    },
31    /// A source finished parsing or was stopped.
32    SourceEnd(ParsedSource<'a>),
33}
34
35impl<'a> CorpusSource<'a> {
36    /// Create a corpus source from a name and input string.
37    #[must_use]
38    pub const fn new(name: &'a str, input: &'a str) -> Self {
39        Self { name, input }
40    }
41}
42
43/// Parsed multi-source bibliography corpus.
44#[derive(Debug, Clone)]
45pub struct ParsedCorpus<'a> {
46    documents: Vec<ParsedDocument<'a>>,
47    sources: Vec<ParsedSource<'a>>,
48    duplicate_keys: Vec<DuplicateKeyGroup>,
49    status: ParseStatus,
50}
51
52impl<'a> ParsedCorpus<'a> {
53    pub(crate) fn from_documents(documents: Vec<ParsedDocument<'a>>) -> Self {
54        let sources = documents
55            .iter()
56            .flat_map(|document| document.sources().iter().cloned())
57            .collect::<Vec<_>>();
58        let duplicate_keys = find_duplicate_keys(&documents);
59        let status = corpus_status(&documents);
60
61        Self {
62            documents,
63            sources,
64            duplicate_keys,
65            status,
66        }
67    }
68
69    /// Return parsed documents in corpus input order.
70    #[must_use]
71    pub fn documents(&self) -> &[ParsedDocument<'a>] {
72        &self.documents
73    }
74
75    /// Return corpus sources in input order.
76    #[must_use]
77    pub fn sources(&self) -> &[ParsedSource<'a>] {
78        &self.sources
79    }
80
81    /// Return a source by corpus-wide source id.
82    #[must_use]
83    pub fn source(&self, id: SourceId) -> Option<&ParsedSource<'a>> {
84        self.sources.iter().find(|source| source.id == id)
85    }
86
87    /// Iterate entries across all documents in corpus order.
88    pub fn entries(&self) -> impl Iterator<Item = &ParsedEntry<'a>> + '_ {
89        self.documents
90            .iter()
91            .flat_map(|document| document.entries().iter())
92    }
93
94    /// Iterate diagnostics across all documents in corpus order.
95    pub fn diagnostics(&self) -> impl Iterator<Item = &Diagnostic> + '_ {
96        self.documents
97            .iter()
98            .flat_map(|document| document.diagnostics().iter())
99    }
100
101    /// Return duplicate citation key groups.
102    #[must_use]
103    pub fn duplicate_keys(&self) -> &[DuplicateKeyGroup] {
104        &self.duplicate_keys
105    }
106
107    /// Return aggregate corpus parse status.
108    #[must_use]
109    pub const fn status(&self) -> ParseStatus {
110        self.status
111    }
112}
113
114/// Duplicate citation key group with source provenance.
115#[derive(Debug, Clone, PartialEq, Eq)]
116pub struct DuplicateKeyGroup {
117    /// Citation key text.
118    pub key: String,
119    /// Occurrences in corpus order.
120    pub occurrences: Vec<DuplicateKeyOccurrence>,
121    /// `true` when occurrences come from more than one source.
122    pub cross_source: bool,
123}
124
125impl DuplicateKeyGroup {
126    /// Return `true` when every occurrence is in the same source.
127    #[must_use]
128    pub const fn is_same_source(&self) -> bool {
129        !self.cross_source
130    }
131}
132
133/// One duplicate citation key occurrence.
134#[derive(Debug, Clone, PartialEq, Eq)]
135pub struct DuplicateKeyOccurrence {
136    /// Corpus-wide source id.
137    pub source: SourceId,
138    /// Source name or path, when available.
139    pub source_name: Option<String>,
140    /// Document index inside the corpus.
141    pub document_index: usize,
142    /// Entry index inside that parsed document.
143    pub entry_index: usize,
144    /// Key token location, when available.
145    pub key_source: Option<SourceSpan>,
146}
147
148fn find_duplicate_keys(documents: &[ParsedDocument<'_>]) -> Vec<DuplicateKeyGroup> {
149    let mut groups: BTreeMap<String, Vec<DuplicateKeyOccurrence>> = BTreeMap::new();
150
151    for (document_index, document) in documents.iter().enumerate() {
152        for (entry_index, entry) in document.entries().iter().enumerate() {
153            let source = entry
154                .source
155                .and_then(|span| span.source)
156                .unwrap_or_else(|| SourceId::new(document_index));
157            let source_name = document
158                .sources()
159                .iter()
160                .find(|parsed_source| parsed_source.id == source)
161                .and_then(|parsed_source| parsed_source.name.as_ref())
162                .map(Cow::as_ref)
163                .map(ToOwned::to_owned);
164
165            groups
166                .entry(entry.key().to_string())
167                .or_default()
168                .push(DuplicateKeyOccurrence {
169                    source,
170                    source_name,
171                    document_index,
172                    entry_index,
173                    key_source: entry.key_source,
174                });
175        }
176    }
177
178    groups
179        .into_iter()
180        .filter_map(|(key, occurrences)| {
181            if occurrences.len() < 2 {
182                return None;
183            }
184            let first_source = occurrences[0].source;
185            let cross_source = occurrences
186                .iter()
187                .any(|occurrence| occurrence.source != first_source);
188            Some(DuplicateKeyGroup {
189                key,
190                occurrences,
191                cross_source,
192            })
193        })
194        .collect()
195}
196
197fn corpus_status(documents: &[ParsedDocument<'_>]) -> ParseStatus {
198    let has_content = documents.iter().any(|document| {
199        !document.entries().is_empty()
200            || !document.strings().is_empty()
201            || !document.preambles().is_empty()
202    });
203    let has_problem = documents
204        .iter()
205        .any(|document| document.status() != ParseStatus::Ok);
206
207    if !has_problem {
208        ParseStatus::Ok
209    } else if has_content {
210        ParseStatus::Partial
211    } else {
212        ParseStatus::Failed
213    }
214}