Skip to main content

citum_engine/processor/document/
markdown.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! Markdown document parsing for Pandoc-style citations.
7
8use super::djot::parsing::parse_frontmatter;
9use super::{CitationParser, CitationPlacement, CitationStructure, ParsedCitation, ParsedDocument};
10use crate::{Citation, CitationItem};
11use citum_schema::citation::{CitationMode, normalize_locator_text};
12use citum_schema::locale::Locale;
13use std::collections::HashSet;
14
/// A parser for Markdown documents with Pandoc-style citation syntax.
///
/// This parser currently supports inline prose citations and maps them into the
/// shared document-processing pipeline. Markdown-specific footnotes, document
/// metadata, and inline bibliography blocks remain future work.
///
/// The type is a stateless unit struct, so `Default`, `Clone`, `Copy`, and
/// `Debug` are derived rather than written by hand.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParser;
27
28impl CitationParser for MarkdownParser {
29    fn parse_document(&self, content: &str, locale: &Locale) -> ParsedDocument {
30        let (frontmatter_result, body) = parse_frontmatter(content);
31        let body_start = content.len() - body.len();
32        let (frontmatter, frontmatter_error) = match frontmatter_result {
33            Ok(fm) => (fm, None),
34            Err(e) => (None, Some(e)),
35        };
36        let frontmatter_options = frontmatter.as_ref().and_then(|fm| fm.options.clone());
37        // Legacy top-level fields are superseded by their `options.*` counterparts.
38        let frontmatter_integral_name_memory = frontmatter
39            .as_ref()
40            .and_then(|fm| fm.integral_name_memory.clone())
41            .filter(|_| {
42                frontmatter_options
43                    .as_ref()
44                    .and_then(|o| o.integral_name_memory.as_ref())
45                    .is_none()
46            });
47        let frontmatter_org_abbreviation_memory = frontmatter
48            .and_then(|fm| fm.org_abbreviation_memory)
49            .filter(|_| {
50                frontmatter_options
51                    .as_ref()
52                    .and_then(|o| o.org_abbreviation_memory.as_ref())
53                    .is_none()
54            });
55
56        let citations = find_citations(body, locale)
57            .into_iter()
58            .map(|(start, end, citation)| ParsedCitation {
59                start: body_start + start,
60                end: body_start + end,
61                citation,
62                placement: CitationPlacement::InlineProse,
63                structure: CitationStructure::default(),
64            })
65            .collect();
66
67        ParsedDocument {
68            citations,
69            manual_note_order: Vec::new(),
70            manual_note_references: Vec::new(),
71            manual_note_labels: HashSet::new(),
72            bibliography_blocks: Vec::new(),
73            frontmatter_groups: None,
74            frontmatter_integral_name_memory,
75            frontmatter_org_abbreviation_memory,
76            frontmatter_options,
77            frontmatter_error,
78            body_start,
79        }
80    }
81}
82
83#[allow(
84    clippy::string_slice,
85    clippy::unreachable,
86    reason = "Markdown scanning logic"
87)]
88fn find_citations(content: &str, locale: &Locale) -> Vec<(usize, usize, Citation)> {
89    let mut results = Vec::new();
90    let mut offset = 0;
91
92    while offset < content.len() {
93        let remaining = &content[offset..];
94        let next_at = remaining.find('@');
95        let next_bracket = remaining.find('[');
96
97        let (relative_start, kind) = match (next_at, next_bracket) {
98            (Some(at), Some(bracket)) if bracket <= at => (bracket, ScanKind::Bracket),
99            (Some(at), Some(bracket)) if at < bracket => (at, ScanKind::Textual),
100            (Some(at), None) => (at, ScanKind::Textual),
101            (None, Some(bracket)) => (bracket, ScanKind::Bracket),
102            (None, None) => break,
103            _ => unreachable!(),
104        };
105
106        let start = offset + relative_start;
107        let candidate = &content[start..];
108
109        let parsed = match kind {
110            ScanKind::Bracket => parse_bracketed_citation(candidate, locale),
111            ScanKind::Textual => parse_textual_citation(content, start, locale),
112        };
113
114        if let Some((consumed, citation)) = parsed {
115            results.push((start, start + consumed, citation));
116            offset = start + consumed;
117        } else if matches!(kind, ScanKind::Bracket) {
118            offset = start + candidate.find(']').map_or(1, |idx| idx + 1);
119        } else {
120            offset = start + 1;
121        }
122    }
123
124    results
125}
126
/// Which kind of citation candidate the scanner is positioned on.
#[derive(Clone, Copy, Debug)]
enum ScanKind {
    /// The candidate starts at `[` and is tried as a bracketed citation.
    Bracket,
    /// The candidate starts at `@` and is tried as a textual citation.
    Textual,
}
132
133#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
134fn parse_bracketed_citation(input: &str, locale: &Locale) -> Option<(usize, Citation)> {
135    if !input.starts_with('[') {
136        return None;
137    }
138
139    let closing = input.find(']')?;
140    let inner = input[1..closing].trim();
141    if inner.is_empty() || !inner.contains('@') {
142        return None;
143    }
144
145    let mut items = Vec::new();
146    let mut suppress_author = None;
147
148    for segment in inner.split(';') {
149        let (item, suppress) = parse_bracketed_item(segment, locale)?;
150        if let Some(existing) = suppress_author {
151            if existing != suppress {
152                return None;
153            }
154        } else {
155            suppress_author = Some(suppress);
156        }
157        items.push(item);
158    }
159
160    Some((
161        closing + 1,
162        Citation {
163            items,
164            suppress_author: suppress_author.unwrap_or(false),
165            ..Default::default()
166        },
167    ))
168}
169
170#[allow(
171    clippy::string_slice,
172    clippy::indexing_slicing,
173    reason = "Citations are ASCII-heavy; indices from find() are on char boundaries"
174)]
175fn parse_bracketed_item(segment: &str, locale: &Locale) -> Option<(CitationItem, bool)> {
176    let segment = segment.trim();
177    let at_pos = segment.find('@')?;
178    let mut suppress_author = false;
179    let prefix_end = if at_pos > 0 && segment.as_bytes()[at_pos - 1] == b'-' {
180        suppress_author = true;
181        at_pos - 1
182    } else {
183        at_pos
184    };
185
186    let prefix = normalize_prefix(&segment[..prefix_end]);
187    let after_at = &segment[at_pos + 1..];
188    let key_end = cite_key_len(after_at)?;
189    let key = &after_at[..key_end];
190    let remainder = after_at[key_end..].trim_start();
191
192    let mut item = CitationItem {
193        id: key.to_string(),
194        prefix,
195        ..Default::default()
196    };
197
198    if let Some(rest) = remainder.strip_prefix(',') {
199        let rest = rest.trim();
200        if !rest.is_empty() {
201            item.locator = normalize_locator_text(rest, locale);
202            if item.locator.is_none() {
203                item.suffix = Some(rest.to_string());
204            }
205        }
206    } else if !remainder.is_empty() {
207        item.suffix = Some(remainder.trim().to_string());
208    }
209
210    Some((item, suppress_author))
211}
212
213#[allow(clippy::string_slice, reason = "@ and indices from find() are safe")]
214fn parse_textual_citation(
215    content: &str,
216    start: usize,
217    locale: &Locale,
218) -> Option<(usize, Citation)> {
219    if !is_valid_textual_start(content, start) {
220        return None;
221    }
222
223    let after_at = &content[start + 1..];
224    let key_end = cite_key_len(after_at)?;
225    let key = &after_at[..key_end];
226    let mut consumed = 1 + key_end;
227
228    let mut item = CitationItem {
229        id: key.to_string(),
230        ..Default::default()
231    };
232
233    let trailing = &content[start + consumed..];
234    if let Some((locator_consumed, locator)) = parse_textual_locator_suffix(trailing, locale) {
235        item.locator = Some(locator);
236        consumed += locator_consumed;
237    }
238
239    Some((
240        consumed,
241        Citation {
242            mode: CitationMode::Integral,
243            items: vec![item],
244            ..Default::default()
245        },
246    ))
247}
248
249#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
250fn parse_textual_locator_suffix(
251    input: &str,
252    locale: &Locale,
253) -> Option<(usize, citum_schema::citation::CitationLocator)> {
254    let whitespace_len = input.len() - input.trim_start_matches(char::is_whitespace).len();
255    let rest = &input[whitespace_len..];
256    if !rest.starts_with('[') {
257        return None;
258    }
259
260    let closing = rest.find(']')?;
261    let inner = rest[1..closing].trim();
262    if inner.is_empty() || inner.contains('@') {
263        return None;
264    }
265
266    let locator = normalize_locator_text(inner, locale)?;
267    Some((whitespace_len + closing + 1, locator))
268}
269
/// Returns the byte length of the citation key at the start of `input`.
///
/// A key starts with an ASCII letter, digit, or `_`, and continues with those
/// characters plus the punctuation `-`, `:`, and `.`. Punctuation is only
/// accepted *inside* a key: trailing punctuation is left out so that
/// `kuhn1962.` at the end of a sentence yields the key `kuhn1962`, not
/// `kuhn1962.`.
///
/// Returns `None` when `input` does not start with a valid key character.
fn cite_key_len(input: &str) -> Option<usize> {
    // Characters allowed anywhere in a key, including first and last position.
    fn is_word(ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }
    // Punctuation allowed only between word characters.
    fn is_internal_punct(ch: char) -> bool {
        matches!(ch, '-' | ':' | '.')
    }

    if !input.starts_with(is_word) {
        return None;
    }

    // Walk the maximal run of key characters, but end the key at the last
    // word character so trailing punctuation is excluded.
    input
        .char_indices()
        .take_while(|&(_, ch)| is_word(ch) || is_internal_punct(ch))
        .filter(|&(_, ch)| is_word(ch))
        .map(|(idx, ch)| idx + ch.len_utf8())
        .last()
}
282
/// Normalizes the text that precedes a citation key.
///
/// Surrounding whitespace is removed and a single trailing space is appended.
/// Returns `None` when nothing but whitespace precedes the key.
fn normalize_prefix(prefix: &str) -> Option<String> {
    match prefix.trim() {
        "" => None,
        text => Some(format!("{text} ")),
    }
}
291
/// Reports whether the `@` at byte offset `start` may begin a textual citation.
///
/// The `@` is rejected when the character immediately before it is
/// alphanumeric or one of `_`, `-`, `.`, `/`, `@`, which keeps things like
/// e-mail addresses from being read as citations. An `@` at the very start of
/// `content` is always accepted.
#[allow(clippy::string_slice, reason = "start index from find() is safe")]
fn is_valid_textual_start(content: &str, start: usize) -> bool {
    let glued_to_previous = content[..start]
        .ends_with(|ch: char| ch.is_alphanumeric() || matches!(ch, '_' | '-' | '.' | '/' | '@'));
    !glued_to_previous
}
297
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;
    use citum_schema::citation::{CitationLocator, LocatorType};

    // A bracket with two `;`-separated keys yields one citation with two
    // items, and the locator attaches to the second item only.
    #[test]
    fn test_parse_bracketed_multi_cite() {
        let locale = Locale::en_us();
        let found = MarkdownParser.parse_citations("See [@kuhn1962; @watson1953, ch. 2].", &locale);

        assert_eq!(found.len(), 1);
        let (_, _, cite) = &found[0];
        assert_eq!(cite.items.len(), 2);
        assert_eq!(cite.items[0].id, "kuhn1962");
        let expected = Some(CitationLocator::single(LocatorType::Chapter, "2"));
        assert_eq!(cite.items[1].locator, expected);
    }

    // Text before `-@` becomes the prefix and the dash suppresses the author.
    #[test]
    fn test_parse_bracketed_prefix_and_suppress_author() {
        let locale = Locale::en_us();
        let found = MarkdownParser.parse_citations("[see -@kuhn1962, p. 10]", &locale);

        assert_eq!(found.len(), 1);
        let (_, _, cite) = &found[0];
        assert!(cite.suppress_author);
        assert_eq!(cite.items[0].prefix.as_deref(), Some("see "));
        let expected = Some(CitationLocator::single(LocatorType::Page, "10"));
        assert_eq!(cite.items[0].locator, expected);
    }

    // A bare `@key` in running text is an integral citation.
    #[test]
    fn test_parse_textual_citation() {
        let locale = Locale::en_us();
        let text = "Kuhn argued that @kuhn1962 changed science.";
        let found = MarkdownParser.parse_citations(text, &locale);

        assert_eq!(found.len(), 1);
        let (_, _, cite) = &found[0];
        assert_eq!(cite.mode, CitationMode::Integral);
        assert_eq!(cite.items[0].id, "kuhn1962");
    }

    // A bracket right after a textual key is read as that key's locator.
    #[test]
    fn test_parse_textual_citation_with_locator_suffix() {
        let locale = Locale::en_us();
        let text = "@kuhn1962 [p. 10] argues this point.";
        let found = MarkdownParser.parse_citations(text, &locale);

        assert_eq!(found.len(), 1);
        let (_, _, cite) = &found[0];
        assert_eq!(cite.mode, CitationMode::Integral);
        let expected = Some(CitationLocator::single(LocatorType::Page, "10"));
        assert_eq!(cite.items[0].locator, expected);
    }

    // `parse_document` tags citations as inline prose and leaves the
    // note/bibliography collections empty.
    #[test]
    fn test_parse_document_marks_citations_as_inline_prose() {
        let locale = Locale::en_us();
        let document = MarkdownParser.parse_document("Text [@kuhn1962].", &locale);

        assert_eq!(document.citations.len(), 1);
        assert_eq!(
            document.citations[0].placement,
            CitationPlacement::InlineProse
        );
        assert!(document.manual_note_order.is_empty());
        assert!(document.bibliography_blocks.is_empty());
    }

    // An `@` glued to preceding word characters is not a citation.
    #[test]
    fn test_does_not_parse_email_address() {
        let locale = Locale::en_us();
        let text = "Contact test@example.com for details.";
        let found = MarkdownParser.parse_citations(text, &locale);

        assert!(found.is_empty());
    }

    // Mixed author suppression rejects the bracket, and the keys inside it
    // must not be re-read as textual citations.
    #[test]
    fn test_unsupported_bracket_cluster_does_not_fall_back_to_textual_citations() {
        let locale = Locale::en_us();
        let text = "Mixed [@kuhn1962; -@watson1953] cluster.";
        let found = MarkdownParser.parse_citations(text, &locale);

        assert!(found.is_empty());
    }

    #[test]
    fn test_markdown_finalize_html_output_is_passthrough() {
        // MarkdownParser does not perform any markup-to-HTML conversion; the
        // caller is responsible for rendering CommonMark. The trait default
        // returns the input unchanged.
        let source = "**bold** and _em_ and [@key].";
        let rendered = MarkdownParser.finalize_html_output(source);
        assert_eq!(rendered, source);
    }
}