Skip to main content

citum_engine/processor/document/
markdown.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! Markdown document parsing for Pandoc-style citations.
7
8use super::djot::parsing::parse_frontmatter;
9use super::{CitationParser, CitationPlacement, CitationStructure, ParsedCitation, ParsedDocument};
10use crate::{Citation, CitationItem};
11use citum_schema::citation::{CitationMode, normalize_locator_text};
12use citum_schema::locale::Locale;
13use std::collections::HashSet;
14
/// A parser for Markdown documents with Pandoc-style citation syntax.
///
/// This parser currently supports inline prose citations and maps them into the
/// shared document-processing pipeline. Markdown-specific footnotes, document
/// metadata, and inline bibliography blocks remain future work.
///
/// The parser is a stateless unit struct, so `MarkdownParser`,
/// `MarkdownParser::default()` and `Default::default()` are interchangeable.
#[derive(Default)]
pub struct MarkdownParser;
27
28impl CitationParser for MarkdownParser {
29    /// Convert Markdown body markup to HTML after citation splicing.
30    ///
31    /// NUL placeholder tokens (`\x00CITUMHTML…TOKEN…\x00`) are temporarily
32    /// re-encoded as HTML comments before the Markdown parser runs, because
33    /// pulldown-cmark normalises `\x00` to U+FFFD. The comments survive the
34    /// conversion verbatim and are swapped back so that the caller's
35    /// `HtmlPlaceholderRegistry::apply()` can still locate them.
36    fn finalize_html_output(&self, rendered: &str) -> String {
37        use pulldown_cmark::{Options, html};
38
39        let (remapped, token_map) = remap_nul_tokens(rendered);
40        let parser = pulldown_cmark::Parser::new_ext(&remapped, Options::ENABLE_STRIKETHROUGH);
41        let mut out = String::new();
42        html::push_html(&mut out, parser);
43
44        // Restore original NUL tokens so HtmlPlaceholderRegistry::apply() works.
45        for (comment, original) in token_map {
46            out = out.replace(&comment, &original);
47        }
48        out
49    }
50
51    /// Convert Markdown body markup to the target terminal format (Typst, LaTeX)
52    /// after citation placeholder tokens have been spliced in.
53    fn render_body_markup<F>(&self, body: &str, fmt: &F) -> String
54    where
55        F: crate::render::format::OutputFormat<Output = String>,
56    {
57        crate::render::markup::render_markdown_body(body, fmt)
58    }
59
60    fn parse_document(&self, content: &str, locale: &Locale) -> ParsedDocument {
61        let (frontmatter_result, body) = parse_frontmatter(content);
62        let body_start = content.len() - body.len();
63        let (frontmatter, frontmatter_error) = match frontmatter_result {
64            Ok(fm) => (fm, None),
65            Err(e) => (None, Some(e)),
66        };
67        let frontmatter_options = frontmatter.as_ref().and_then(|fm| fm.options.clone());
68        // Legacy top-level fields are superseded by their `options.*` counterparts.
69        let frontmatter_integral_name_memory = frontmatter
70            .as_ref()
71            .and_then(|fm| fm.integral_name_memory.clone())
72            .filter(|_| {
73                frontmatter_options
74                    .as_ref()
75                    .and_then(|o| o.integral_name_memory.as_ref())
76                    .is_none()
77            });
78        let frontmatter_org_abbreviation_memory = frontmatter
79            .and_then(|fm| fm.org_abbreviation_memory)
80            .filter(|_| {
81                frontmatter_options
82                    .as_ref()
83                    .and_then(|o| o.org_abbreviation_memory.as_ref())
84                    .is_none()
85            });
86
87        let citations = find_citations(body, locale)
88            .into_iter()
89            .map(|(start, end, citation)| ParsedCitation {
90                start: body_start + start,
91                end: body_start + end,
92                citation,
93                placement: CitationPlacement::InlineProse,
94                structure: CitationStructure::default(),
95            })
96            .collect();
97
98        ParsedDocument {
99            citations,
100            manual_note_order: Vec::new(),
101            manual_note_references: Vec::new(),
102            manual_note_labels: HashSet::new(),
103            bibliography_blocks: Vec::new(),
104            frontmatter_groups: None,
105            frontmatter_integral_name_memory,
106            frontmatter_org_abbreviation_memory,
107            frontmatter_options,
108            frontmatter_error,
109            body_start,
110        }
111    }
112}
113
114#[allow(
115    clippy::string_slice,
116    clippy::unreachable,
117    reason = "Markdown scanning logic"
118)]
119fn find_citations(content: &str, locale: &Locale) -> Vec<(usize, usize, Citation)> {
120    let mut results = Vec::new();
121    let mut offset = 0;
122
123    while offset < content.len() {
124        let remaining = &content[offset..];
125        let next_at = remaining.find('@');
126        let next_bracket = remaining.find('[');
127
128        let (relative_start, kind) = match (next_at, next_bracket) {
129            (Some(at), Some(bracket)) if bracket <= at => (bracket, ScanKind::Bracket),
130            (Some(at), Some(bracket)) if at < bracket => (at, ScanKind::Textual),
131            (Some(at), None) => (at, ScanKind::Textual),
132            (None, Some(bracket)) => (bracket, ScanKind::Bracket),
133            (None, None) => break,
134            _ => unreachable!(),
135        };
136
137        let start = offset + relative_start;
138        let candidate = &content[start..];
139
140        let parsed = match kind {
141            ScanKind::Bracket => parse_bracketed_citation(candidate, locale),
142            ScanKind::Textual => parse_textual_citation(content, start, locale),
143        };
144
145        if let Some((consumed, citation)) = parsed {
146            results.push((start, start + consumed, citation));
147            offset = start + consumed;
148        } else if matches!(kind, ScanKind::Bracket) {
149            offset = start + candidate.find(']').map_or(1, |idx| idx + 1);
150        } else {
151            offset = start + 1;
152        }
153    }
154
155    results
156}
157
/// Which citation form the scanner should attempt at the current position.
#[derive(Debug, Clone, Copy)]
enum ScanKind {
    /// The next candidate starts at `[` and is tried as a bracketed citation.
    Bracket,
    /// The next candidate starts at `@` and is tried as a textual citation.
    Textual,
}
163
164#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
165fn parse_bracketed_citation(input: &str, locale: &Locale) -> Option<(usize, Citation)> {
166    if !input.starts_with('[') {
167        return None;
168    }
169
170    let closing = input.find(']')?;
171    let inner = input[1..closing].trim();
172    if inner.is_empty() || !inner.contains('@') {
173        return None;
174    }
175
176    let mut items = Vec::new();
177    let mut suppress_author = None;
178
179    for segment in inner.split(';') {
180        let (item, suppress) = parse_bracketed_item(segment, locale)?;
181        if let Some(existing) = suppress_author {
182            if existing != suppress {
183                return None;
184            }
185        } else {
186            suppress_author = Some(suppress);
187        }
188        items.push(item);
189    }
190
191    Some((
192        closing + 1,
193        Citation {
194            items,
195            suppress_author: suppress_author.unwrap_or(false),
196            ..Default::default()
197        },
198    ))
199}
200
201#[allow(
202    clippy::string_slice,
203    clippy::indexing_slicing,
204    reason = "Citations are ASCII-heavy; indices from find() are on char boundaries"
205)]
206fn parse_bracketed_item(segment: &str, locale: &Locale) -> Option<(CitationItem, bool)> {
207    let segment = segment.trim();
208    let at_pos = segment.find('@')?;
209    let mut suppress_author = false;
210    let prefix_end = if at_pos > 0 && segment.as_bytes()[at_pos - 1] == b'-' {
211        suppress_author = true;
212        at_pos - 1
213    } else {
214        at_pos
215    };
216
217    let prefix = normalize_prefix(&segment[..prefix_end]);
218    let after_at = &segment[at_pos + 1..];
219    let key_end = cite_key_len(after_at)?;
220    let key = &after_at[..key_end];
221    let remainder = after_at[key_end..].trim_start();
222
223    let mut item = CitationItem {
224        id: key.to_string(),
225        prefix,
226        ..Default::default()
227    };
228
229    if let Some(rest) = remainder.strip_prefix(',') {
230        let rest = rest.trim();
231        if !rest.is_empty() {
232            item.locator = normalize_locator_text(rest, locale);
233            if item.locator.is_none() {
234                item.suffix = Some(rest.to_string());
235            }
236        }
237    } else if !remainder.is_empty() {
238        item.suffix = Some(remainder.trim().to_string());
239    }
240
241    Some((item, suppress_author))
242}
243
244#[allow(clippy::string_slice, reason = "@ and indices from find() are safe")]
245fn parse_textual_citation(
246    content: &str,
247    start: usize,
248    locale: &Locale,
249) -> Option<(usize, Citation)> {
250    if !is_valid_textual_start(content, start) {
251        return None;
252    }
253
254    let after_at = &content[start + 1..];
255    let key_end = cite_key_len(after_at)?;
256    let key = &after_at[..key_end];
257    let mut consumed = 1 + key_end;
258
259    let mut item = CitationItem {
260        id: key.to_string(),
261        ..Default::default()
262    };
263
264    let trailing = &content[start + consumed..];
265    if let Some((locator_consumed, locator)) = parse_textual_locator_suffix(trailing, locale) {
266        item.locator = Some(locator);
267        consumed += locator_consumed;
268    }
269
270    Some((
271        consumed,
272        Citation {
273            mode: CitationMode::Integral,
274            items: vec![item],
275            ..Default::default()
276        },
277    ))
278}
279
280#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
281fn parse_textual_locator_suffix(
282    input: &str,
283    locale: &Locale,
284) -> Option<(usize, citum_schema::citation::CitationLocator)> {
285    let whitespace_len = input.len() - input.trim_start_matches(char::is_whitespace).len();
286    let rest = &input[whitespace_len..];
287    if !rest.starts_with('[') {
288        return None;
289    }
290
291    let closing = rest.find(']')?;
292    let inner = rest[1..closing].trim();
293    if inner.is_empty() || inner.contains('@') {
294        return None;
295    }
296
297    let locator = normalize_locator_text(inner, locale)?;
298    Some((whitespace_len + closing + 1, locator))
299}
300
/// Return the byte length of the cite key at the start of `input`.
///
/// A key is made of ASCII letters, digits and `_`, with `-`, `:` and `.`
/// allowed only *inside* the key. Following Pandoc's citation-key rules, the
/// key must start with a letter, digit or `_`, and trailing punctuation is not
/// part of it. This keeps sentence punctuation out of the id: in
/// `See @kuhn1962.` the key is `kuhn1962`, not `kuhn1962.`.
///
/// Returns `None` when `input` does not start with a valid key.
fn cite_key_len(input: &str) -> Option<usize> {
    // Byte offset just past the last letter/digit/underscore seen so far.
    let mut end = 0;

    for (idx, ch) in input.char_indices() {
        match ch {
            'A'..='Z' | 'a'..='z' | '0'..='9' | '_' => end = idx + ch.len_utf8(),
            // Punctuation only counts once the key has started, and only
            // extends the key if a later word character follows it.
            '-' | ':' | '.' if end > 0 => {}
            _ => break,
        }
    }

    (end > 0).then_some(end)
}
313
/// Normalise the text that precedes a cite key into a citation prefix.
///
/// Surrounding whitespace is stripped and a single trailing space is appended
/// so the prefix can be placed directly before the rendered citation. Blank
/// input yields `None`.
fn normalize_prefix(prefix: &str) -> Option<String> {
    match prefix.trim() {
        "" => None,
        text => Some(format!("{text} ")),
    }
}
322
/// Decide whether the `@` at byte index `start` may begin a textual citation.
///
/// The `@` is rejected when the character immediately before it is
/// alphanumeric or one of `_`, `-`, `.`, `/`, `@`, which is what an e-mail
/// address or handle looks like. An `@` at the very start of `content` is
/// always accepted.
#[allow(clippy::string_slice, reason = "start index from find() is safe")]
fn is_valid_textual_start(content: &str, start: usize) -> bool {
    let Some(previous) = content[..start].chars().next_back() else {
        return true;
    };
    let joins_word = previous.is_alphanumeric() || matches!(previous, '_' | '-' | '.' | '/' | '@');
    !joins_word
}
328
/// Re-encode NUL placeholder tokens as HTML comments and return a mapping.
///
/// pulldown-cmark normalises `\x00` to U+FFFD, which would corrupt the
/// `HtmlPlaceholderRegistry` tokens. Replacing them with HTML comments
/// (`<!--CITUM-TOKEN-N-->`) before parsing lets them pass through as
/// `InlineHtml` or `Html` events and survive the conversion intact.
/// The returned pairs map each comment back to the original token
/// (including its two NUL delimiters) so the caller can restore them after
/// `push_html` runs. `N` is the zero-based position of the token in `s`.
///
/// A NUL without a matching closing NUL does not form a token: it and all the
/// text after it are passed through unchanged rather than being discarded.
fn remap_nul_tokens(s: &str) -> (String, Vec<(String, String)>) {
    let mut result = String::with_capacity(s.len());
    let mut map: Vec<(String, String)> = Vec::new();
    let mut rest = s;

    while let Some((before, after_open)) = rest.split_once('\x00') {
        result.push_str(before);

        let Some((token_body, tail)) = after_open.split_once('\x00') else {
            // Unterminated token: keep the input lossless instead of dropping
            // everything after the stray NUL.
            result.push('\x00');
            result.push_str(after_open);
            return (result, map);
        };

        let comment = format!("<!--CITUM-TOKEN-{}-->", map.len());
        result.push_str(&comment);
        map.push((comment, format!("\x00{token_body}\x00")));
        rest = tail;
    }

    result.push_str(rest);
    (result, map)
}
364
// Unit tests for the Markdown citation scanner and the HTML finalisation pass.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;
    use citum_schema::citation::{CitationLocator, LocatorType};

    // A `;`-separated bracket yields one citation with two items; the text
    // after the comma on the second item is parsed as a chapter locator.
    #[test]
    fn test_parse_bracketed_multi_cite() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("See [@kuhn1962; @watson1953, ch. 2].", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.items.len(), 2);
        assert_eq!(citation.items[0].id, "kuhn1962");
        assert_eq!(
            citation.items[1].locator,
            Some(CitationLocator::single(LocatorType::Chapter, "2"))
        );
    }

    // Text before `-@key` becomes the prefix (with one trailing space) and the
    // `-` sets author suppression on the citation.
    #[test]
    fn test_parse_bracketed_prefix_and_suppress_author() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations("[see -@kuhn1962, p. 10]", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert!(citation.suppress_author);
        assert_eq!(citation.items[0].prefix.as_deref(), Some("see "));
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    // A bare `@key` in running prose is an integral-mode citation.
    #[test]
    fn test_parse_textual_citation() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations(
            "Kuhn argued that @kuhn1962 changed science.",
            &Locale::en_us(),
        );

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(citation.items[0].id, "kuhn1962");
    }

    // A bracketed locator right after a textual key is attached to that item.
    #[test]
    fn test_parse_textual_citation_with_locator_suffix() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("@kuhn1962 [p. 10] argues this point.", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    // `parse_document` tags every citation as inline prose and leaves the
    // note and bibliography-block collections empty.
    #[test]
    fn test_parse_document_marks_citations_as_inline_prose() {
        let parser = MarkdownParser;
        let parsed = parser.parse_document("Text [@kuhn1962].", &Locale::en_us());

        assert_eq!(parsed.citations.len(), 1);
        assert_eq!(
            parsed.citations[0].placement,
            CitationPlacement::InlineProse
        );
        assert!(parsed.manual_note_order.is_empty());
        assert!(parsed.bibliography_blocks.is_empty());
    }

    // An `@` preceded by a word character (as in an e-mail address) is not a
    // citation.
    #[test]
    fn test_does_not_parse_email_address() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Contact test@example.com for details.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    // A cluster mixing `@key` and `-@key` is rejected as a whole, and the
    // `@` signs inside it are not re-read as textual citations.
    #[test]
    fn test_unsupported_bracket_cluster_does_not_fall_back_to_textual_citations() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Mixed [@kuhn1962; -@watson1953] cluster.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    // Markdown emphasis is converted to the corresponding HTML elements.
    #[test]
    fn given_markdown_body_when_finalize_html_output_then_markup_is_converted_to_html() {
        let parser = MarkdownParser;
        let input = "**bold** and _em_ text.";
        let output = parser.finalize_html_output(input);
        assert!(
            output.contains("<strong>bold</strong>"),
            "expected <strong>bold</strong> in: {output}"
        );
        assert!(
            output.contains("<em>em</em>"),
            "expected <em>em</em> in: {output}"
        );
    }

    // NUL-delimited placeholder tokens come out of the HTML pass unchanged.
    #[test]
    fn given_markdown_with_nul_tokens_when_finalize_html_output_then_tokens_survive_conversion() {
        let parser = MarkdownParser;
        // NUL tokens stand in for spliced citation HTML; they must survive the
        // pulldown-cmark pass so HtmlPlaceholderRegistry::apply() can substitute them.
        let token = "\x00CITUMHTMLINLINETOKEN0\x00";
        let input = format!("Some prose with {token} inline.");
        let output = parser.finalize_html_output(&input);
        assert!(
            output.contains(token),
            "NUL token must survive pulldown-cmark conversion; output: {output}"
        );
    }

    // Block-level Markdown (a block quote) and inline emphasis inside it are
    // both rendered.
    #[test]
    fn given_markdown_blockquote_when_finalize_html_output_then_blockquote_element_emitted() {
        let parser = MarkdownParser;
        let input = "> block quote with *italic* text";
        let output = parser.finalize_html_output(input);
        assert!(
            output.contains("<blockquote>"),
            "expected <blockquote> in: {output}"
        );
        assert!(
            output.contains("<em>italic</em>"),
            "expected <em>italic</em> in: {output}"
        );
    }
}
516}