Skip to main content

citum_engine/processor/document/
markdown.rs

1/*
2SPDX-License-Identifier: MIT OR Apache-2.0
3SPDX-FileCopyrightText: © 2023-2026 Bruce D'Arcus and Citum contributors
4*/
5
6//! Markdown document parsing for Pandoc-style citations.
7
8use super::{CitationParser, CitationPlacement, CitationStructure, ParsedCitation, ParsedDocument};
9use crate::{Citation, CitationItem};
10use citum_schema::citation::{CitationMode, normalize_locator_text};
11use citum_schema::locale::Locale;
12use std::collections::HashSet;
13
/// A parser for Markdown documents with Pandoc-style citation syntax.
///
/// This parser currently supports inline prose citations and maps them into the
/// shared document-processing pipeline. Markdown-specific footnotes, document
/// metadata, and inline bibliography blocks remain future work.
///
/// The type is a stateless unit struct, so `Default`, `Clone`, `Copy`, and
/// `Debug` are derived rather than written by hand.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParser;
26
27impl CitationParser for MarkdownParser {
28    fn parse_document(&self, content: &str, locale: &Locale) -> ParsedDocument {
29        let citations = find_citations(content, locale)
30            .into_iter()
31            .map(|(start, end, citation)| ParsedCitation {
32                start,
33                end,
34                citation,
35                placement: CitationPlacement::InlineProse,
36                structure: CitationStructure::default(),
37            })
38            .collect();
39
40        ParsedDocument {
41            citations,
42            manual_note_order: Vec::new(),
43            manual_note_references: Vec::new(),
44            manual_note_labels: HashSet::new(),
45            bibliography_blocks: Vec::new(),
46            frontmatter_groups: None,
47            frontmatter_integral_name_memory: None,
48            body_start: 0,
49        }
50    }
51}
52
53#[allow(
54    clippy::string_slice,
55    clippy::unreachable,
56    reason = "Markdown scanning logic"
57)]
58fn find_citations(content: &str, locale: &Locale) -> Vec<(usize, usize, Citation)> {
59    let mut results = Vec::new();
60    let mut offset = 0;
61
62    while offset < content.len() {
63        let remaining = &content[offset..];
64        let next_at = remaining.find('@');
65        let next_bracket = remaining.find('[');
66
67        let (relative_start, kind) = match (next_at, next_bracket) {
68            (Some(at), Some(bracket)) if bracket <= at => (bracket, ScanKind::Bracket),
69            (Some(at), Some(bracket)) if at < bracket => (at, ScanKind::Textual),
70            (Some(at), None) => (at, ScanKind::Textual),
71            (None, Some(bracket)) => (bracket, ScanKind::Bracket),
72            (None, None) => break,
73            _ => unreachable!(),
74        };
75
76        let start = offset + relative_start;
77        let candidate = &content[start..];
78
79        let parsed = match kind {
80            ScanKind::Bracket => parse_bracketed_citation(candidate, locale),
81            ScanKind::Textual => parse_textual_citation(content, start, locale),
82        };
83
84        if let Some((consumed, citation)) = parsed {
85            results.push((start, start + consumed, citation));
86            offset = start + consumed;
87        } else if matches!(kind, ScanKind::Bracket) {
88            offset = start + candidate.find(']').map_or(1, |idx| idx + 1);
89        } else {
90            offset = start + 1;
91        }
92    }
93
94    results
95}
96
/// Which citation form the scanner attempts at a candidate position.
#[derive(Debug, Clone, Copy)]
enum ScanKind {
    /// The candidate starts at a `[` and is handed to `parse_bracketed_citation`.
    Bracket,
    /// The candidate starts at an `@` and is handed to `parse_textual_citation`.
    Textual,
}
102
103#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
104fn parse_bracketed_citation(input: &str, locale: &Locale) -> Option<(usize, Citation)> {
105    if !input.starts_with('[') {
106        return None;
107    }
108
109    let closing = input.find(']')?;
110    let inner = input[1..closing].trim();
111    if inner.is_empty() || !inner.contains('@') {
112        return None;
113    }
114
115    let mut items = Vec::new();
116    let mut suppress_author = None;
117
118    for segment in inner.split(';') {
119        let (item, suppress) = parse_bracketed_item(segment, locale)?;
120        if let Some(existing) = suppress_author {
121            if existing != suppress {
122                return None;
123            }
124        } else {
125            suppress_author = Some(suppress);
126        }
127        items.push(item);
128    }
129
130    Some((
131        closing + 1,
132        Citation {
133            items,
134            suppress_author: suppress_author.unwrap_or(false),
135            ..Default::default()
136        },
137    ))
138}
139
140#[allow(
141    clippy::string_slice,
142    clippy::indexing_slicing,
143    reason = "Citations are ASCII-heavy; indices from find() are on char boundaries"
144)]
145fn parse_bracketed_item(segment: &str, locale: &Locale) -> Option<(CitationItem, bool)> {
146    let segment = segment.trim();
147    let at_pos = segment.find('@')?;
148    let mut suppress_author = false;
149    let prefix_end = if at_pos > 0 && segment.as_bytes()[at_pos - 1] == b'-' {
150        suppress_author = true;
151        at_pos - 1
152    } else {
153        at_pos
154    };
155
156    let prefix = normalize_prefix(&segment[..prefix_end]);
157    let after_at = &segment[at_pos + 1..];
158    let key_end = cite_key_len(after_at)?;
159    let key = &after_at[..key_end];
160    let remainder = after_at[key_end..].trim_start();
161
162    let mut item = CitationItem {
163        id: key.to_string(),
164        prefix,
165        ..Default::default()
166    };
167
168    if let Some(rest) = remainder.strip_prefix(',') {
169        let rest = rest.trim();
170        if !rest.is_empty() {
171            item.locator = normalize_locator_text(rest, locale);
172            if item.locator.is_none() {
173                item.suffix = Some(rest.to_string());
174            }
175        }
176    } else if !remainder.is_empty() {
177        item.suffix = Some(remainder.trim().to_string());
178    }
179
180    Some((item, suppress_author))
181}
182
183#[allow(clippy::string_slice, reason = "@ and indices from find() are safe")]
184fn parse_textual_citation(
185    content: &str,
186    start: usize,
187    locale: &Locale,
188) -> Option<(usize, Citation)> {
189    if !is_valid_textual_start(content, start) {
190        return None;
191    }
192
193    let after_at = &content[start + 1..];
194    let key_end = cite_key_len(after_at)?;
195    let key = &after_at[..key_end];
196    let mut consumed = 1 + key_end;
197
198    let mut item = CitationItem {
199        id: key.to_string(),
200        ..Default::default()
201    };
202
203    let trailing = &content[start + consumed..];
204    if let Some((locator_consumed, locator)) = parse_textual_locator_suffix(trailing, locale) {
205        item.locator = Some(locator);
206        consumed += locator_consumed;
207    }
208
209    Some((
210        consumed,
211        Citation {
212            mode: CitationMode::Integral,
213            items: vec![item],
214            ..Default::default()
215        },
216    ))
217}
218
219#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
220fn parse_textual_locator_suffix(
221    input: &str,
222    locale: &Locale,
223) -> Option<(usize, citum_schema::citation::CitationLocator)> {
224    let whitespace_len = input.len() - input.trim_start_matches(char::is_whitespace).len();
225    let rest = &input[whitespace_len..];
226    if !rest.starts_with('[') {
227        return None;
228    }
229
230    let closing = rest.find(']')?;
231    let inner = rest[1..closing].trim();
232    if inner.is_empty() || inner.contains('@') {
233        return None;
234    }
235
236    let locator = normalize_locator_text(inner, locale)?;
237    Some((whitespace_len + closing + 1, locator))
238}
239
/// Returns the byte length of the cite key at the start of `input`.
///
/// A key is the longest leading run of ASCII letters, ASCII digits, `_`, `-`,
/// `:` and `.`, with any trailing `-`, `:` or `.` removed. Those three
/// characters are only meaningful inside a key (as in Pandoc's citation
/// syntax), so `@kuhn1962.` at the end of a sentence yields the key
/// `kuhn1962` rather than `kuhn1962.`.
///
/// Returns `None` when no key characters remain, for example on empty input,
/// input that starts with another character, or a run made only of
/// punctuation.
fn cite_key_len(input: &str) -> Option<usize> {
    let is_key_char =
        |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | ':' | '.');

    // `split` always yields a first piece: the text before the first
    // character that cannot belong to a key.
    let run = input
        .split(|ch: char| !is_key_char(ch))
        .next()
        .unwrap_or_default();

    // Punctuation at the end of the run belongs to the surrounding prose.
    let key = run.trim_end_matches(|ch: char| matches!(ch, '-' | ':' | '.'));

    (!key.is_empty()).then_some(key.len())
}
252
/// Turns the raw text before a cite key into an item prefix.
///
/// Surrounding whitespace is trimmed. Blank input yields `None`; anything else
/// is returned with exactly one trailing space appended.
fn normalize_prefix(prefix: &str) -> Option<String> {
    Some(prefix.trim())
        .filter(|text| !text.is_empty())
        .map(|text| format!("{text} "))
}
261
/// Reports whether an `@` at byte offset `start` may begin a textual citation.
///
/// The `@` is rejected when the character immediately before it is
/// alphanumeric or one of `_`, `-`, `.`, `/`, `@`, which keeps e-mail
/// addresses and similar tokens from being read as citations. An `@` at the
/// very start of `content` is always accepted.
#[allow(clippy::string_slice, reason = "start index from find() is safe")]
fn is_valid_textual_start(content: &str, start: usize) -> bool {
    let blocks_citation =
        |ch: char| ch.is_alphanumeric() || matches!(ch, '_' | '-' | '.' | '/' | '@');
    !content[..start].ends_with(blocks_citation)
}
267
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;
    use citum_schema::citation::{CitationLocator, LocatorType};

    /// A bracket with two `;`-separated keys yields one citation with two
    /// items, and `ch. 2` on the second item is recognized as a chapter locator.
    #[test]
    fn test_parse_bracketed_multi_cite() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("See [@kuhn1962; @watson1953, ch. 2].", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.items.len(), 2);
        assert_eq!(citation.items[0].id, "kuhn1962");
        assert_eq!(
            citation.items[1].locator,
            Some(CitationLocator::single(LocatorType::Chapter, "2"))
        );
    }

    /// Text before `-@key` becomes the prefix (with one trailing space), the
    /// `-` sets author suppression, and `p. 10` becomes a page locator.
    #[test]
    fn test_parse_bracketed_prefix_and_suppress_author() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations("[see -@kuhn1962, p. 10]", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert!(citation.suppress_author);
        assert_eq!(citation.items[0].prefix.as_deref(), Some("see "));
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// A bare `@key` in running text is parsed as an integral citation.
    #[test]
    fn test_parse_textual_citation() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations(
            "Kuhn argued that @kuhn1962 changed science.",
            &Locale::en_us(),
        );

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(citation.items[0].id, "kuhn1962");
    }

    /// A `[p. 10]` bracket right after a textual `@key` is attached to that
    /// citation as its locator instead of being scanned separately.
    #[test]
    fn test_parse_textual_citation_with_locator_suffix() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("@kuhn1962 [p. 10] argues this point.", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// `parse_document` tags every citation as inline prose and leaves the
    /// note-order and bibliography-block collections empty.
    #[test]
    fn test_parse_document_marks_citations_as_inline_prose() {
        let parser = MarkdownParser;
        let parsed = parser.parse_document("Text [@kuhn1962].", &Locale::en_us());

        assert_eq!(parsed.citations.len(), 1);
        assert_eq!(
            parsed.citations[0].placement,
            CitationPlacement::InlineProse
        );
        assert!(parsed.manual_note_order.is_empty());
        assert!(parsed.bibliography_blocks.is_empty());
    }

    /// An `@` preceded by an alphanumeric character, as in an e-mail address,
    /// is not treated as a citation.
    #[test]
    fn test_does_not_parse_email_address() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Contact test@example.com for details.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    /// A bracket that mixes `@key` with `-@key` is rejected as a whole; the
    /// keys inside it must not be picked up again as textual citations.
    #[test]
    fn test_unsupported_bracket_cluster_does_not_fall_back_to_textual_citations() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Mixed [@kuhn1962; -@watson1953] cluster.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    /// `finalize_html_output` returns its input unchanged for this parser.
    #[test]
    fn test_markdown_finalize_html_output_is_passthrough() {
        // MarkdownParser does not perform any markup-to-HTML conversion; the
        // caller is responsible for rendering CommonMark. The trait default
        // returns the input unchanged.
        let parser = MarkdownParser;
        let input = "**bold** and _em_ and [@key].";
        assert_eq!(parser.finalize_html_output(input), input);
    }
}
385}