1use super::djot::parsing::parse_frontmatter;
9use super::{CitationParser, CitationPlacement, CitationStructure, ParsedCitation, ParsedDocument};
10use crate::processor::document::ManualNoteReference;
11use crate::{Citation, CitationItem};
12use citum_schema::citation::{CitationMode, normalize_locator_text};
13use citum_schema::locale::Locale;
14use std::collections::HashSet;
15use std::ops::Range;
16
/// Location of one footnote definition inside the scanned text.
///
/// Values are produced by `scan_manual_notes_markdown` and consumed by
/// `footnote_placement` to decide whether a citation sits inside a note.
#[derive(Debug, Clone, PartialEq, Eq)]
struct FootnoteRange {
    /// Label of the footnote definition (the `1` in `[^1]: ...`).
    label: String,
    /// Byte range spanning the definition, from the start of its opening
    /// event to the end of its closing event.
    content: Range<usize>,
}
26
/// Citation parser for Markdown documents.
///
/// The type is a stateless unit struct; all behaviour lives in its
/// `CitationParser` implementation. `Default` is derived rather than written
/// by hand, and the cheap `Debug`/`Clone`/`Copy` derives make the parser easy
/// to log and pass around by value.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParser;
35
36impl CitationParser for MarkdownParser {
37 fn finalize_html_output(&self, rendered: &str) -> String {
45 use pulldown_cmark::{Options, html};
46
47 let (remapped, token_map) = remap_nul_tokens(rendered);
48 let parser = pulldown_cmark::Parser::new_ext(
49 &remapped,
50 Options::ENABLE_STRIKETHROUGH | Options::ENABLE_FOOTNOTES | Options::ENABLE_TABLES,
51 );
52 let mut out = String::new();
53 html::push_html(&mut out, parser);
54
55 for (comment, original) in token_map {
57 out = out.replace(&comment, &original);
58 }
59 out
60 }
61
62 fn render_body_markup<F>(&self, body: &str, fmt: &F) -> String
65 where
66 F: crate::render::format::OutputFormat<Output = String>,
67 {
68 crate::render::markup::render_markdown_body(body, fmt)
69 }
70
71 fn parse_document(&self, content: &str, locale: &Locale) -> ParsedDocument {
72 let (frontmatter_result, body) = parse_frontmatter(content);
73 let body_start = content.len() - body.len();
74 let (frontmatter, frontmatter_error) = match frontmatter_result {
75 Ok(fm) => (fm, None),
76 Err(e) => (None, Some(e)),
77 };
78 let frontmatter_options = frontmatter.as_ref().and_then(|fm| fm.options.clone());
79 let frontmatter_integral_name_memory = frontmatter
81 .as_ref()
82 .and_then(|fm| fm.integral_name_memory.clone())
83 .filter(|_| {
84 frontmatter_options
85 .as_ref()
86 .and_then(|o| o.integral_name_memory.as_ref())
87 .is_none()
88 });
89 let frontmatter_org_abbreviation_memory = frontmatter
90 .and_then(|fm| fm.org_abbreviation_memory)
91 .filter(|_| {
92 frontmatter_options
93 .as_ref()
94 .and_then(|o| o.org_abbreviation_memory.as_ref())
95 .is_none()
96 });
97
98 let (raw_note_refs, manual_note_labels, footnote_ranges) = scan_manual_notes_markdown(body);
99
100 let mut seen_labels = HashSet::new();
102 let mut manual_note_order = Vec::new();
103 let manual_note_references: Vec<ManualNoteReference> = raw_note_refs
104 .into_iter()
105 .map(|r| ManualNoteReference {
106 label: r.label.clone(),
107 start: body_start + r.start,
108 })
109 .inspect(|r| {
110 if seen_labels.insert(r.label.clone()) {
111 manual_note_order.push(r.label.clone());
112 }
113 })
114 .collect();
115
116 let adjusted_ranges: Vec<FootnoteRange> = footnote_ranges
118 .into_iter()
119 .map(|fr| FootnoteRange {
120 label: fr.label,
121 content: (body_start + fr.content.start)..(body_start + fr.content.end),
122 })
123 .collect();
124
125 let citations = find_citations(body, locale)
126 .into_iter()
127 .map(|(start, end, citation)| {
128 let abs_start = body_start + start;
129 let abs_end = body_start + end;
130 let placement = footnote_placement(abs_start, abs_end, &adjusted_ranges);
131 ParsedCitation {
132 start: abs_start,
133 end: abs_end,
134 citation,
135 placement,
136 structure: CitationStructure::default(),
137 }
138 })
139 .collect();
140
141 ParsedDocument {
142 citations,
143 manual_note_order,
144 manual_note_references,
145 manual_note_labels,
146 bibliography_blocks: Vec::new(),
147 frontmatter_groups: None,
148 frontmatter_integral_name_memory,
149 frontmatter_org_abbreviation_memory,
150 frontmatter_options,
151 frontmatter_error,
152 body_start,
153 }
154 }
155}
156
157fn scan_manual_notes_markdown(
168 content: &str,
169) -> (
170 Vec<ManualNoteReference>,
171 HashSet<String>,
172 Vec<FootnoteRange>,
173) {
174 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
175
176 let opts = Options::ENABLE_FOOTNOTES | Options::ENABLE_STRIKETHROUGH;
177 let mut manual_note_references = Vec::new();
178 let mut manual_note_labels = HashSet::new();
179 let mut footnote_ranges = Vec::new();
180 let mut footnote_stack: Vec<(String, usize)> = Vec::new();
181
182 for (event, range) in Parser::new_ext(content, opts).into_offset_iter() {
183 match event {
184 Event::FootnoteReference(label) if footnote_stack.is_empty() => {
185 manual_note_references.push(ManualNoteReference {
186 label: label.to_string(),
187 start: range.start,
188 });
189 manual_note_labels.insert(label.to_string());
190 }
191 Event::Start(Tag::FootnoteDefinition(label)) => {
192 manual_note_labels.insert(label.to_string());
193 footnote_stack.push((label.to_string(), range.start));
194 }
195 Event::End(TagEnd::FootnoteDefinition) => {
196 if let Some((open_label, content_start)) = footnote_stack.pop() {
197 footnote_ranges.push(FootnoteRange {
198 label: open_label,
199 content: content_start..range.end,
200 });
201 }
202 }
203 _ => {}
204 }
205 }
206
207 (manual_note_references, manual_note_labels, footnote_ranges)
208}
209
210fn footnote_placement(start: usize, end: usize, ranges: &[FootnoteRange]) -> CitationPlacement {
213 ranges
214 .iter()
215 .find(|fr| fr.content.start <= start && end <= fr.content.end)
216 .map_or(CitationPlacement::InlineProse, |fr| {
217 CitationPlacement::ManualFootnote {
218 label: fr.label.clone(),
219 }
220 })
221}
222
223#[allow(
224 clippy::string_slice,
225 clippy::unreachable,
226 reason = "Markdown scanning logic"
227)]
228fn find_citations(content: &str, locale: &Locale) -> Vec<(usize, usize, Citation)> {
229 let mut results = Vec::new();
230 let mut offset = 0;
231
232 while offset < content.len() {
233 let remaining = &content[offset..];
234 let next_at = remaining.find('@');
235 let next_bracket = remaining.find('[');
236
237 let (relative_start, kind) = match (next_at, next_bracket) {
238 (Some(at), Some(bracket)) if bracket <= at => (bracket, ScanKind::Bracket),
239 (Some(at), Some(bracket)) if at < bracket => (at, ScanKind::Textual),
240 (Some(at), None) => (at, ScanKind::Textual),
241 (None, Some(bracket)) => (bracket, ScanKind::Bracket),
242 (None, None) => break,
243 _ => unreachable!(),
244 };
245
246 let start = offset + relative_start;
247 let candidate = &content[start..];
248
249 let parsed = match kind {
250 ScanKind::Bracket => parse_bracketed_citation(candidate, locale),
251 ScanKind::Textual => parse_textual_citation(content, start, locale),
252 };
253
254 if let Some((consumed, citation)) = parsed {
255 results.push((start, start + consumed, citation));
256 offset = start + consumed;
257 } else if matches!(kind, ScanKind::Bracket) {
258 offset = start + candidate.find(']').map_or(1, |idx| idx + 1);
259 } else {
260 offset = start + 1;
261 }
262 }
263
264 results
265}
266
/// Which citation syntax the scanner is about to try at the current position.
#[derive(Clone, Copy, Debug)]
enum ScanKind {
    /// The position holds a `[`, so a bracketed citation is attempted.
    Bracket,
    /// The position holds an `@`, so a textual citation is attempted.
    Textual,
}
272
273#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
274fn parse_bracketed_citation(input: &str, locale: &Locale) -> Option<(usize, Citation)> {
275 if !input.starts_with('[') {
276 return None;
277 }
278
279 let closing = input.find(']')?;
280 let inner = input[1..closing].trim();
281 if inner.is_empty() || !inner.contains('@') {
282 return None;
283 }
284
285 let mut items = Vec::new();
286 let mut suppress_author = None;
287
288 for segment in inner.split(';') {
289 let (item, suppress) = parse_bracketed_item(segment, locale)?;
290 if let Some(existing) = suppress_author {
291 if existing != suppress {
292 return None;
293 }
294 } else {
295 suppress_author = Some(suppress);
296 }
297 items.push(item);
298 }
299
300 Some((
301 closing + 1,
302 Citation {
303 items,
304 suppress_author: suppress_author.unwrap_or(false),
305 ..Default::default()
306 },
307 ))
308}
309
310#[allow(
311 clippy::string_slice,
312 clippy::indexing_slicing,
313 reason = "Citations are ASCII-heavy; indices from find() are on char boundaries"
314)]
315fn parse_bracketed_item(segment: &str, locale: &Locale) -> Option<(CitationItem, bool)> {
316 let segment = segment.trim();
317 let at_pos = segment.find('@')?;
318 let mut suppress_author = false;
319 let prefix_end = if at_pos > 0 && segment.as_bytes()[at_pos - 1] == b'-' {
320 suppress_author = true;
321 at_pos - 1
322 } else {
323 at_pos
324 };
325
326 let prefix = normalize_prefix(&segment[..prefix_end]);
327 let after_at = &segment[at_pos + 1..];
328 let key_end = cite_key_len(after_at)?;
329 let key = &after_at[..key_end];
330 let remainder = after_at[key_end..].trim_start();
331
332 let mut item = CitationItem {
333 id: key.to_string(),
334 prefix,
335 ..Default::default()
336 };
337
338 if let Some(rest) = remainder.strip_prefix(',') {
339 let rest = rest.trim();
340 if !rest.is_empty() {
341 item.locator = normalize_locator_text(rest, locale);
342 if item.locator.is_none() {
343 item.suffix = Some(rest.to_string());
344 }
345 }
346 } else if !remainder.is_empty() {
347 item.suffix = Some(remainder.trim().to_string());
348 }
349
350 Some((item, suppress_author))
351}
352
353#[allow(clippy::string_slice, reason = "@ and indices from find() are safe")]
354fn parse_textual_citation(
355 content: &str,
356 start: usize,
357 locale: &Locale,
358) -> Option<(usize, Citation)> {
359 if !is_valid_textual_start(content, start) {
360 return None;
361 }
362
363 let after_at = &content[start + 1..];
364 let key_end = cite_key_len(after_at)?;
365 let key = &after_at[..key_end];
366 let mut consumed = 1 + key_end;
367
368 let mut item = CitationItem {
369 id: key.to_string(),
370 ..Default::default()
371 };
372
373 let trailing = &content[start + consumed..];
374 if let Some((locator_consumed, locator)) = parse_textual_locator_suffix(trailing, locale) {
375 item.locator = Some(locator);
376 consumed += locator_consumed;
377 }
378
379 Some((
380 consumed,
381 Citation {
382 mode: CitationMode::Integral,
383 items: vec![item],
384 ..Default::default()
385 },
386 ))
387}
388
389#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
390fn parse_textual_locator_suffix(
391 input: &str,
392 locale: &Locale,
393) -> Option<(usize, citum_schema::citation::CitationLocator)> {
394 let whitespace_len = input.len() - input.trim_start_matches(char::is_whitespace).len();
395 let rest = &input[whitespace_len..];
396 if !rest.starts_with('[') {
397 return None;
398 }
399
400 let closing = rest.find(']')?;
401 let inner = rest[1..closing].trim();
402 if inner.is_empty() || inner.contains('@') {
403 return None;
404 }
405
406 let locator = normalize_locator_text(inner, locale)?;
407 Some((whitespace_len + closing + 1, locator))
408}
409
/// Returns the byte length of the citation key at the start of `input`.
///
/// A key is a run of ASCII letters, digits, `_`, `-`, `:` and `.`. The
/// punctuation characters `-`, `:` and `.` only count when they are internal
/// to the key: trailing ones are left out, so sentence punctuation directly
/// after a citation (`@kuhn1962.` or `@kuhn1962: ...`) is not swallowed into
/// the key. Returns `None` when no key characters remain.
fn cite_key_len(input: &str) -> Option<usize> {
    let run = input
        .split(|ch: char| !matches!(ch, 'A'..='Z' | 'a'..='z' | '0'..='9' | '_' | '-' | ':' | '.'))
        .next()
        .unwrap_or_default();

    // Trailing punctuation belongs to the surrounding text, not to the key.
    let key = run.trim_end_matches(|ch: char| matches!(ch, '-' | ':' | '.'));

    (!key.is_empty()).then_some(key.len())
}
422
/// Normalizes the text that precedes a cite key inside a bracketed item.
///
/// Surrounding whitespace is removed and a single trailing space is appended.
/// Returns `None` when nothing but whitespace was given.
fn normalize_prefix(prefix: &str) -> Option<String> {
    let core = prefix.trim();
    (!core.is_empty()).then(|| format!("{core} "))
}
431
/// Reports whether the `@` at byte offset `start` may begin a textual citation.
///
/// The `@` is rejected when the character right before it is alphanumeric or
/// one of `_`, `-`, `.`, `/`, `@`, which keeps e-mail addresses and similar
/// tokens from being read as citations. An `@` at the very start is accepted.
#[allow(clippy::string_slice, reason = "start index from find() is safe")]
fn is_valid_textual_start(content: &str, start: usize) -> bool {
    match content[..start].chars().next_back() {
        None => true,
        Some(prev) => !(prev.is_alphanumeric() || matches!(prev, '_' | '-' | '.' | '/' | '@')),
    }
}
437
/// Replaces every NUL-delimited token in `s` with an HTML comment placeholder.
///
/// A token is the text between two `\x00` bytes, delimiters included. The
/// n-th token (counting from zero) is replaced by `<!--CITUM-TOKEN-n-->`.
///
/// Returns the rewritten text and a list of `(placeholder, original token)`
/// pairs in order of appearance, so the caller can restore the tokens later.
///
/// A lone `\x00` without a closing partner is not a token: it and the text
/// after it are copied through unchanged instead of being dropped.
fn remap_nul_tokens(s: &str) -> (String, Vec<(String, String)>) {
    let mut result = String::with_capacity(s.len());
    let mut map: Vec<(String, String)> = Vec::new();
    let mut rest = s;

    while let Some((before, after_open)) = rest.split_once('\x00') {
        result.push_str(before);

        let Some((body, after_close)) = after_open.split_once('\x00') else {
            // Unterminated token: keep the input verbatim rather than lose text.
            result.push('\x00');
            result.push_str(after_open);
            return (result, map);
        };

        let comment = format!("<!--CITUM-TOKEN-{}-->", map.len());
        result.push_str(&comment);
        map.push((comment, format!("\x00{body}\x00")));
        rest = after_close;
    }

    result.push_str(rest);
    (result, map)
}
473
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;
    use citum_schema::citation::{CitationLocator, LocatorType};

    /// Runs `finalize_html_output` on `markdown` with a fresh parser.
    fn html_for(markdown: &str) -> String {
        MarkdownParser.finalize_html_output(markdown)
    }

    /// Parses `markdown` as a whole document with the `Locale::en_us()` locale.
    fn document_for(markdown: &str) -> ParsedDocument {
        MarkdownParser.parse_document(markdown, &Locale::en_us())
    }

    /// A bracket with two `;`-separated items yields one citation with two items.
    #[test]
    fn test_parse_bracketed_multi_cite() {
        let locale = Locale::en_us();
        let citations =
            MarkdownParser.parse_citations("See [@kuhn1962; @watson1953, ch. 2].", &locale);

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.items.len(), 2);
        assert_eq!(citation.items[0].id, "kuhn1962");
        assert_eq!(
            citation.items[1].locator,
            Some(CitationLocator::single(LocatorType::Chapter, "2"))
        );
    }

    /// A prefix, a `-@` suppression marker and a locator are all picked up.
    #[test]
    fn test_parse_bracketed_prefix_and_suppress_author() {
        let locale = Locale::en_us();
        let citations = MarkdownParser.parse_citations("[see -@kuhn1962, p. 10]", &locale);

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert!(citation.suppress_author);
        assert_eq!(citation.items[0].prefix.as_deref(), Some("see "));
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// A bare `@key` in running text is an integral citation.
    #[test]
    fn test_parse_textual_citation() {
        let locale = Locale::en_us();
        let citations = MarkdownParser.parse_citations(
            "Kuhn argued that @kuhn1962 changed science.",
            &locale,
        );

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(citation.items[0].id, "kuhn1962");
    }

    /// A `[locator]` right after a textual key is attached to the citation.
    #[test]
    fn test_parse_textual_citation_with_locator_suffix() {
        let locale = Locale::en_us();
        let citations =
            MarkdownParser.parse_citations("@kuhn1962 [p. 10] argues this point.", &locale);

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// Without footnote definitions every citation is placed in prose.
    #[test]
    fn test_parse_document_marks_citations_as_inline_prose() {
        let parsed = document_for("Text [@kuhn1962].");

        assert_eq!(parsed.citations.len(), 1);
        assert_eq!(
            parsed.citations[0].placement,
            CitationPlacement::InlineProse
        );
        assert!(parsed.manual_note_order.is_empty());
        assert!(parsed.bibliography_blocks.is_empty());
    }

    /// An `@` preceded by a word character is not a citation.
    #[test]
    fn test_does_not_parse_email_address() {
        let locale = Locale::en_us();
        let citations =
            MarkdownParser.parse_citations("Contact test@example.com for details.", &locale);

        assert!(citations.is_empty());
    }

    /// A rejected bracket is skipped entirely instead of being re-read as textual citations.
    #[test]
    fn test_unsupported_bracket_cluster_does_not_fall_back_to_textual_citations() {
        let locale = Locale::en_us();
        let citations =
            MarkdownParser.parse_citations("Mixed [@kuhn1962; -@watson1953] cluster.", &locale);

        assert!(citations.is_empty());
    }

    /// Inline emphasis is converted to HTML elements.
    #[test]
    fn given_markdown_body_when_finalize_html_output_then_markup_is_converted_to_html() {
        let output = html_for("**bold** and _em_ text.");
        assert!(
            output.contains("<strong>bold</strong>"),
            "expected <strong>bold</strong> in: {output}"
        );
        assert!(
            output.contains("<em>em</em>"),
            "expected <em>em</em> in: {output}"
        );
    }

    /// NUL-delimited tokens come out of the HTML conversion unchanged.
    #[test]
    fn given_markdown_with_nul_tokens_when_finalize_html_output_then_tokens_survive_conversion() {
        let token = "\x00CITUMHTMLINLINETOKEN0\x00";
        let output = html_for(&format!("Some prose with {token} inline."));
        assert!(
            output.contains(token),
            "NUL token must survive pulldown-cmark conversion; output: {output}"
        );
    }

    /// Block quotes become `<blockquote>` and keep their inline markup.
    #[test]
    fn given_markdown_blockquote_when_finalize_html_output_then_blockquote_element_emitted() {
        let output = html_for("> block quote with *italic* text");
        assert!(
            output.contains("<blockquote>"),
            "expected <blockquote> in: {output}"
        );
        assert!(
            output.contains("<em>italic</em>"),
            "expected <em>italic</em> in: {output}"
        );
    }

    /// Pipe tables are rendered because the table extension is enabled.
    #[test]
    fn given_markdown_pipe_table_when_finalize_html_output_then_table_element_emitted() {
        let output = html_for("| A | B |\n|---|---|\n| 1 | 2 |");
        assert!(
            output.contains("<table>"),
            "pipe table should render as <table>: {output}"
        );
    }

    /// Footnote definitions are rendered because the footnote extension is enabled.
    #[test]
    fn given_markdown_footnote_def_when_finalize_html_output_then_footnote_rendered() {
        let output = html_for("Text[^1].\n\n[^1]: A note.");
        assert!(
            output.contains("footnote") || output.contains("fn1"),
            "footnote definition should produce HTML footnote markup: {output}"
        );
    }

    /// A citation inside a footnote definition is placed in that manual footnote.
    #[test]
    fn given_citation_inside_footnote_def_when_parse_document_then_placement_is_manual_footnote() {
        let parsed = document_for("See note[^1].\n\n[^1]: See [@kuhn1962].");

        assert_eq!(parsed.citations.len(), 1, "one citation expected");
        assert!(
            matches!(
                parsed.citations[0].placement,
                CitationPlacement::ManualFootnote { .. }
            ),
            "citation inside [^n]: block should be ManualFootnote, got: {:?}",
            parsed.citations[0].placement
        );
        assert!(
            parsed.manual_note_labels.contains("1"),
            "footnote label '1' should be tracked: {:?}",
            parsed.manual_note_labels
        );
        assert_eq!(parsed.manual_note_order, vec!["1".to_string()]);
    }

    /// A prose citation stays inline even when an unrelated footnote exists.
    #[test]
    fn given_citation_in_prose_when_parse_document_then_placement_is_inline_prose() {
        let parsed =
            document_for("As shown by [@kuhn1962], the method works.\n\n[^1]: Unrelated note.");

        assert_eq!(parsed.citations.len(), 1);
        assert!(
            matches!(
                parsed.citations[0].placement,
                CitationPlacement::InlineProse
            ),
            "prose citation should be InlineProse: {:?}",
            parsed.citations[0].placement
        );
    }

    /// Note order follows the first reference to each label, not definition order.
    #[test]
    fn given_multiple_footnotes_when_parse_document_then_note_order_is_first_reference_order() {
        let parsed =
            document_for("First[^b] then[^a].\n\n[^a]: [@kuhn1962].\n\n[^b]: [@smith2010].");

        assert_eq!(
            parsed.manual_note_order,
            vec!["b".to_string(), "a".to_string()]
        );
        assert_eq!(parsed.citations.len(), 2);
        for parsed_citation in &parsed.citations {
            assert!(
                matches!(
                    parsed_citation.placement,
                    CitationPlacement::ManualFootnote { .. }
                ),
                "both citations are inside footnote definitions: {:?}",
                parsed_citation.placement
            );
        }
    }
}