1use super::djot::parsing::parse_frontmatter;
9use super::{CitationParser, CitationPlacement, CitationStructure, ParsedCitation, ParsedDocument};
10use crate::{Citation, CitationItem};
11use citum_schema::citation::{CitationMode, normalize_locator_text};
12use citum_schema::locale::Locale;
13use std::collections::HashSet;
14
/// Citation parser for Markdown documents using Pandoc-style citation syntax
/// (`[@key]`, `[-@key, p. 10]`, `@key [p. 10]`).
///
/// The parser is stateless; `Default` is derived because a unit struct has
/// exactly one value.
#[derive(Default)]
pub struct MarkdownParser;
27
28impl CitationParser for MarkdownParser {
29 fn finalize_html_output(&self, rendered: &str) -> String {
37 use pulldown_cmark::{Options, html};
38
39 let (remapped, token_map) = remap_nul_tokens(rendered);
40 let parser = pulldown_cmark::Parser::new_ext(&remapped, Options::ENABLE_STRIKETHROUGH);
41 let mut out = String::new();
42 html::push_html(&mut out, parser);
43
44 for (comment, original) in token_map {
46 out = out.replace(&comment, &original);
47 }
48 out
49 }
50
51 fn render_body_markup<F>(&self, body: &str, fmt: &F) -> String
54 where
55 F: crate::render::format::OutputFormat<Output = String>,
56 {
57 crate::render::markup::render_markdown_body(body, fmt)
58 }
59
60 fn parse_document(&self, content: &str, locale: &Locale) -> ParsedDocument {
61 let (frontmatter_result, body) = parse_frontmatter(content);
62 let body_start = content.len() - body.len();
63 let (frontmatter, frontmatter_error) = match frontmatter_result {
64 Ok(fm) => (fm, None),
65 Err(e) => (None, Some(e)),
66 };
67 let frontmatter_options = frontmatter.as_ref().and_then(|fm| fm.options.clone());
68 let frontmatter_integral_name_memory = frontmatter
70 .as_ref()
71 .and_then(|fm| fm.integral_name_memory.clone())
72 .filter(|_| {
73 frontmatter_options
74 .as_ref()
75 .and_then(|o| o.integral_name_memory.as_ref())
76 .is_none()
77 });
78 let frontmatter_org_abbreviation_memory = frontmatter
79 .and_then(|fm| fm.org_abbreviation_memory)
80 .filter(|_| {
81 frontmatter_options
82 .as_ref()
83 .and_then(|o| o.org_abbreviation_memory.as_ref())
84 .is_none()
85 });
86
87 let citations = find_citations(body, locale)
88 .into_iter()
89 .map(|(start, end, citation)| ParsedCitation {
90 start: body_start + start,
91 end: body_start + end,
92 citation,
93 placement: CitationPlacement::InlineProse,
94 structure: CitationStructure::default(),
95 })
96 .collect();
97
98 ParsedDocument {
99 citations,
100 manual_note_order: Vec::new(),
101 manual_note_references: Vec::new(),
102 manual_note_labels: HashSet::new(),
103 bibliography_blocks: Vec::new(),
104 frontmatter_groups: None,
105 frontmatter_integral_name_memory,
106 frontmatter_org_abbreviation_memory,
107 frontmatter_options,
108 frontmatter_error,
109 body_start,
110 }
111 }
112}
113
114#[allow(
115 clippy::string_slice,
116 clippy::unreachable,
117 reason = "Markdown scanning logic"
118)]
119fn find_citations(content: &str, locale: &Locale) -> Vec<(usize, usize, Citation)> {
120 let mut results = Vec::new();
121 let mut offset = 0;
122
123 while offset < content.len() {
124 let remaining = &content[offset..];
125 let next_at = remaining.find('@');
126 let next_bracket = remaining.find('[');
127
128 let (relative_start, kind) = match (next_at, next_bracket) {
129 (Some(at), Some(bracket)) if bracket <= at => (bracket, ScanKind::Bracket),
130 (Some(at), Some(bracket)) if at < bracket => (at, ScanKind::Textual),
131 (Some(at), None) => (at, ScanKind::Textual),
132 (None, Some(bracket)) => (bracket, ScanKind::Bracket),
133 (None, None) => break,
134 _ => unreachable!(),
135 };
136
137 let start = offset + relative_start;
138 let candidate = &content[start..];
139
140 let parsed = match kind {
141 ScanKind::Bracket => parse_bracketed_citation(candidate, locale),
142 ScanKind::Textual => parse_textual_citation(content, start, locale),
143 };
144
145 if let Some((consumed, citation)) = parsed {
146 results.push((start, start + consumed, citation));
147 offset = start + consumed;
148 } else if matches!(kind, ScanKind::Bracket) {
149 offset = start + candidate.find(']').map_or(1, |idx| idx + 1);
150 } else {
151 offset = start + 1;
152 }
153 }
154
155 results
156}
157
/// Kind of citation candidate found by the scanner in `find_citations`.
#[derive(Debug, Clone, Copy)]
enum ScanKind {
    /// Candidate starts at `[` and is parsed as a bracketed citation.
    Bracket,
    /// Candidate starts at `@` and is parsed as a textual citation.
    Textual,
}
163
164#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
165fn parse_bracketed_citation(input: &str, locale: &Locale) -> Option<(usize, Citation)> {
166 if !input.starts_with('[') {
167 return None;
168 }
169
170 let closing = input.find(']')?;
171 let inner = input[1..closing].trim();
172 if inner.is_empty() || !inner.contains('@') {
173 return None;
174 }
175
176 let mut items = Vec::new();
177 let mut suppress_author = None;
178
179 for segment in inner.split(';') {
180 let (item, suppress) = parse_bracketed_item(segment, locale)?;
181 if let Some(existing) = suppress_author {
182 if existing != suppress {
183 return None;
184 }
185 } else {
186 suppress_author = Some(suppress);
187 }
188 items.push(item);
189 }
190
191 Some((
192 closing + 1,
193 Citation {
194 items,
195 suppress_author: suppress_author.unwrap_or(false),
196 ..Default::default()
197 },
198 ))
199}
200
201#[allow(
202 clippy::string_slice,
203 clippy::indexing_slicing,
204 reason = "Citations are ASCII-heavy; indices from find() are on char boundaries"
205)]
206fn parse_bracketed_item(segment: &str, locale: &Locale) -> Option<(CitationItem, bool)> {
207 let segment = segment.trim();
208 let at_pos = segment.find('@')?;
209 let mut suppress_author = false;
210 let prefix_end = if at_pos > 0 && segment.as_bytes()[at_pos - 1] == b'-' {
211 suppress_author = true;
212 at_pos - 1
213 } else {
214 at_pos
215 };
216
217 let prefix = normalize_prefix(&segment[..prefix_end]);
218 let after_at = &segment[at_pos + 1..];
219 let key_end = cite_key_len(after_at)?;
220 let key = &after_at[..key_end];
221 let remainder = after_at[key_end..].trim_start();
222
223 let mut item = CitationItem {
224 id: key.to_string(),
225 prefix,
226 ..Default::default()
227 };
228
229 if let Some(rest) = remainder.strip_prefix(',') {
230 let rest = rest.trim();
231 if !rest.is_empty() {
232 item.locator = normalize_locator_text(rest, locale);
233 if item.locator.is_none() {
234 item.suffix = Some(rest.to_string());
235 }
236 }
237 } else if !remainder.is_empty() {
238 item.suffix = Some(remainder.trim().to_string());
239 }
240
241 Some((item, suppress_author))
242}
243
244#[allow(clippy::string_slice, reason = "@ and indices from find() are safe")]
245fn parse_textual_citation(
246 content: &str,
247 start: usize,
248 locale: &Locale,
249) -> Option<(usize, Citation)> {
250 if !is_valid_textual_start(content, start) {
251 return None;
252 }
253
254 let after_at = &content[start + 1..];
255 let key_end = cite_key_len(after_at)?;
256 let key = &after_at[..key_end];
257 let mut consumed = 1 + key_end;
258
259 let mut item = CitationItem {
260 id: key.to_string(),
261 ..Default::default()
262 };
263
264 let trailing = &content[start + consumed..];
265 if let Some((locator_consumed, locator)) = parse_textual_locator_suffix(trailing, locale) {
266 item.locator = Some(locator);
267 consumed += locator_consumed;
268 }
269
270 Some((
271 consumed,
272 Citation {
273 mode: CitationMode::Integral,
274 items: vec![item],
275 ..Default::default()
276 },
277 ))
278}
279
280#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
281fn parse_textual_locator_suffix(
282 input: &str,
283 locale: &Locale,
284) -> Option<(usize, citum_schema::citation::CitationLocator)> {
285 let whitespace_len = input.len() - input.trim_start_matches(char::is_whitespace).len();
286 let rest = &input[whitespace_len..];
287 if !rest.starts_with('[') {
288 return None;
289 }
290
291 let closing = rest.find(']')?;
292 let inner = rest[1..closing].trim();
293 if inner.is_empty() || inner.contains('@') {
294 return None;
295 }
296
297 let locator = normalize_locator_text(inner, locale)?;
298 Some((whitespace_len + closing + 1, locator))
299}
300
/// Returns the byte length of the citation key at the start of `input`.
///
/// A key is a run of ASCII letters, digits, `_`, `-`, `:` and `.`. As in
/// Pandoc, punctuation only counts when it is *inside* the key: trailing
/// `-`, `:` or `.` is left out, so `@kuhn1962.` at the end of a sentence
/// yields the key `kuhn1962` and the full stop stays in the prose.
///
/// Returns `None` when `input` does not start with a key (including a run
/// made only of punctuation).
fn cite_key_len(input: &str) -> Option<usize> {
    let mut len = 0;
    for (idx, ch) in input.char_indices() {
        match ch {
            'A'..='Z' | 'a'..='z' | '0'..='9' | '_' => len = idx + ch.len_utf8(),
            // Punctuation keeps the run going but only becomes part of the
            // key once a later word character extends `len` past it.
            '-' | ':' | '.' => {}
            _ => break,
        }
    }

    (len > 0).then_some(len)
}
313
/// Normalises a citation prefix: surrounding whitespace is removed and a
/// single trailing space is appended. Blank input yields `None`.
fn normalize_prefix(prefix: &str) -> Option<String> {
    let text = prefix.trim();
    (!text.is_empty()).then(|| format!("{text} "))
}
322
/// Reports whether the `@` at byte offset `start` may begin a textual citation.
///
/// It may not when the preceding character is alphanumeric or one of
/// `_`, `-`, `.`, `/`, `@`, which rules out e-mail addresses and similar
/// tokens. An `@` at the very start of `content` is always accepted.
#[allow(clippy::string_slice, reason = "start index from find() is safe")]
fn is_valid_textual_start(content: &str, start: usize) -> bool {
    let blocks_citation =
        |ch: char| ch.is_alphanumeric() || matches!(ch, '_' | '-' | '.' | '/' | '@');
    !content[..start].ends_with(blocks_citation)
}
328
/// Replaces every `\x00…\x00` token in `s` with an HTML comment placeholder.
///
/// Returns the rewritten text and a list of `(placeholder, original_token)`
/// pairs, in order of appearance, so the caller can restore the tokens later.
/// Placeholders have the form `<!--CITUM-TOKEN-{index}-->`.
///
/// A stray NUL without a closing partner does not start a token: it and the
/// text after it are copied through unchanged instead of being dropped.
fn remap_nul_tokens(s: &str) -> (String, Vec<(String, String)>) {
    let mut result = String::with_capacity(s.len());
    let mut map: Vec<(String, String)> = Vec::new();

    // Splitting on NUL yields alternating pieces: text, token body, text, …
    let mut pieces = s.split('\x00').peekable();
    while let Some(text) = pieces.next() {
        result.push_str(text);

        let Some(body) = pieces.next() else {
            break;
        };

        if pieces.peek().is_some() {
            // A further piece exists, so this body had its closing NUL.
            let comment = format!("<!--CITUM-TOKEN-{}-->", map.len());
            result.push_str(&comment);
            map.push((comment, format!("\x00{body}\x00")));
        } else {
            // Unterminated token: keep the input verbatim rather than lose it.
            result.push('\x00');
            result.push_str(body);
        }
    }

    (result, map)
}
364
// Unit tests for the Markdown citation parser and its HTML finalisation step.
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;
    use citum_schema::citation::{CitationLocator, LocatorType};

    /// A `;`-separated bracket yields one citation with one item per key;
    /// `ch. 2` on the second item is recognised as a chapter locator.
    #[test]
    fn test_parse_bracketed_multi_cite() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("See [@kuhn1962; @watson1953, ch. 2].", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.items.len(), 2);
        assert_eq!(citation.items[0].id, "kuhn1962");
        assert_eq!(
            citation.items[1].locator,
            Some(CitationLocator::single(LocatorType::Chapter, "2"))
        );
    }

    /// Text before `-@` becomes the prefix (with one trailing space), the `-`
    /// sets author suppression, and `p. 10` becomes a page locator.
    #[test]
    fn test_parse_bracketed_prefix_and_suppress_author() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations("[see -@kuhn1962, p. 10]", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert!(citation.suppress_author);
        assert_eq!(citation.items[0].prefix.as_deref(), Some("see "));
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// A bare `@key` in running text is parsed as an integral citation.
    #[test]
    fn test_parse_textual_citation() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations(
            "Kuhn argued that @kuhn1962 changed science.",
            &Locale::en_us(),
        );

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(citation.items[0].id, "kuhn1962");
    }

    /// A `[p. 10]` directly after a textual key is attached as its locator.
    #[test]
    fn test_parse_textual_citation_with_locator_suffix() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("@kuhn1962 [p. 10] argues this point.", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// `parse_document` tags every citation as inline prose and reports no
    /// manual notes or bibliography blocks.
    #[test]
    fn test_parse_document_marks_citations_as_inline_prose() {
        let parser = MarkdownParser;
        let parsed = parser.parse_document("Text [@kuhn1962].", &Locale::en_us());

        assert_eq!(parsed.citations.len(), 1);
        assert_eq!(
            parsed.citations[0].placement,
            CitationPlacement::InlineProse
        );
        assert!(parsed.manual_note_order.is_empty());
        assert!(parsed.bibliography_blocks.is_empty());
    }

    /// An `@` preceded by a word character (as in an e-mail address) is not
    /// a citation start.
    #[test]
    fn test_does_not_parse_email_address() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Contact test@example.com for details.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    /// A bracket mixing `@key` and `-@key` is rejected as a whole; the keys
    /// inside it must not be picked up again as textual citations.
    #[test]
    fn test_unsupported_bracket_cluster_does_not_fall_back_to_textual_citations() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Mixed [@kuhn1962; -@watson1953] cluster.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    /// Markdown emphasis in the rendered output is converted to HTML tags.
    #[test]
    fn given_markdown_body_when_finalize_html_output_then_markup_is_converted_to_html() {
        let parser = MarkdownParser;
        let input = "**bold** and _em_ text.";
        let output = parser.finalize_html_output(input);
        assert!(
            output.contains("<strong>bold</strong>"),
            "expected <strong>bold</strong> in: {output}"
        );
        assert!(
            output.contains("<em>em</em>"),
            "expected <em>em</em> in: {output}"
        );
    }

    /// NUL-delimited tokens must come out of the HTML conversion unchanged.
    #[test]
    fn given_markdown_with_nul_tokens_when_finalize_html_output_then_tokens_survive_conversion() {
        let parser = MarkdownParser;
        // Token text is wrapped in NUL bytes, the delimiter `remap_nul_tokens` looks for.
        let token = "\x00CITUMHTMLINLINETOKEN0\x00";
        let input = format!("Some prose with {token} inline.");
        let output = parser.finalize_html_output(&input);
        assert!(
            output.contains(token),
            "NUL token must survive pulldown-cmark conversion; output: {output}"
        );
    }

    /// Block-level Markdown (a blockquote) and the inline markup inside it
    /// are both converted.
    #[test]
    fn given_markdown_blockquote_when_finalize_html_output_then_blockquote_element_emitted() {
        let parser = MarkdownParser;
        let input = "> block quote with *italic* text";
        let output = parser.finalize_html_output(input);
        assert!(
            output.contains("<blockquote>"),
            "expected <blockquote> in: {output}"
        );
        assert!(
            output.contains("<em>italic</em>"),
            "expected <em>italic</em> in: {output}"
        );
    }
}