citum_engine/processor/document/
markdown.rs1use super::djot::parsing::parse_frontmatter;
9use super::{CitationParser, CitationPlacement, CitationStructure, ParsedCitation, ParsedDocument};
10use crate::{Citation, CitationItem};
11use citum_schema::citation::{CitationMode, normalize_locator_text};
12use citum_schema::locale::Locale;
13use std::collections::HashSet;
14
/// Citation parser for Markdown documents.
///
/// Recognizes Pandoc-style bracketed citations (`[@key]`) and textual
/// citations (`@key`) through its [`CitationParser`] implementation.
///
/// The parser is stateless, so `Default` is derived rather than hand-written.
#[derive(Default)]
pub struct MarkdownParser;
27
28impl CitationParser for MarkdownParser {
29 fn parse_document(&self, content: &str, locale: &Locale) -> ParsedDocument {
30 let (frontmatter_result, body) = parse_frontmatter(content);
31 let body_start = content.len() - body.len();
32 let (frontmatter, frontmatter_error) = match frontmatter_result {
33 Ok(fm) => (fm, None),
34 Err(e) => (None, Some(e)),
35 };
36 let frontmatter_options = frontmatter.as_ref().and_then(|fm| fm.options.clone());
37 let frontmatter_integral_name_memory = frontmatter
39 .as_ref()
40 .and_then(|fm| fm.integral_name_memory.clone())
41 .filter(|_| {
42 frontmatter_options
43 .as_ref()
44 .and_then(|o| o.integral_name_memory.as_ref())
45 .is_none()
46 });
47 let frontmatter_org_abbreviation_memory = frontmatter
48 .and_then(|fm| fm.org_abbreviation_memory)
49 .filter(|_| {
50 frontmatter_options
51 .as_ref()
52 .and_then(|o| o.org_abbreviation_memory.as_ref())
53 .is_none()
54 });
55
56 let citations = find_citations(body, locale)
57 .into_iter()
58 .map(|(start, end, citation)| ParsedCitation {
59 start: body_start + start,
60 end: body_start + end,
61 citation,
62 placement: CitationPlacement::InlineProse,
63 structure: CitationStructure::default(),
64 })
65 .collect();
66
67 ParsedDocument {
68 citations,
69 manual_note_order: Vec::new(),
70 manual_note_references: Vec::new(),
71 manual_note_labels: HashSet::new(),
72 bibliography_blocks: Vec::new(),
73 frontmatter_groups: None,
74 frontmatter_integral_name_memory,
75 frontmatter_org_abbreviation_memory,
76 frontmatter_options,
77 frontmatter_error,
78 body_start,
79 }
80 }
81}
82
83#[allow(
84 clippy::string_slice,
85 clippy::unreachable,
86 reason = "Markdown scanning logic"
87)]
88fn find_citations(content: &str, locale: &Locale) -> Vec<(usize, usize, Citation)> {
89 let mut results = Vec::new();
90 let mut offset = 0;
91
92 while offset < content.len() {
93 let remaining = &content[offset..];
94 let next_at = remaining.find('@');
95 let next_bracket = remaining.find('[');
96
97 let (relative_start, kind) = match (next_at, next_bracket) {
98 (Some(at), Some(bracket)) if bracket <= at => (bracket, ScanKind::Bracket),
99 (Some(at), Some(bracket)) if at < bracket => (at, ScanKind::Textual),
100 (Some(at), None) => (at, ScanKind::Textual),
101 (None, Some(bracket)) => (bracket, ScanKind::Bracket),
102 (None, None) => break,
103 _ => unreachable!(),
104 };
105
106 let start = offset + relative_start;
107 let candidate = &content[start..];
108
109 let parsed = match kind {
110 ScanKind::Bracket => parse_bracketed_citation(candidate, locale),
111 ScanKind::Textual => parse_textual_citation(content, start, locale),
112 };
113
114 if let Some((consumed, citation)) = parsed {
115 results.push((start, start + consumed, citation));
116 offset = start + consumed;
117 } else if matches!(kind, ScanKind::Bracket) {
118 offset = start + candidate.find(']').map_or(1, |idx| idx + 1);
119 } else {
120 offset = start + 1;
121 }
122 }
123
124 results
125}
126
/// Which marker started the candidate currently examined by `find_citations`.
#[derive(Debug, Clone, Copy)]
enum ScanKind {
    /// The candidate begins at a `[` and is parsed as a bracketed citation.
    Bracket,
    /// The candidate begins at an `@` and is parsed as a textual citation.
    Textual,
}
132
133#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
134fn parse_bracketed_citation(input: &str, locale: &Locale) -> Option<(usize, Citation)> {
135 if !input.starts_with('[') {
136 return None;
137 }
138
139 let closing = input.find(']')?;
140 let inner = input[1..closing].trim();
141 if inner.is_empty() || !inner.contains('@') {
142 return None;
143 }
144
145 let mut items = Vec::new();
146 let mut suppress_author = None;
147
148 for segment in inner.split(';') {
149 let (item, suppress) = parse_bracketed_item(segment, locale)?;
150 if let Some(existing) = suppress_author {
151 if existing != suppress {
152 return None;
153 }
154 } else {
155 suppress_author = Some(suppress);
156 }
157 items.push(item);
158 }
159
160 Some((
161 closing + 1,
162 Citation {
163 items,
164 suppress_author: suppress_author.unwrap_or(false),
165 ..Default::default()
166 },
167 ))
168}
169
170#[allow(
171 clippy::string_slice,
172 clippy::indexing_slicing,
173 reason = "Citations are ASCII-heavy; indices from find() are on char boundaries"
174)]
175fn parse_bracketed_item(segment: &str, locale: &Locale) -> Option<(CitationItem, bool)> {
176 let segment = segment.trim();
177 let at_pos = segment.find('@')?;
178 let mut suppress_author = false;
179 let prefix_end = if at_pos > 0 && segment.as_bytes()[at_pos - 1] == b'-' {
180 suppress_author = true;
181 at_pos - 1
182 } else {
183 at_pos
184 };
185
186 let prefix = normalize_prefix(&segment[..prefix_end]);
187 let after_at = &segment[at_pos + 1..];
188 let key_end = cite_key_len(after_at)?;
189 let key = &after_at[..key_end];
190 let remainder = after_at[key_end..].trim_start();
191
192 let mut item = CitationItem {
193 id: key.to_string(),
194 prefix,
195 ..Default::default()
196 };
197
198 if let Some(rest) = remainder.strip_prefix(',') {
199 let rest = rest.trim();
200 if !rest.is_empty() {
201 item.locator = normalize_locator_text(rest, locale);
202 if item.locator.is_none() {
203 item.suffix = Some(rest.to_string());
204 }
205 }
206 } else if !remainder.is_empty() {
207 item.suffix = Some(remainder.trim().to_string());
208 }
209
210 Some((item, suppress_author))
211}
212
213#[allow(clippy::string_slice, reason = "@ and indices from find() are safe")]
214fn parse_textual_citation(
215 content: &str,
216 start: usize,
217 locale: &Locale,
218) -> Option<(usize, Citation)> {
219 if !is_valid_textual_start(content, start) {
220 return None;
221 }
222
223 let after_at = &content[start + 1..];
224 let key_end = cite_key_len(after_at)?;
225 let key = &after_at[..key_end];
226 let mut consumed = 1 + key_end;
227
228 let mut item = CitationItem {
229 id: key.to_string(),
230 ..Default::default()
231 };
232
233 let trailing = &content[start + consumed..];
234 if let Some((locator_consumed, locator)) = parse_textual_locator_suffix(trailing, locale) {
235 item.locator = Some(locator);
236 consumed += locator_consumed;
237 }
238
239 Some((
240 consumed,
241 Citation {
242 mode: CitationMode::Integral,
243 items: vec![item],
244 ..Default::default()
245 },
246 ))
247}
248
249#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
250fn parse_textual_locator_suffix(
251 input: &str,
252 locale: &Locale,
253) -> Option<(usize, citum_schema::citation::CitationLocator)> {
254 let whitespace_len = input.len() - input.trim_start_matches(char::is_whitespace).len();
255 let rest = &input[whitespace_len..];
256 if !rest.starts_with('[') {
257 return None;
258 }
259
260 let closing = rest.find(']')?;
261 let inner = rest[1..closing].trim();
262 if inner.is_empty() || inner.contains('@') {
263 return None;
264 }
265
266 let locator = normalize_locator_text(inner, locale)?;
267 Some((whitespace_len + closing + 1, locator))
268}
269
/// Returns the byte length of the citation key at the start of `input`.
///
/// A key is a run of ASCII letters, digits, `_`, `-`, `:` and `.`. The
/// punctuation characters `-`, `:` and `.` only count as part of the key when
/// more key characters follow them, so sentence punctuation directly after a
/// citation (`@kuhn1962.`) is left in the surrounding text instead of being
/// folded into the key.
///
/// Returns `None` when no key remains, i.e. `input` is empty, starts with a
/// non-key character, or starts with punctuation only.
fn cite_key_len(input: &str) -> Option<usize> {
    let is_key_char =
        |ch: char| ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | ':' | '.');

    // `split` always yields at least one (possibly empty) leading piece.
    let run = input
        .split(|ch: char| !is_key_char(ch))
        .next()
        .unwrap_or("");
    let key = run.trim_end_matches(|ch: char| matches!(ch, '-' | ':' | '.'));

    (!key.is_empty()).then_some(key.len())
}
282
/// Normalizes the text that precedes `@key` inside a bracketed citation.
///
/// Surrounding whitespace is removed and exactly one trailing space is
/// appended. Returns `None` when nothing but whitespace was supplied.
fn normalize_prefix(prefix: &str) -> Option<String> {
    match prefix.trim() {
        "" => None,
        text => Some(format!("{text} ")),
    }
}
291
/// Decides whether the `@` at byte offset `start` may begin a textual citation.
///
/// The `@` is rejected when the character right before it is alphanumeric or
/// one of `_`, `-`, `.`, `/`, `@`, which keeps e-mail addresses and similar
/// tokens from being read as citations. An `@` at the very beginning of
/// `content` is always accepted.
#[allow(clippy::string_slice, reason = "start index from find() is safe")]
fn is_valid_textual_start(content: &str, start: usize) -> bool {
    match content[..start].chars().next_back() {
        None => true,
        Some(before) => {
            let blocks_citation =
                before.is_alphanumeric() || matches!(before, '_' | '-' | '.' | '/' | '@');
            !blocks_citation
        }
    }
}
297
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;
    use citum_schema::citation::{CitationLocator, LocatorType};

    /// A `;`-separated bracket group yields one citation with one item per
    /// key, and `ch. 2` after a comma becomes a chapter locator.
    #[test]
    fn test_parse_bracketed_multi_cite() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("See [@kuhn1962; @watson1953, ch. 2].", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.items.len(), 2);
        assert_eq!(citation.items[0].id, "kuhn1962");
        assert_eq!(
            citation.items[1].locator,
            Some(CitationLocator::single(LocatorType::Chapter, "2"))
        );
    }

    /// Text before `-@key` becomes the prefix (with one trailing space) and
    /// the `-` sets author suppression on the citation.
    #[test]
    fn test_parse_bracketed_prefix_and_suppress_author() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations("[see -@kuhn1962, p. 10]", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert!(citation.suppress_author);
        assert_eq!(citation.items[0].prefix.as_deref(), Some("see "));
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// A bare `@key` in running prose is parsed as an integral citation.
    #[test]
    fn test_parse_textual_citation() {
        let parser = MarkdownParser;
        let citations = parser.parse_citations(
            "Kuhn argued that @kuhn1962 changed science.",
            &Locale::en_us(),
        );

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(citation.items[0].id, "kuhn1962");
    }

    /// A `[p. 10]` right after a textual key is attached to it as a locator.
    #[test]
    fn test_parse_textual_citation_with_locator_suffix() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("@kuhn1962 [p. 10] argues this point.", &Locale::en_us());

        assert_eq!(citations.len(), 1);
        let (_, _, citation) = &citations[0];
        assert_eq!(citation.mode, CitationMode::Integral);
        assert_eq!(
            citation.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// `parse_document` tags every citation as inline prose and reports no
    /// manual notes or bibliography blocks.
    #[test]
    fn test_parse_document_marks_citations_as_inline_prose() {
        let parser = MarkdownParser;
        let parsed = parser.parse_document("Text [@kuhn1962].", &Locale::en_us());

        assert_eq!(parsed.citations.len(), 1);
        assert_eq!(
            parsed.citations[0].placement,
            CitationPlacement::InlineProse
        );
        assert!(parsed.manual_note_order.is_empty());
        assert!(parsed.bibliography_blocks.is_empty());
    }

    /// An `@` preceded by an alphanumeric character (as in an e-mail
    /// address) must not be read as a citation.
    #[test]
    fn test_does_not_parse_email_address() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Contact test@example.com for details.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    /// A bracket group mixing suppressed and regular items is rejected as a
    /// whole; its keys are not picked up again as textual citations.
    #[test]
    fn test_unsupported_bracket_cluster_does_not_fall_back_to_textual_citations() {
        let parser = MarkdownParser;
        let citations =
            parser.parse_citations("Mixed [@kuhn1962; -@watson1953] cluster.", &Locale::en_us());

        assert!(citations.is_empty());
    }

    /// `finalize_html_output` returns its input unchanged for this parser.
    #[test]
    fn test_markdown_finalize_html_output_is_passthrough() {
        let parser = MarkdownParser;
        let input = "**bold** and _em_ and [@key].";
        assert_eq!(parser.finalize_html_output(input), input);
    }
}