citum_engine/processor/document/
markdown.rs1use super::{CitationParser, CitationPlacement, CitationStructure, ParsedCitation, ParsedDocument};
9use crate::{Citation, CitationItem};
10use citum_schema::citation::{CitationMode, normalize_locator_text};
11use citum_schema::locale::Locale;
12use std::collections::HashSet;
13
/// Parser that locates bracketed (`[@key]`) and textual (`@key`) citation
/// markers in Markdown content.
///
/// The type is a stateless unit struct, so it is trivially `Copy` and its
/// `Default` value is the only value it has.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownParser;
26
27impl CitationParser for MarkdownParser {
28 fn parse_document(&self, content: &str, locale: &Locale) -> ParsedDocument {
29 let citations = find_citations(content, locale)
30 .into_iter()
31 .map(|(start, end, citation)| ParsedCitation {
32 start,
33 end,
34 citation,
35 placement: CitationPlacement::InlineProse,
36 structure: CitationStructure::default(),
37 })
38 .collect();
39
40 ParsedDocument {
41 citations,
42 manual_note_order: Vec::new(),
43 manual_note_references: Vec::new(),
44 manual_note_labels: HashSet::new(),
45 bibliography_blocks: Vec::new(),
46 frontmatter_groups: None,
47 frontmatter_integral_name_memory: None,
48 body_start: 0,
49 }
50 }
51}
52
53#[allow(
54 clippy::string_slice,
55 clippy::unreachable,
56 reason = "Markdown scanning logic"
57)]
58fn find_citations(content: &str, locale: &Locale) -> Vec<(usize, usize, Citation)> {
59 let mut results = Vec::new();
60 let mut offset = 0;
61
62 while offset < content.len() {
63 let remaining = &content[offset..];
64 let next_at = remaining.find('@');
65 let next_bracket = remaining.find('[');
66
67 let (relative_start, kind) = match (next_at, next_bracket) {
68 (Some(at), Some(bracket)) if bracket <= at => (bracket, ScanKind::Bracket),
69 (Some(at), Some(bracket)) if at < bracket => (at, ScanKind::Textual),
70 (Some(at), None) => (at, ScanKind::Textual),
71 (None, Some(bracket)) => (bracket, ScanKind::Bracket),
72 (None, None) => break,
73 _ => unreachable!(),
74 };
75
76 let start = offset + relative_start;
77 let candidate = &content[start..];
78
79 let parsed = match kind {
80 ScanKind::Bracket => parse_bracketed_citation(candidate, locale),
81 ScanKind::Textual => parse_textual_citation(content, start, locale),
82 };
83
84 if let Some((consumed, citation)) = parsed {
85 results.push((start, start + consumed, citation));
86 offset = start + consumed;
87 } else if matches!(kind, ScanKind::Bracket) {
88 offset = start + candidate.find(']').map_or(1, |idx| idx + 1);
89 } else {
90 offset = start + 1;
91 }
92 }
93
94 results
95}
96
/// Which marker the scanner stopped on, and therefore which parser to try.
#[derive(Clone, Copy, Debug)]
enum ScanKind {
    /// The candidate starts at a `[`.
    Bracket,
    /// The candidate starts at an `@`.
    Textual,
}
102
103#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
104fn parse_bracketed_citation(input: &str, locale: &Locale) -> Option<(usize, Citation)> {
105 if !input.starts_with('[') {
106 return None;
107 }
108
109 let closing = input.find(']')?;
110 let inner = input[1..closing].trim();
111 if inner.is_empty() || !inner.contains('@') {
112 return None;
113 }
114
115 let mut items = Vec::new();
116 let mut suppress_author = None;
117
118 for segment in inner.split(';') {
119 let (item, suppress) = parse_bracketed_item(segment, locale)?;
120 if let Some(existing) = suppress_author {
121 if existing != suppress {
122 return None;
123 }
124 } else {
125 suppress_author = Some(suppress);
126 }
127 items.push(item);
128 }
129
130 Some((
131 closing + 1,
132 Citation {
133 items,
134 suppress_author: suppress_author.unwrap_or(false),
135 ..Default::default()
136 },
137 ))
138}
139
140#[allow(
141 clippy::string_slice,
142 clippy::indexing_slicing,
143 reason = "Citations are ASCII-heavy; indices from find() are on char boundaries"
144)]
145fn parse_bracketed_item(segment: &str, locale: &Locale) -> Option<(CitationItem, bool)> {
146 let segment = segment.trim();
147 let at_pos = segment.find('@')?;
148 let mut suppress_author = false;
149 let prefix_end = if at_pos > 0 && segment.as_bytes()[at_pos - 1] == b'-' {
150 suppress_author = true;
151 at_pos - 1
152 } else {
153 at_pos
154 };
155
156 let prefix = normalize_prefix(&segment[..prefix_end]);
157 let after_at = &segment[at_pos + 1..];
158 let key_end = cite_key_len(after_at)?;
159 let key = &after_at[..key_end];
160 let remainder = after_at[key_end..].trim_start();
161
162 let mut item = CitationItem {
163 id: key.to_string(),
164 prefix,
165 ..Default::default()
166 };
167
168 if let Some(rest) = remainder.strip_prefix(',') {
169 let rest = rest.trim();
170 if !rest.is_empty() {
171 item.locator = normalize_locator_text(rest, locale);
172 if item.locator.is_none() {
173 item.suffix = Some(rest.to_string());
174 }
175 }
176 } else if !remainder.is_empty() {
177 item.suffix = Some(remainder.trim().to_string());
178 }
179
180 Some((item, suppress_author))
181}
182
183#[allow(clippy::string_slice, reason = "@ and indices from find() are safe")]
184fn parse_textual_citation(
185 content: &str,
186 start: usize,
187 locale: &Locale,
188) -> Option<(usize, Citation)> {
189 if !is_valid_textual_start(content, start) {
190 return None;
191 }
192
193 let after_at = &content[start + 1..];
194 let key_end = cite_key_len(after_at)?;
195 let key = &after_at[..key_end];
196 let mut consumed = 1 + key_end;
197
198 let mut item = CitationItem {
199 id: key.to_string(),
200 ..Default::default()
201 };
202
203 let trailing = &content[start + consumed..];
204 if let Some((locator_consumed, locator)) = parse_textual_locator_suffix(trailing, locale) {
205 item.locator = Some(locator);
206 consumed += locator_consumed;
207 }
208
209 Some((
210 consumed,
211 Citation {
212 mode: CitationMode::Integral,
213 items: vec![item],
214 ..Default::default()
215 },
216 ))
217}
218
219#[allow(clippy::string_slice, reason = "Brackets and @ are 1-byte ASCII")]
220fn parse_textual_locator_suffix(
221 input: &str,
222 locale: &Locale,
223) -> Option<(usize, citum_schema::citation::CitationLocator)> {
224 let whitespace_len = input.len() - input.trim_start_matches(char::is_whitespace).len();
225 let rest = &input[whitespace_len..];
226 if !rest.starts_with('[') {
227 return None;
228 }
229
230 let closing = rest.find(']')?;
231 let inner = rest[1..closing].trim();
232 if inner.is_empty() || inner.contains('@') {
233 return None;
234 }
235
236 let locator = normalize_locator_text(inner, locale)?;
237 Some((whitespace_len + closing + 1, locator))
238}
239
/// Returns the byte length of the citation key at the start of `input`, or
/// `None` when `input` does not start with a key.
///
/// A key starts with an ASCII letter, digit or `_`. It may contain `-`, `:`
/// and `.` only as *internal* punctuation, so it always ends on a letter,
/// digit or `_`. Trailing punctuation is therefore left to the caller:
/// `"kuhn1962."` at the end of a sentence yields the key `kuhn1962`, not
/// `kuhn1962.`.
///
/// The returned length always falls on a char boundary because a key consists
/// of ASCII bytes only.
fn cite_key_len(input: &str) -> Option<usize> {
    let mut len = 0;

    for (idx, byte) in input.bytes().enumerate() {
        match byte {
            // Word characters extend the key up to and including themselves.
            b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' => len = idx + 1,
            // Punctuation is tentatively accepted; it only counts if a word
            // character follows. It is never allowed as the first character.
            b'-' | b':' | b'.' if idx > 0 => {}
            _ => break,
        }
    }

    (len > 0).then_some(len)
}
252
/// Normalises citation prefix text: surrounding whitespace is removed and a
/// single trailing space is appended. Blank input yields `None`.
fn normalize_prefix(prefix: &str) -> Option<String> {
    Some(prefix.trim())
        .filter(|text| !text.is_empty())
        .map(|text| format!("{text} "))
}
261
/// Reports whether the `@` at byte offset `start` may begin a textual citation.
///
/// The position is rejected when the preceding character is alphanumeric or
/// one of `_`, `-`, `.`, `/`, `@`. This keeps e-mail addresses such as
/// `test@example.com` from being read as citations. The very start of
/// `content` is always valid.
#[allow(clippy::string_slice, reason = "start index from find() is safe")]
fn is_valid_textual_start(content: &str, start: usize) -> bool {
    match content[..start].chars().next_back() {
        None => true,
        Some(before) => !(before.is_alphanumeric() || "_-./@".contains(before)),
    }
}
267
#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::expect_used,
    clippy::panic,
    clippy::indexing_slicing,
    clippy::todo,
    clippy::unimplemented,
    clippy::unreachable,
    clippy::get_unwrap,
    reason = "Panicking is acceptable and often desired in tests."
)]
mod tests {
    use super::*;
    use citum_schema::citation::{CitationLocator, LocatorType};

    /// A bracketed cluster with two keys is one citation with two items, and
    /// the second item keeps its chapter locator.
    #[test]
    fn test_parse_bracketed_multi_cite() {
        let locale = Locale::en_us();
        let found = MarkdownParser.parse_citations("See [@kuhn1962; @watson1953, ch. 2].", &locale);

        assert_eq!(found.len(), 1);
        let (_, _, cluster) = &found[0];
        assert_eq!(cluster.items.len(), 2);
        assert_eq!(cluster.items[0].id, "kuhn1962");
        assert_eq!(
            cluster.items[1].locator,
            Some(CitationLocator::single(LocatorType::Chapter, "2"))
        );
    }

    /// Prefix text, `-@` author suppression and a page locator are all picked
    /// up from a single bracketed item.
    #[test]
    fn test_parse_bracketed_prefix_and_suppress_author() {
        let locale = Locale::en_us();
        let found = MarkdownParser.parse_citations("[see -@kuhn1962, p. 10]", &locale);

        assert_eq!(found.len(), 1);
        let (_, _, cluster) = &found[0];
        assert!(cluster.suppress_author);
        assert_eq!(cluster.items[0].prefix.as_deref(), Some("see "));
        assert_eq!(
            cluster.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// A bare `@key` in running text is an integral citation.
    #[test]
    fn test_parse_textual_citation() {
        let locale = Locale::en_us();
        let found = MarkdownParser
            .parse_citations("Kuhn argued that @kuhn1962 changed science.", &locale);

        assert_eq!(found.len(), 1);
        let (_, _, textual) = &found[0];
        assert_eq!(textual.mode, CitationMode::Integral);
        assert_eq!(textual.items[0].id, "kuhn1962");
    }

    /// A bracket directly after a textual citation supplies its locator.
    #[test]
    fn test_parse_textual_citation_with_locator_suffix() {
        let locale = Locale::en_us();
        let found = MarkdownParser.parse_citations("@kuhn1962 [p. 10] argues this point.", &locale);

        assert_eq!(found.len(), 1);
        let (_, _, textual) = &found[0];
        assert_eq!(textual.mode, CitationMode::Integral);
        assert_eq!(
            textual.items[0].locator,
            Some(CitationLocator::single(LocatorType::Page, "10"))
        );
    }

    /// `parse_document` tags every citation as inline prose and leaves the
    /// note and bibliography collections empty.
    #[test]
    fn test_parse_document_marks_citations_as_inline_prose() {
        let locale = Locale::en_us();
        let document = MarkdownParser.parse_document("Text [@kuhn1962].", &locale);

        assert_eq!(document.citations.len(), 1);
        assert_eq!(
            document.citations[0].placement,
            CitationPlacement::InlineProse
        );
        assert!(document.manual_note_order.is_empty());
        assert!(document.bibliography_blocks.is_empty());
    }

    /// An `@` preceded by a word character (as in an e-mail address) is not a
    /// citation.
    #[test]
    fn test_does_not_parse_email_address() {
        let locale = Locale::en_us();
        let found =
            MarkdownParser.parse_citations("Contact test@example.com for details.", &locale);

        assert!(found.is_empty());
    }

    /// A cluster mixing `@key` and `-@key` is rejected as a whole; its keys
    /// must not be re-read as textual citations.
    #[test]
    fn test_unsupported_bracket_cluster_does_not_fall_back_to_textual_citations() {
        let locale = Locale::en_us();
        let found =
            MarkdownParser.parse_citations("Mixed [@kuhn1962; -@watson1953] cluster.", &locale);

        assert!(found.is_empty());
    }

    /// `finalize_html_output` returns its input unchanged for this parser.
    #[test]
    fn test_markdown_finalize_html_output_is_passthrough() {
        let parser = MarkdownParser;
        let rendered = "**bold** and _em_ and [@key].";
        assert_eq!(parser.finalize_html_output(rendered), rendered);
    }
}