Skip to main content

panache_parser/parser/inlines/
citations.rs

//! Citation parsing for Pandoc's citations extension.
//!
//! Syntax:
//! - Bracketed: `[@doe99]`, `[@doe99; @smith2000]`
//! - With locator: `[see @doe99, pp. 33-35]`
//! - Suppress author: `[-@doe99]`
//! - Author-in-text: `@doe99` (bare, without brackets)

9use crate::syntax::SyntaxKind;
10use rowan::GreenNodeBuilder;
11
/// Try to parse a bracketed citation at the start of `text`.
///
/// Bracketed citations look like `[@key]`, `[@key1; @key2]` or
/// `[see @key, pp. 1-10]`.
///
/// Returns `Some((length, content))`, where `length` is the number of bytes
/// consumed (both outer brackets included) and `content` is the text strictly
/// between them. Returns `None` when:
/// - `text` does not start with `[`,
/// - the outer bracket is never closed, or
/// - no unescaped `@` occurs before the outer closing bracket (this is what
///   distinguishes a citation from ordinary link text).
///
/// A backslash escapes the byte that follows it, so `\]`, `\[` and `\@` are
/// neither counted as brackets nor as a citation marker.
pub(crate) fn try_parse_bracketed_citation(text: &str) -> Option<(usize, &str)> {
    let bytes = text.as_bytes();
    if bytes.first() != Some(&b'[') {
        return None;
    }

    // One pass: remember whether an `@` was seen while looking for the
    // bracket that closes the one at index 0.
    let mut saw_marker = false;
    let mut depth = 1usize;
    let mut pos = 1;

    while let Some(&byte) = bytes.get(pos) {
        match byte {
            b'\\' => {
                // Skip the backslash and the byte it escapes. If that byte
                // starts a multi-byte character, the remaining continuation
                // bytes never match any arm below, so they are stepped over
                // harmlessly.
                pos += 2;
                continue;
            }
            b'@' => saw_marker = true,
            b'[' => depth += 1,
            b']' => {
                depth -= 1;
                if depth == 0 {
                    // `pos` sits on an ASCII `]`, so slicing here is always
                    // on a character boundary.
                    return if saw_marker {
                        Some((pos + 1, &text[1..pos]))
                    } else {
                        None
                    };
                }
            }
            _ => {}
        }
        pos += 1;
    }

    // Ran out of input before the outer bracket was closed.
    None
}
97
98/// Try to parse a bare citation (author-in-text) starting at the current position.
99/// Returns Some((length, key, has_suppress)) if successful, None otherwise.
100///
101/// Bare citations have the syntax: @key or -@key
102pub(crate) fn try_parse_bare_citation(text: &str) -> Option<(usize, &str, bool)> {
103    let bytes = text.as_bytes();
104
105    if bytes.is_empty() {
106        return None;
107    }
108
109    let mut pos = 0;
110    let has_suppress = bytes[pos] == b'-';
111
112    if has_suppress {
113        pos += 1;
114        if pos >= bytes.len() {
115            return None;
116        }
117    }
118
119    // Must have @ next
120    if bytes[pos] != b'@' {
121        return None;
122    }
123    pos += 1;
124
125    if pos >= bytes.len() {
126        return None;
127    }
128
129    // Parse the citation key
130    let key_start = pos;
131    let key_len = parse_citation_key(&text[pos..])?;
132
133    if key_len == 0 {
134        return None;
135    }
136
137    let total_len = pos + key_len;
138    let key = &text[key_start..total_len];
139
140    Some((total_len, key, has_suppress))
141}
142
/// Report whether `key` has the shape of a Quarto cross-reference key
/// (e.g. `fig-plot`, `eq-energy`).
///
/// `key` must not include the leading `@`. It qualifies when the text before
/// its first `-` is one of the recognised reference-type prefixes (compared
/// ASCII case-insensitively) and at least one character follows that `-`.
pub fn is_quarto_crossref_key(key: &str) -> bool {
    const PREFIXES: &[&str] = &[
        "fig", "tbl", "lst", "tip", "nte", "wrn", "imp", "cau", "thm", "lem", "cor", "prp",
        "cnj", "def", "exm", "exr", "sol", "rem", "alg", "eq", "sec",
    ];

    match key.split_once('-') {
        // Compare in place instead of allocating a lowercased copy of the key.
        Some((prefix, rest)) if !rest.is_empty() => PREFIXES
            .iter()
            .any(|known| prefix.eq_ignore_ascii_case(known)),
        // No `-` at all, or nothing after it.
        _ => false,
    }
}
177
/// Label-type prefixes accepted by [`is_bookdown_label`].
pub const BOOKDOWN_LABEL_PREFIXES: &[&str] = &[
    "eq", "fig", "tab", "thm", "lem", "cor", "prp", "cnj", "def", "exm", "exr", "sol", "rem",
    "alg", "sec", "hyp",
];

/// Report whether `label` is exactly one of [`BOOKDOWN_LABEL_PREFIXES`].
///
/// The comparison is case-sensitive and covers the whole string, so a full
/// `prefix:name` label does not match — pass only the prefix part.
pub fn is_bookdown_label(label: &str) -> bool {
    BOOKDOWN_LABEL_PREFIXES.iter().any(|&known| known == label)
}
186
187pub fn has_bookdown_prefix(label: &str) -> bool {
188    let mut parts = label.splitn(2, ':');
189    let prefix = parts.next().unwrap_or("");
190    let rest = parts.next().unwrap_or("");
191    if rest.is_empty() {
192        return false;
193    }
194    is_bookdown_label(prefix)
195}
196
197pub(crate) fn emit_crossref(builder: &mut GreenNodeBuilder, key: &str, has_suppress: bool) {
198    builder.start_node(SyntaxKind::CROSSREF.into());
199
200    if has_suppress {
201        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "-@");
202    } else {
203        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "@");
204    }
205
206    if key.starts_with('{') && key.ends_with('}') {
207        builder.token(SyntaxKind::CROSSREF_BRACE_OPEN.into(), "{");
208        builder.token(SyntaxKind::CROSSREF_KEY.into(), &key[1..key.len() - 1]);
209        builder.token(SyntaxKind::CROSSREF_BRACE_CLOSE.into(), "}");
210    } else {
211        builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
212    }
213
214    builder.finish_node();
215}
216
217pub(crate) fn emit_bookdown_crossref(builder: &mut GreenNodeBuilder, key: &str) {
218    builder.start_node(SyntaxKind::CROSSREF.into());
219    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_OPEN.into(), "\\@ref(");
220    builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
221    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_CLOSE.into(), ")");
222    builder.finish_node();
223}
224
225/// Parse a citation key following Pandoc's rules.
226/// Returns the length of the key, or None if invalid.
227///
228/// Citation keys:
229/// - Must start with letter, digit, or _
230/// - Can contain alphanumerics and single internal punctuation: :.#$%&-+?<>~/
231/// - Keys in braces @{...} can contain anything
232/// - Double internal punctuation terminates key
233/// - Trailing punctuation not included
234fn parse_citation_key(text: &str) -> Option<usize> {
235    if text.is_empty() {
236        return None;
237    }
238
239    // Check for braced key: @{...}
240    if text.starts_with('{') {
241        // Find matching closing brace
242        let mut escape_next = false;
243
244        for (idx, ch) in text.char_indices().skip(1) {
245            if escape_next {
246                escape_next = false;
247                continue;
248            }
249
250            match ch {
251                '\\' => escape_next = true,
252                '}' => return Some(idx + ch.len_utf8()),
253                _ => {}
254            }
255        }
256
257        // No closing brace found
258        return None;
259    }
260
261    // Regular key: must start with letter, digit, or _
262    let mut iter = text.char_indices();
263    let (_, first_char) = iter.next()?;
264    if !first_char.is_alphanumeric() && first_char != '_' {
265        return None;
266    }
267
268    let mut last_alnum_end = first_char.len_utf8();
269    let mut last_included_end = last_alnum_end;
270    let mut last_punct_start: Option<usize> = None;
271    let mut prev_was_punct = false;
272
273    for (idx, ch) in iter {
274        if ch.is_alphanumeric() || ch == '_' {
275            prev_was_punct = false;
276            last_alnum_end = idx + ch.len_utf8();
277            last_included_end = last_alnum_end;
278            last_punct_start = None;
279        } else if is_internal_punctuation(ch) {
280            // Check if previous was also punctuation (double punct terminates)
281            if prev_was_punct {
282                // Double punctuation - terminate before the first punctuation
283                return Some(last_punct_start.unwrap_or(last_alnum_end));
284            }
285            prev_was_punct = true;
286            last_punct_start = Some(idx);
287            last_included_end = idx + ch.len_utf8();
288        } else {
289            // Not a valid key character - terminate here
290            break;
291        }
292    }
293
294    if prev_was_punct {
295        return Some(last_alnum_end);
296    }
297
298    if last_included_end == 0 {
299        None
300    } else {
301        Some(last_included_end)
302    }
303}
304
/// Report whether `ch` is one of the punctuation characters allowed inside a
/// citation key: `:.#$%&-+?<>~/`.
fn is_internal_punctuation(ch: char) -> bool {
    const INTERNAL_PUNCTUATION: &str = ":.#$%&-+?<>~/";
    INTERNAL_PUNCTUATION.contains(ch)
}
312
313/// Emit a bracketed citation node to the builder.
314pub(crate) fn emit_bracketed_citation(builder: &mut GreenNodeBuilder, content: &str) {
315    builder.start_node(SyntaxKind::CITATION.into());
316
317    // Opening bracket
318    builder.token(SyntaxKind::LINK_START.into(), "[");
319
320    // Emit prefix + citations + suffix with fine-grained tokens.
321    emit_bracketed_citation_content(builder, content);
322
323    // Closing bracket
324    builder.token(SyntaxKind::LINK_DEST.into(), "]");
325
326    builder.finish_node();
327}
328
329fn emit_bracketed_citation_content(builder: &mut GreenNodeBuilder, content: &str) {
330    let mut text_start = 0;
331    let mut iter = content.char_indices().peekable();
332
333    while let Some((idx, ch)) = iter.next() {
334        // Backslash escapes (e.g. `\@`, `\[`, `\]`) suppress citation/separator
335        // recognition for the following character — matching Pandoc, which
336        // treats the escape as a literal in the citation prefix/suffix.
337        if ch == '\\' {
338            iter.next();
339            continue;
340        }
341
342        if ch == '@' || (ch == '-' && matches!(iter.peek(), Some((_, '@')))) {
343            if idx > text_start {
344                builder.token(
345                    SyntaxKind::CITATION_CONTENT.into(),
346                    &content[text_start..idx],
347                );
348            }
349
350            let mut marker_len = 1;
351            let marker_text = if ch == '-' {
352                iter.next();
353                marker_len = 2;
354                "-@"
355            } else {
356                "@"
357            };
358            builder.token(SyntaxKind::CITATION_MARKER.into(), marker_text);
359
360            let key_start = idx + marker_len;
361            if key_start >= content.len() {
362                text_start = key_start;
363                continue;
364            }
365
366            if let Some(key_len) = parse_citation_key(&content[key_start..]) {
367                let key_end = key_start + key_len;
368                let key = &content[key_start..key_end];
369                if key.starts_with('{') && key.ends_with('}') {
370                    builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
371                    if key.len() > 2 {
372                        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
373                    }
374                    builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
375                } else {
376                    builder.token(SyntaxKind::CITATION_KEY.into(), key);
377                }
378                while matches!(iter.peek(), Some((next_idx, _)) if *next_idx < key_end) {
379                    iter.next();
380                }
381                text_start = key_end;
382                continue;
383            }
384
385            text_start = key_start;
386            continue;
387        }
388
389        if ch == ';' {
390            if idx > text_start {
391                builder.token(
392                    SyntaxKind::CITATION_CONTENT.into(),
393                    &content[text_start..idx],
394                );
395            }
396            builder.token(SyntaxKind::CITATION_SEPARATOR.into(), ";");
397            text_start = idx + ch.len_utf8();
398            continue;
399        }
400    }
401
402    if text_start < content.len() {
403        builder.token(SyntaxKind::CITATION_CONTENT.into(), &content[text_start..]);
404    }
405}
406
407/// Emit a bare citation node to the builder.
408pub(crate) fn emit_bare_citation(builder: &mut GreenNodeBuilder, key: &str, has_suppress: bool) {
409    builder.start_node(SyntaxKind::CITATION.into());
410
411    // Emit marker (@ or -@)
412    if has_suppress {
413        builder.token(SyntaxKind::CITATION_MARKER.into(), "-@");
414    } else {
415        builder.token(SyntaxKind::CITATION_MARKER.into(), "@");
416    }
417
418    // Check if key is braced
419    if key.starts_with('{') && key.ends_with('}') {
420        builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
421        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
422        builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
423    } else {
424        builder.token(SyntaxKind::CITATION_KEY.into(), key);
425    }
426
427    builder.finish_node();
428}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// Check `parse_citation_key` against a table of `(input, expected)` pairs.
    fn check_keys(cases: &[(&str, Option<usize>)]) {
        for &(input, expected) in cases {
            assert_eq!(parse_citation_key(input), expected);
        }
    }

    /// Check `try_parse_bare_citation` against a table of `(input, expected)` pairs.
    fn check_bare(cases: &[(&str, Option<(usize, &str, bool)>)]) {
        for &(input, expected) in cases {
            assert_eq!(try_parse_bare_citation(input), expected);
        }
    }

    /// Check `try_parse_bracketed_citation` against a table of `(input, expected)` pairs.
    fn check_bracketed(cases: &[(&str, Option<(usize, &str)>)]) {
        for &(input, expected) in cases {
            assert_eq!(try_parse_bracketed_citation(input), expected);
        }
    }

    // ---- Citation key parsing ----

    #[test]
    fn test_parse_simple_citation_key() {
        check_keys(&[("doe99", Some(5)), ("smith2000", Some(9))]);
    }

    #[test]
    fn test_parse_citation_key_with_internal_punct() {
        check_keys(&[("Foo_bar.baz", Some(11)), ("author:2020", Some(11))]);
    }

    #[test]
    fn test_parse_citation_key_trailing_punct() {
        // The trailing punctuation character is not part of the key.
        check_keys(&[("Foo_bar.baz.", Some(11)), ("key:value:", Some(9))]);
    }

    #[test]
    fn test_parse_citation_key_double_punct() {
        // Two punctuation characters in a row end the key: "Foo_bar".
        check_keys(&[("Foo_bar--baz", Some(7))]);
    }

    #[test]
    fn test_parse_citation_key_with_braces() {
        check_keys(&[
            ("{https://example.com}", Some(21)),
            ("{Foo_bar.baz.}", Some(14)),
        ]);
    }

    #[test]
    fn test_parse_citation_key_invalid_start() {
        check_keys(&[(".invalid", None), (":invalid", None)]);
    }

    #[test]
    fn test_parse_citation_key_stops_at_space() {
        check_keys(&[("key rest", Some(3))]);
    }

    // ---- Bare citation parsing ----

    #[test]
    fn test_parse_bare_citation_simple() {
        check_bare(&[("@doe99", Some((6, "doe99", false)))]);
    }

    #[test]
    fn test_parse_bare_citation_with_suppress() {
        check_bare(&[("-@smith04", Some((9, "smith04", true)))]);
    }

    #[test]
    fn test_parse_bare_citation_with_trailing_text() {
        check_bare(&[("@doe99 says", Some((6, "doe99", false)))]);
    }

    #[test]
    fn test_parse_bare_citation_braced_key() {
        check_bare(&[(
            "@{https://example.com}",
            Some((22, "{https://example.com}", false)),
        )]);
    }

    #[test]
    fn test_parse_bare_citation_not_citation() {
        check_bare(&[("not a citation", None), ("@", None)]);
    }

    // ---- Bracketed citation parsing ----

    #[test]
    fn test_parse_bracketed_citation_simple() {
        check_bracketed(&[("[@doe99]", Some((8, "@doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_multiple() {
        check_bracketed(&[("[@doe99; @smith2000]", Some((20, "@doe99; @smith2000")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_with_prefix() {
        check_bracketed(&[("[see @doe99]", Some((12, "see @doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_with_locator() {
        check_bracketed(&[("[@doe99, pp. 33-35]", Some((19, "@doe99, pp. 33-35")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_complex() {
        check_bracketed(&[(
            "[see @doe99, pp. 33-35 and *passim*]",
            Some((36, "see @doe99, pp. 33-35 and *passim*")),
        )]);
    }

    #[test]
    fn test_parse_bracketed_citation_with_suppress() {
        check_bracketed(&[("[-@doe99]", Some((9, "-@doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_not_citation() {
        // Ordinary link text must be left alone.
        check_bracketed(&[("[text](url)", None), ("[just text]", None)]);
    }

    #[test]
    fn test_parse_bracketed_citation_nested_brackets() {
        check_bracketed(&[("[see [nested] @doe99]", Some((21, "see [nested] @doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_escaped_bracket() {
        check_bracketed(&[(
            r"[@doe99 with \] escaped]",
            Some((24, r"@doe99 with \] escaped")),
        )]);
    }

    #[test]
    fn test_parse_bracketed_citation_paren_in_prefix() {
        // Pandoc treats parentheses in the citation prefix as ordinary text,
        // so they must not abort citation detection.
        check_bracketed(&[(
            "[see (Smith 1999) and @doe99]",
            Some((29, "see (Smith 1999) and @doe99")),
        )]);
    }

    #[test]
    fn test_parse_bracketed_citation_escaped_at_in_prefix() {
        // Pandoc accepts \@ref(label) inside the citation prefix without
        // mistaking it for a citation marker; the real citation is the
        // unescaped @key that follows.
        check_bracketed(&[(
            r"[see also \@ref(svm) and @bischl_applied_2024]",
            Some((46, r"see also \@ref(svm) and @bischl_applied_2024")),
        )]);
    }
}