Skip to main content

panache_parser/parser/inlines/
citations.rs

//! Citation parsing for Pandoc's citations extension.
//!
//! Syntax:
//! - Bracketed: `[@doe99]`, `[@doe99; @smith2000]`
//! - With locator: `[see @doe99, pp. 33-35]`
//! - Suppress author: `[-@doe99]`
//! - Author-in-text: `@doe99` (bare, without brackets)

9use super::sink::InlineSink;
10use crate::syntax::SyntaxKind;
11
12/// Try to parse a bracketed citation starting at the current position.
13/// Returns Some((length, content)) if successful, None otherwise.
14///
15/// Bracketed citations have the syntax: [@key], [@key1; @key2], [see @key, pp. 1-10]
16pub(crate) fn try_parse_bracketed_citation(text: &str) -> Option<(usize, &str)> {
17    let bytes = text.as_bytes();
18
19    // Must start with [
20    if bytes.is_empty() || bytes[0] != b'[' {
21        return None;
22    }
23
24    // Look ahead to see if this contains a citation marker (@)
25    // We need to distinguish from regular links
26    let mut has_citation = false;
27    let mut pos = 1;
28    let mut bracket_depth = 0;
29
30    while pos < bytes.len() {
31        match bytes[pos] {
32            b'\\' => {
33                // Skip escaped character
34                pos += 2;
35                continue;
36            }
37            b'`' => {
38                // Skip verbatim code spans; markers inside don't count.
39                match code_span_end(bytes, pos) {
40                    Some(end) => pos = end,
41                    None => pos += 1,
42                }
43            }
44            b'[' => {
45                bracket_depth += 1;
46                pos += 1;
47            }
48            b']' => {
49                if bracket_depth == 0 {
50                    // Closing bracket of main citation - stop looking
51                    break;
52                }
53                bracket_depth -= 1;
54                pos += 1;
55            }
56            b'@' => {
57                // Found a citation marker - this is likely a citation
58                has_citation = true;
59                break;
60            }
61            _ => {
62                pos += 1;
63            }
64        }
65    }
66
67    if !has_citation {
68        return None;
69    }
70
71    // Now find the closing bracket
72    pos = 1;
73    bracket_depth = 1;
74
75    while pos < bytes.len() {
76        match bytes[pos] {
77            b'\\' => {
78                // Skip escaped character
79                pos += 2;
80                continue;
81            }
82            b'`' => {
83                // Skip verbatim code spans; brackets inside don't close.
84                match code_span_end(bytes, pos) {
85                    Some(end) => pos = end,
86                    None => pos += 1,
87                }
88            }
89            b'[' => {
90                bracket_depth += 1;
91                pos += 1;
92            }
93            b']' => {
94                bracket_depth -= 1;
95                if bracket_depth == 0 {
96                    // Found the closing bracket
97                    let content = &text[1..pos];
98                    return Some((pos + 1, content));
99                }
100                pos += 1;
101            }
102            _ => {
103                pos += 1;
104            }
105        }
106    }
107
108    // No closing bracket found
109    None
110}
111
112/// Try to parse a bare citation (author-in-text) starting at the current position.
113/// Returns Some((length, key, has_suppress)) if successful, None otherwise.
114///
115/// Bare citations have the syntax: @key or -@key
116pub(crate) fn try_parse_bare_citation(text: &str) -> Option<(usize, &str, bool)> {
117    let bytes = text.as_bytes();
118
119    if bytes.is_empty() {
120        return None;
121    }
122
123    let mut pos = 0;
124    let has_suppress = bytes[pos] == b'-';
125
126    if has_suppress {
127        pos += 1;
128        if pos >= bytes.len() {
129            return None;
130        }
131    }
132
133    // Must have @ next
134    if bytes[pos] != b'@' {
135        return None;
136    }
137    pos += 1;
138
139    if pos >= bytes.len() {
140        return None;
141    }
142
143    // Parse the citation key
144    let key_start = pos;
145    let key_len = parse_citation_key(&text[pos..])?;
146
147    if key_len == 0 {
148        return None;
149    }
150
151    let total_len = pos + key_len;
152    let key = &text[key_start..total_len];
153
154    Some((total_len, key, has_suppress))
155}
156
/// Whether `key` looks like a built-in Quarto cross-reference key
/// (e.g. `fig-plot`, `eq-energy`).
///
/// The key is split at its first `-`. It is a cross-reference key when the
/// part before the dash is one of the prefixes listed below (compared
/// ASCII case-insensitively) and the part after the dash is non-empty.
/// Keys without a dash, or with nothing after it, are rejected.
pub fn is_quarto_crossref_key(key: &str) -> bool {
    const QUARTO_PREFIXES: &[&str] = &[
        "fig", "tbl", "lst", "tip", "nte", "wrn", "imp", "cau", "thm", "lem", "cor", "prp",
        "cnj", "def", "exm", "exr", "sol", "rem", "alg", "eq", "sec",
    ];

    match key.split_once('-') {
        // Compare in place instead of allocating a lowercased copy of `key`.
        Some((prefix, rest)) if !rest.is_empty() => QUARTO_PREFIXES
            .iter()
            .any(|known| prefix.eq_ignore_ascii_case(known)),
        _ => false,
    }
}
191
192/// Like [`is_quarto_crossref_key`], but also accepts any key whose prefix
193/// appears in `custom_prefixes`. Used to recognize cross-reference prefixes
194/// injected by Quarto extensions (e.g. pseudocode's `@algo-`) that aren't
195/// built in. Matching is case-insensitive on the prefix, consistent with the
196/// built-in check.
197pub fn is_crossref_key(key: &str, custom_prefixes: &[String]) -> bool {
198    is_quarto_crossref_key(key) || has_custom_crossref_prefix(key, custom_prefixes)
199}
200
/// Whether `key`'s prefix (the segment before the first `-`) appears in
/// `custom_prefixes`.
///
/// Unlike [`is_quarto_crossref_key`], this matches *only* the configured
/// extension prefixes, so callers can tell an extension-injected
/// cross-reference (whose target panache can't resolve) apart from a built-in
/// one (whose target it can and should validate).
///
/// The comparison is ASCII case-insensitive. Returns `false` when `key` has
/// no `-`, when nothing follows the first `-`, or when `custom_prefixes` is
/// empty.
pub fn has_custom_crossref_prefix(key: &str, custom_prefixes: &[String]) -> bool {
    match key.split_once('-') {
        // `eq_ignore_ascii_case` already ignores case, so no lowercased copy
        // of `key` is needed; an empty prefix list simply matches nothing.
        Some((prefix, rest)) if !rest.is_empty() => custom_prefixes
            .iter()
            .any(|candidate| candidate.eq_ignore_ascii_case(prefix)),
        _ => false,
    }
}
221
/// Label-type prefixes accepted by [`is_bookdown_label`].
///
/// [`has_bookdown_prefix`] looks for one of these before the first `:` of a
/// label (e.g. the `fig` in `fig:plot`). Entries are compared exactly, so
/// matching is case-sensitive.
pub const BOOKDOWN_LABEL_PREFIXES: &[&str] = &[
    "eq", "fig", "tab", "thm", "lem", "cor", "prp", "cnj", "def", "exm", "exr", "sol", "rem",
    "alg", "sec", "hyp",
];
226
227pub fn is_bookdown_label(label: &str) -> bool {
228    BOOKDOWN_LABEL_PREFIXES.contains(&label)
229}
230
231pub fn has_bookdown_prefix(label: &str) -> bool {
232    let mut parts = label.splitn(2, ':');
233    let prefix = parts.next().unwrap_or("");
234    let rest = parts.next().unwrap_or("");
235    if rest.is_empty() {
236        return false;
237    }
238    is_bookdown_label(prefix)
239}
240
241pub(crate) fn emit_crossref(builder: &mut impl InlineSink, key: &str, has_suppress: bool) {
242    builder.start_node(SyntaxKind::CROSSREF.into());
243
244    if has_suppress {
245        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "-@");
246    } else {
247        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "@");
248    }
249
250    if key.starts_with('{') && key.ends_with('}') {
251        builder.token(SyntaxKind::CROSSREF_BRACE_OPEN.into(), "{");
252        builder.token(SyntaxKind::CROSSREF_KEY.into(), &key[1..key.len() - 1]);
253        builder.token(SyntaxKind::CROSSREF_BRACE_CLOSE.into(), "}");
254    } else {
255        builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
256    }
257
258    builder.finish_node();
259}
260
261pub(crate) fn emit_bookdown_crossref(builder: &mut impl InlineSink, key: &str) {
262    builder.start_node(SyntaxKind::CROSSREF.into());
263    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_OPEN.into(), "\\@ref(");
264    builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
265    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_CLOSE.into(), ")");
266    builder.finish_node();
267}
268
269/// Parse a citation key following Pandoc's rules.
270/// Returns the length of the key, or None if invalid.
271///
272/// Citation keys:
273/// - Must start with letter, digit, or _
274/// - Can contain alphanumerics and single internal punctuation: :.#$%&-+?<>~/
275/// - Keys in braces @{...} can contain anything
276/// - Double internal punctuation terminates key
277/// - Trailing punctuation not included
278fn parse_citation_key(text: &str) -> Option<usize> {
279    if text.is_empty() {
280        return None;
281    }
282
283    // Check for braced key: @{...}
284    if text.starts_with('{') {
285        // Find matching closing brace
286        let mut escape_next = false;
287
288        for (idx, ch) in text.char_indices().skip(1) {
289            if escape_next {
290                escape_next = false;
291                continue;
292            }
293
294            match ch {
295                '\\' => escape_next = true,
296                '}' => return Some(idx + ch.len_utf8()),
297                _ => {}
298            }
299        }
300
301        // No closing brace found
302        return None;
303    }
304
305    // Regular key: must start with letter, digit, or _
306    let mut iter = text.char_indices();
307    let (_, first_char) = iter.next()?;
308    if !first_char.is_alphanumeric() && first_char != '_' {
309        return None;
310    }
311
312    let mut last_alnum_end = first_char.len_utf8();
313    let mut last_included_end = last_alnum_end;
314    let mut last_punct_start: Option<usize> = None;
315    let mut prev_was_punct = false;
316
317    for (idx, ch) in iter {
318        if ch.is_alphanumeric() || ch == '_' {
319            prev_was_punct = false;
320            last_alnum_end = idx + ch.len_utf8();
321            last_included_end = last_alnum_end;
322            last_punct_start = None;
323        } else if is_internal_punctuation(ch) {
324            // Check if previous was also punctuation (double punct terminates)
325            if prev_was_punct {
326                // Double punctuation - terminate before the first punctuation
327                return Some(last_punct_start.unwrap_or(last_alnum_end));
328            }
329            prev_was_punct = true;
330            last_punct_start = Some(idx);
331            last_included_end = idx + ch.len_utf8();
332        } else {
333            // Not a valid key character - terminate here
334            break;
335        }
336    }
337
338    if prev_was_punct {
339        return Some(last_alnum_end);
340    }
341
342    if last_included_end == 0 {
343        None
344    } else {
345        Some(last_included_end)
346    }
347}
348
/// Find the end of the backtick code span that opens at `bytes[pos]`.
///
/// The opener is the full run of backticks starting at `pos`. The span is
/// closed by the first later backtick run of exactly the same length, and
/// the index just past that closing run is returned. `None` means no such
/// run exists, in which case the backticks are literal text.
///
/// Code spans are verbatim, so citation markers (`@`), separators (`;`), and
/// brackets (`]`) inside them must not influence citation detection — this
/// matches pandoc, which parses `` [`@foo`] `` as a link, not a citation.
fn code_span_end(bytes: &[u8], pos: usize) -> Option<usize> {
    let backtick_run = |from: usize| {
        bytes
            .iter()
            .skip(from)
            .take_while(|&&byte| byte == b'`')
            .count()
    };

    let opener_len = backtick_run(pos);
    let mut cursor = pos + opener_len;

    while cursor < bytes.len() {
        if bytes[cursor] != b'`' {
            cursor += 1;
            continue;
        }
        // Consume the whole run; only an exact-length run closes the span.
        let closer_len = backtick_run(cursor);
        cursor += closer_len;
        if closer_len == opener_len {
            return Some(cursor);
        }
    }

    None
}
380
/// Whether `ch` is one of the punctuation characters allowed *inside* a
/// citation key: `:.#$%&-+?<>~/`.
fn is_internal_punctuation(ch: char) -> bool {
    const INTERNAL_PUNCTUATION: &str = ":.#$%&-+?<>~/";
    INTERNAL_PUNCTUATION.contains(ch)
}
388
389/// Emit a bracketed citation node to the builder.
390pub(crate) fn emit_bracketed_citation(builder: &mut impl InlineSink, content: &str) {
391    builder.start_node(SyntaxKind::CITATION.into());
392
393    // Opening bracket
394    builder.token(SyntaxKind::LINK_START.into(), "[");
395
396    // Emit prefix + citations + suffix with fine-grained tokens.
397    emit_bracketed_citation_content(builder, content);
398
399    // Closing bracket
400    builder.token(SyntaxKind::LINK_DEST.into(), "]");
401
402    builder.finish_node();
403}
404
405fn emit_bracketed_citation_content(builder: &mut impl InlineSink, content: &str) {
406    let mut text_start = 0;
407    let mut iter = content.char_indices().peekable();
408
409    while let Some((idx, ch)) = iter.next() {
410        // Backslash escapes (e.g. `\@`, `\[`, `\]`) suppress citation/separator
411        // recognition for the following character — matching Pandoc, which
412        // treats the escape as a literal in the citation prefix/suffix.
413        if ch == '\\' {
414            iter.next();
415            continue;
416        }
417
418        if ch == '`'
419            && let Some(end) = code_span_end(content.as_bytes(), idx)
420        {
421            // Verbatim code span: leave its bytes in the pending
422            // CITATION_CONTENT run and skip markers/separators inside it.
423            while matches!(iter.peek(), Some((next_idx, _)) if *next_idx < end) {
424                iter.next();
425            }
426            continue;
427        }
428
429        if ch == '@' || (ch == '-' && matches!(iter.peek(), Some((_, '@')))) {
430            if idx > text_start {
431                builder.token(
432                    SyntaxKind::CITATION_CONTENT.into(),
433                    &content[text_start..idx],
434                );
435            }
436
437            let mut marker_len = 1;
438            let marker_text = if ch == '-' {
439                iter.next();
440                marker_len = 2;
441                "-@"
442            } else {
443                "@"
444            };
445            builder.token(SyntaxKind::CITATION_MARKER.into(), marker_text);
446
447            let key_start = idx + marker_len;
448            if key_start >= content.len() {
449                text_start = key_start;
450                continue;
451            }
452
453            if let Some(key_len) = parse_citation_key(&content[key_start..]) {
454                let key_end = key_start + key_len;
455                let key = &content[key_start..key_end];
456                if key.starts_with('{') && key.ends_with('}') {
457                    builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
458                    if key.len() > 2 {
459                        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
460                    }
461                    builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
462                } else {
463                    builder.token(SyntaxKind::CITATION_KEY.into(), key);
464                }
465                while matches!(iter.peek(), Some((next_idx, _)) if *next_idx < key_end) {
466                    iter.next();
467                }
468                text_start = key_end;
469                continue;
470            }
471
472            text_start = key_start;
473            continue;
474        }
475
476        if ch == ';' {
477            if idx > text_start {
478                builder.token(
479                    SyntaxKind::CITATION_CONTENT.into(),
480                    &content[text_start..idx],
481                );
482            }
483            builder.token(SyntaxKind::CITATION_SEPARATOR.into(), ";");
484            text_start = idx + ch.len_utf8();
485            continue;
486        }
487    }
488
489    if text_start < content.len() {
490        builder.token(SyntaxKind::CITATION_CONTENT.into(), &content[text_start..]);
491    }
492}
493
494/// Emit a bare citation node to the builder.
495pub(crate) fn emit_bare_citation(builder: &mut impl InlineSink, key: &str, has_suppress: bool) {
496    builder.start_node(SyntaxKind::CITATION.into());
497
498    // Emit marker (@ or -@)
499    if has_suppress {
500        builder.token(SyntaxKind::CITATION_MARKER.into(), "-@");
501    } else {
502        builder.token(SyntaxKind::CITATION_MARKER.into(), "@");
503    }
504
505    // Check if key is braced
506    if key.starts_with('{') && key.ends_with('}') {
507        builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
508        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
509        builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
510    } else {
511        builder.token(SyntaxKind::CITATION_KEY.into(), key);
512    }
513
514    builder.finish_node();
515}
516
#[cfg(test)]
mod tests {
    use super::*;

    // ---- Citation key parsing -------------------------------------------

    #[test]
    fn test_parse_simple_citation_key() {
        for (input, expected) in [("doe99", 5), ("smith2000", 9)] {
            assert_eq!(parse_citation_key(input), Some(expected), "input: {input:?}");
        }
    }

    #[test]
    fn test_parse_citation_key_with_internal_punct() {
        for input in ["Foo_bar.baz", "author:2020"] {
            assert_eq!(parse_citation_key(input), Some(11), "input: {input:?}");
        }
    }

    #[test]
    fn test_parse_citation_key_trailing_punct() {
        // Trailing punctuation is not part of the key.
        for (input, expected) in [("Foo_bar.baz.", 11), ("key:value:", 9)] {
            assert_eq!(parse_citation_key(input), Some(expected), "input: {input:?}");
        }
    }

    #[test]
    fn test_parse_citation_key_double_punct() {
        // Two punctuation characters in a row end the key: "Foo_bar".
        assert_eq!(parse_citation_key("Foo_bar--baz"), Some(7));
    }

    #[test]
    fn test_parse_citation_key_with_braces() {
        for (input, expected) in [("{https://example.com}", 21), ("{Foo_bar.baz.}", 14)] {
            assert_eq!(parse_citation_key(input), Some(expected), "input: {input:?}");
        }
    }

    #[test]
    fn test_parse_citation_key_invalid_start() {
        for input in [".invalid", ":invalid"] {
            assert_eq!(parse_citation_key(input), None, "input: {input:?}");
        }
    }

    #[test]
    fn test_parse_citation_key_stops_at_space() {
        assert_eq!(parse_citation_key("key rest"), Some(3));
    }

    // ---- Cross-reference key detection ----------------------------------

    #[test]
    fn is_crossref_key_accepts_builtin_without_custom() {
        let no_custom: &[String] = &[];
        assert!(is_crossref_key("fig-plot", no_custom));
        assert!(!is_crossref_key("algo-cd", no_custom));
    }

    #[test]
    fn is_crossref_key_accepts_custom_prefix() {
        let custom = [String::from("algo")];
        assert!(is_crossref_key("algo-cd", &custom));
        // Case-insensitive on the prefix, consistent with the built-in check.
        assert!(is_crossref_key("ALGO-cd", &custom));
        // Built-ins still match alongside the custom set.
        assert!(is_crossref_key("tbl-x", &custom));
        // A bare prefix with no `-suffix` is not a crossref.
        assert!(!is_crossref_key("algo", &custom));
        // Unrelated prefixes remain citations.
        assert!(!is_crossref_key("doe99", &custom));
    }

    // ---- Bare citations ---------------------------------------------------

    #[test]
    fn test_parse_bare_citation_simple() {
        assert_eq!(try_parse_bare_citation("@doe99"), Some((6, "doe99", false)));
    }

    #[test]
    fn test_parse_bare_citation_with_suppress() {
        assert_eq!(
            try_parse_bare_citation("-@smith04"),
            Some((9, "smith04", true))
        );
    }

    #[test]
    fn test_parse_bare_citation_with_trailing_text() {
        assert_eq!(
            try_parse_bare_citation("@doe99 says"),
            Some((6, "doe99", false))
        );
    }

    #[test]
    fn test_parse_bare_citation_braced_key() {
        assert_eq!(
            try_parse_bare_citation("@{https://example.com}"),
            Some((22, "{https://example.com}", false))
        );
    }

    #[test]
    fn test_parse_bare_citation_not_citation() {
        for input in ["not a citation", "@"] {
            assert_eq!(try_parse_bare_citation(input), None, "input: {input:?}");
        }
    }

    // ---- Bracketed citations ----------------------------------------------

    #[test]
    fn test_parse_bracketed_citation_simple() {
        assert_eq!(
            try_parse_bracketed_citation("[@doe99]"),
            Some((8, "@doe99"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_multiple() {
        assert_eq!(
            try_parse_bracketed_citation("[@doe99; @smith2000]"),
            Some((20, "@doe99; @smith2000"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_with_prefix() {
        assert_eq!(
            try_parse_bracketed_citation("[see @doe99]"),
            Some((12, "see @doe99"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_with_locator() {
        assert_eq!(
            try_parse_bracketed_citation("[@doe99, pp. 33-35]"),
            Some((19, "@doe99, pp. 33-35"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_complex() {
        assert_eq!(
            try_parse_bracketed_citation("[see @doe99, pp. 33-35 and *passim*]"),
            Some((36, "see @doe99, pp. 33-35 and *passim*"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_with_suppress() {
        assert_eq!(
            try_parse_bracketed_citation("[-@doe99]"),
            Some((9, "-@doe99"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_not_citation() {
        // A regular link or plain bracketed text is not a citation.
        for input in ["[text](url)", "[just text]"] {
            assert_eq!(try_parse_bracketed_citation(input), None, "input: {input:?}");
        }
    }

    #[test]
    fn test_parse_bracketed_citation_nested_brackets() {
        assert_eq!(
            try_parse_bracketed_citation("[see [nested] @doe99]"),
            Some((21, "see [nested] @doe99"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_escaped_bracket() {
        assert_eq!(
            try_parse_bracketed_citation(r"[@doe99 with \] escaped]"),
            Some((24, r"@doe99 with \] escaped"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_paren_in_prefix() {
        // Pandoc treats parens in the citation prefix as ordinary text;
        // they must not abort citation detection.
        assert_eq!(
            try_parse_bracketed_citation("[see (Smith 1999) and @doe99]"),
            Some((29, "see (Smith 1999) and @doe99"))
        );
    }

    #[test]
    fn test_bracketed_citation_ignores_at_in_code_span() {
        // `@foo` inside a code span is verbatim, so [`@foo`] is a link label,
        // not a citation (matches pandoc).
        assert_eq!(try_parse_bracketed_citation("[`@foo`]"), None);
    }

    #[test]
    fn test_bracketed_citation_code_span_in_prefix() {
        // A code span may appear in the citation prefix; @ inside it is
        // verbatim and the real @key follows.
        let parsed = try_parse_bracketed_citation("[`x@y` @doe99]");
        assert_eq!(parsed, Some((14, "`x@y` @doe99")));
    }

    #[test]
    fn test_bracketed_citation_bracket_in_code_span() {
        // A `]` inside a code span does not terminate the bracket.
        let parsed = try_parse_bracketed_citation("[`a]b` @doe99]");
        assert_eq!(parsed, Some((14, "`a]b` @doe99")));
    }

    #[test]
    fn test_bracketed_citation_unterminated_backtick() {
        // An unterminated backtick run is literal, so @foo is still a citation.
        let parsed = try_parse_bracketed_citation("[`@foo bar]");
        assert_eq!(parsed, Some((11, "`@foo bar")));
    }

    #[test]
    fn test_parse_bracketed_citation_escaped_at_in_prefix() {
        // Pandoc accepts \@ref(label) inside the citation prefix without
        // mistaking it for a citation marker; the actual citation is the
        // unescaped @key that follows.
        let input = r"[see also \@ref(svm) and @bischl_applied_2024]";
        assert_eq!(
            try_parse_bracketed_citation(input),
            Some((46, r"see also \@ref(svm) and @bischl_applied_2024"))
        );
    }
}