Skip to main content

panache_parser/parser/inlines/
citations.rs

1//! Citation parsing for Pandoc's citations extension.
2//!
3//! Syntax:
4//! - Bracketed: `[@doe99]`, `[@doe99; @smith2000]`
5//! - With locator: `[see @doe99, pp. 33-35]`
6//! - Suppress author: `[-@doe99]`
7//! - Author-in-text: `@doe99` (bare, without brackets)
8
9use super::sink::InlineSink;
10use crate::syntax::SyntaxKind;
11
12/// Try to parse a bracketed citation starting at the current position.
13/// Returns Some((length, content)) if successful, None otherwise.
14///
15/// Bracketed citations have the syntax: [@key], [@key1; @key2], [see @key, pp. 1-10]
16pub(crate) fn try_parse_bracketed_citation(text: &str) -> Option<(usize, &str)> {
17    let bytes = text.as_bytes();
18
19    // Must start with [
20    if bytes.is_empty() || bytes[0] != b'[' {
21        return None;
22    }
23
24    // Look ahead to see if this contains a citation marker (@)
25    // We need to distinguish from regular links
26    let mut has_citation = false;
27    let mut pos = 1;
28    let mut bracket_depth = 0;
29
30    while pos < bytes.len() {
31        match bytes[pos] {
32            b'\\' => {
33                // Skip escaped character
34                pos += 2;
35                continue;
36            }
37            b'`' => {
38                // Skip verbatim code spans; markers inside don't count.
39                match code_span_end(bytes, pos) {
40                    Some(end) => pos = end,
41                    None => pos += 1,
42                }
43            }
44            b'[' => {
45                bracket_depth += 1;
46                pos += 1;
47            }
48            b']' => {
49                if bracket_depth == 0 {
50                    // Closing bracket of main citation - stop looking
51                    break;
52                }
53                bracket_depth -= 1;
54                pos += 1;
55            }
56            b'@' => {
57                // Found a citation marker - this is likely a citation
58                has_citation = true;
59                break;
60            }
61            _ => {
62                pos += 1;
63            }
64        }
65    }
66
67    if !has_citation {
68        return None;
69    }
70
71    // Now find the closing bracket
72    pos = 1;
73    bracket_depth = 1;
74
75    while pos < bytes.len() {
76        match bytes[pos] {
77            b'\\' => {
78                // Skip escaped character
79                pos += 2;
80                continue;
81            }
82            b'`' => {
83                // Skip verbatim code spans; brackets inside don't close.
84                match code_span_end(bytes, pos) {
85                    Some(end) => pos = end,
86                    None => pos += 1,
87                }
88            }
89            b'[' => {
90                bracket_depth += 1;
91                pos += 1;
92            }
93            b']' => {
94                bracket_depth -= 1;
95                if bracket_depth == 0 {
96                    // Found the closing bracket
97                    let content = &text[1..pos];
98                    return Some((pos + 1, content));
99                }
100                pos += 1;
101            }
102            _ => {
103                pos += 1;
104            }
105        }
106    }
107
108    // No closing bracket found
109    None
110}
111
112/// Try to parse a bare citation (author-in-text) starting at the current position.
113/// Returns Some((length, key, has_suppress)) if successful, None otherwise.
114///
115/// Bare citations have the syntax: @key or -@key
116pub(crate) fn try_parse_bare_citation(text: &str) -> Option<(usize, &str, bool)> {
117    let bytes = text.as_bytes();
118
119    if bytes.is_empty() {
120        return None;
121    }
122
123    let mut pos = 0;
124    let has_suppress = bytes[pos] == b'-';
125
126    if has_suppress {
127        pos += 1;
128        if pos >= bytes.len() {
129            return None;
130        }
131    }
132
133    // Must have @ next
134    if bytes[pos] != b'@' {
135        return None;
136    }
137    pos += 1;
138
139    if pos >= bytes.len() {
140        return None;
141    }
142
143    // Parse the citation key
144    let key_start = pos;
145    let key_len = parse_citation_key(&text[pos..])?;
146
147    if key_len == 0 {
148        return None;
149    }
150
151    let total_len = pos + key_len;
152    let key = &text[key_start..total_len];
153
154    Some((total_len, key, has_suppress))
155}
156
/// Report whether `key` has the shape of a Quarto cross-reference key
/// (e.g. `fig-plot`, `eq-energy`).
///
/// A key qualifies when the text before its first `-` is one of the known
/// reference-type prefixes (compared ASCII case-insensitively) and at least
/// one character follows that `-`. Keys without a `-`, or with nothing after
/// it, are rejected.
///
/// The check borrows from `key` and performs no allocation.
pub fn is_quarto_crossref_key(key: &str) -> bool {
    /// Reference-type prefixes, all lowercase.
    const PREFIXES: &[&str] = &[
        "fig", "tbl", "lst", "tip", "nte", "wrn", "imp", "cau", "thm", "lem", "cor", "prp",
        "cnj", "def", "exm", "exr", "sol", "rem", "alg", "eq", "sec",
    ];

    let Some((prefix, rest)) = key.split_once('-') else {
        return false;
    };

    !rest.is_empty()
        && PREFIXES
            .iter()
            .any(|known| prefix.eq_ignore_ascii_case(known))
}
191
/// Label-type prefixes recognised for bookdown-style labels.
///
/// `is_bookdown_label` tests for exact membership in this list, and
/// `has_bookdown_prefix` accepts labels of the form `prefix:rest` whose
/// `prefix` appears here. Entries are lowercase and compared case-sensitively.
pub const BOOKDOWN_LABEL_PREFIXES: &[&str] = &[
    "eq", "fig", "tab", "thm", "lem", "cor", "prp", "cnj", "def", "exm", "exr", "sol", "rem",
    "alg", "sec", "hyp",
];
196
197pub fn is_bookdown_label(label: &str) -> bool {
198    BOOKDOWN_LABEL_PREFIXES.contains(&label)
199}
200
201pub fn has_bookdown_prefix(label: &str) -> bool {
202    let mut parts = label.splitn(2, ':');
203    let prefix = parts.next().unwrap_or("");
204    let rest = parts.next().unwrap_or("");
205    if rest.is_empty() {
206        return false;
207    }
208    is_bookdown_label(prefix)
209}
210
211pub(crate) fn emit_crossref(builder: &mut impl InlineSink, key: &str, has_suppress: bool) {
212    builder.start_node(SyntaxKind::CROSSREF.into());
213
214    if has_suppress {
215        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "-@");
216    } else {
217        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "@");
218    }
219
220    if key.starts_with('{') && key.ends_with('}') {
221        builder.token(SyntaxKind::CROSSREF_BRACE_OPEN.into(), "{");
222        builder.token(SyntaxKind::CROSSREF_KEY.into(), &key[1..key.len() - 1]);
223        builder.token(SyntaxKind::CROSSREF_BRACE_CLOSE.into(), "}");
224    } else {
225        builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
226    }
227
228    builder.finish_node();
229}
230
231pub(crate) fn emit_bookdown_crossref(builder: &mut impl InlineSink, key: &str) {
232    builder.start_node(SyntaxKind::CROSSREF.into());
233    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_OPEN.into(), "\\@ref(");
234    builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
235    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_CLOSE.into(), ")");
236    builder.finish_node();
237}
238
239/// Parse a citation key following Pandoc's rules.
240/// Returns the length of the key, or None if invalid.
241///
242/// Citation keys:
243/// - Must start with letter, digit, or _
244/// - Can contain alphanumerics and single internal punctuation: :.#$%&-+?<>~/
245/// - Keys in braces @{...} can contain anything
246/// - Double internal punctuation terminates key
247/// - Trailing punctuation not included
248fn parse_citation_key(text: &str) -> Option<usize> {
249    if text.is_empty() {
250        return None;
251    }
252
253    // Check for braced key: @{...}
254    if text.starts_with('{') {
255        // Find matching closing brace
256        let mut escape_next = false;
257
258        for (idx, ch) in text.char_indices().skip(1) {
259            if escape_next {
260                escape_next = false;
261                continue;
262            }
263
264            match ch {
265                '\\' => escape_next = true,
266                '}' => return Some(idx + ch.len_utf8()),
267                _ => {}
268            }
269        }
270
271        // No closing brace found
272        return None;
273    }
274
275    // Regular key: must start with letter, digit, or _
276    let mut iter = text.char_indices();
277    let (_, first_char) = iter.next()?;
278    if !first_char.is_alphanumeric() && first_char != '_' {
279        return None;
280    }
281
282    let mut last_alnum_end = first_char.len_utf8();
283    let mut last_included_end = last_alnum_end;
284    let mut last_punct_start: Option<usize> = None;
285    let mut prev_was_punct = false;
286
287    for (idx, ch) in iter {
288        if ch.is_alphanumeric() || ch == '_' {
289            prev_was_punct = false;
290            last_alnum_end = idx + ch.len_utf8();
291            last_included_end = last_alnum_end;
292            last_punct_start = None;
293        } else if is_internal_punctuation(ch) {
294            // Check if previous was also punctuation (double punct terminates)
295            if prev_was_punct {
296                // Double punctuation - terminate before the first punctuation
297                return Some(last_punct_start.unwrap_or(last_alnum_end));
298            }
299            prev_was_punct = true;
300            last_punct_start = Some(idx);
301            last_included_end = idx + ch.len_utf8();
302        } else {
303            // Not a valid key character - terminate here
304            break;
305        }
306    }
307
308    if prev_was_punct {
309        return Some(last_alnum_end);
310    }
311
312    if last_included_end == 0 {
313        None
314    } else {
315        Some(last_included_end)
316    }
317}
318
/// Locate the end of the backtick code span that opens at `bytes[pos]`.
///
/// The opener is the maximal run of backticks starting at `pos`. The span is
/// closed by the first later maximal run of backticks with exactly the same
/// length; runs of any other length are skipped as part of the span's
/// content. Returns the index just past the closing run, or `None` when no
/// run of equal length follows, in which case the backticks are literal text.
///
/// Code spans are verbatim, so citation markers (`@`), separators (`;`), and
/// brackets (`]`) inside them must not influence citation detection — this
/// matches pandoc, which parses `` [`@foo`] `` as a link, not a citation.
fn code_span_end(bytes: &[u8], pos: usize) -> Option<usize> {
    // Length of the maximal backtick run starting at `from`.
    let backtick_run = |from: usize| {
        bytes
            .iter()
            .skip(from)
            .take_while(|&&byte| byte == b'`')
            .count()
    };

    let opener_len = backtick_run(pos);
    let mut cursor = pos + opener_len;

    while cursor < bytes.len() {
        if bytes[cursor] != b'`' {
            cursor += 1;
            continue;
        }

        // Consume the whole run before comparing, so a longer run can never
        // be mistaken for a shorter closer.
        let closer_len = backtick_run(cursor);
        cursor += closer_len;
        if closer_len == opener_len {
            return Some(cursor);
        }
    }

    None
}
350
/// Report whether `ch` is one of the punctuation characters that may appear
/// *inside* a plain citation key: `: . # $ % & - + ? < > ~ /`.
fn is_internal_punctuation(ch: char) -> bool {
    const INTERNAL_PUNCTUATION: &str = ":.#$%&-+?<>~/";
    INTERNAL_PUNCTUATION.contains(ch)
}
358
/// Emit a bracketed citation node to the builder.
///
/// Wraps everything in a single `CITATION` node: a `[` token, the tokens
/// produced for `content` by `emit_bracketed_citation_content`, and a `]`
/// token. `content` is expected to be the text between the brackets (without
/// the brackets themselves), as returned by `try_parse_bracketed_citation`.
pub(crate) fn emit_bracketed_citation(builder: &mut impl InlineSink, content: &str) {
    builder.start_node(SyntaxKind::CITATION.into());

    // Opening bracket
    builder.token(SyntaxKind::LINK_START.into(), "[");

    // Emit prefix + citations + suffix with fine-grained tokens.
    emit_bracketed_citation_content(builder, content);

    // Closing bracket
    // NOTE(review): the `]` is emitted with kind LINK_DEST, while the `[`
    // uses LINK_START — confirm this token kind is intended for the closer.
    builder.token(SyntaxKind::LINK_DEST.into(), "]");

    builder.finish_node();
}
374
375fn emit_bracketed_citation_content(builder: &mut impl InlineSink, content: &str) {
376    let mut text_start = 0;
377    let mut iter = content.char_indices().peekable();
378
379    while let Some((idx, ch)) = iter.next() {
380        // Backslash escapes (e.g. `\@`, `\[`, `\]`) suppress citation/separator
381        // recognition for the following character — matching Pandoc, which
382        // treats the escape as a literal in the citation prefix/suffix.
383        if ch == '\\' {
384            iter.next();
385            continue;
386        }
387
388        if ch == '`'
389            && let Some(end) = code_span_end(content.as_bytes(), idx)
390        {
391            // Verbatim code span: leave its bytes in the pending
392            // CITATION_CONTENT run and skip markers/separators inside it.
393            while matches!(iter.peek(), Some((next_idx, _)) if *next_idx < end) {
394                iter.next();
395            }
396            continue;
397        }
398
399        if ch == '@' || (ch == '-' && matches!(iter.peek(), Some((_, '@')))) {
400            if idx > text_start {
401                builder.token(
402                    SyntaxKind::CITATION_CONTENT.into(),
403                    &content[text_start..idx],
404                );
405            }
406
407            let mut marker_len = 1;
408            let marker_text = if ch == '-' {
409                iter.next();
410                marker_len = 2;
411                "-@"
412            } else {
413                "@"
414            };
415            builder.token(SyntaxKind::CITATION_MARKER.into(), marker_text);
416
417            let key_start = idx + marker_len;
418            if key_start >= content.len() {
419                text_start = key_start;
420                continue;
421            }
422
423            if let Some(key_len) = parse_citation_key(&content[key_start..]) {
424                let key_end = key_start + key_len;
425                let key = &content[key_start..key_end];
426                if key.starts_with('{') && key.ends_with('}') {
427                    builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
428                    if key.len() > 2 {
429                        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
430                    }
431                    builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
432                } else {
433                    builder.token(SyntaxKind::CITATION_KEY.into(), key);
434                }
435                while matches!(iter.peek(), Some((next_idx, _)) if *next_idx < key_end) {
436                    iter.next();
437                }
438                text_start = key_end;
439                continue;
440            }
441
442            text_start = key_start;
443            continue;
444        }
445
446        if ch == ';' {
447            if idx > text_start {
448                builder.token(
449                    SyntaxKind::CITATION_CONTENT.into(),
450                    &content[text_start..idx],
451                );
452            }
453            builder.token(SyntaxKind::CITATION_SEPARATOR.into(), ";");
454            text_start = idx + ch.len_utf8();
455            continue;
456        }
457    }
458
459    if text_start < content.len() {
460        builder.token(SyntaxKind::CITATION_CONTENT.into(), &content[text_start..]);
461    }
462}
463
464/// Emit a bare citation node to the builder.
465pub(crate) fn emit_bare_citation(builder: &mut impl InlineSink, key: &str, has_suppress: bool) {
466    builder.start_node(SyntaxKind::CITATION.into());
467
468    // Emit marker (@ or -@)
469    if has_suppress {
470        builder.token(SyntaxKind::CITATION_MARKER.into(), "-@");
471    } else {
472        builder.token(SyntaxKind::CITATION_MARKER.into(), "@");
473    }
474
475    // Check if key is braced
476    if key.starts_with('{') && key.ends_with('}') {
477        builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
478        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
479        builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
480    } else {
481        builder.token(SyntaxKind::CITATION_KEY.into(), key);
482    }
483
484    builder.finish_node();
485}
486
// Unit tests for the pure parsing helpers in this module: key parsing, bare
// citations, and bracketed citations. The `emit_*` functions are not
// exercised here.
#[cfg(test)]
mod tests {
    use super::*;

    // Citation key parsing tests
    #[test]
    fn test_parse_simple_citation_key() {
        assert_eq!(parse_citation_key("doe99"), Some(5));
        assert_eq!(parse_citation_key("smith2000"), Some(9));
    }

    #[test]
    fn test_parse_citation_key_with_internal_punct() {
        // Single punctuation between key characters is part of the key.
        assert_eq!(parse_citation_key("Foo_bar.baz"), Some(11));
        assert_eq!(parse_citation_key("author:2020"), Some(11));
    }

    #[test]
    fn test_parse_citation_key_trailing_punct() {
        // Trailing punctuation should be excluded
        assert_eq!(parse_citation_key("Foo_bar.baz."), Some(11));
        assert_eq!(parse_citation_key("key:value:"), Some(9));
    }

    #[test]
    fn test_parse_citation_key_double_punct() {
        // Double punctuation terminates key
        assert_eq!(parse_citation_key("Foo_bar--baz"), Some(7)); // key is "Foo_bar"
    }

    #[test]
    fn test_parse_citation_key_with_braces() {
        // The returned length includes both braces.
        assert_eq!(parse_citation_key("{https://example.com}"), Some(21));
        assert_eq!(parse_citation_key("{Foo_bar.baz.}"), Some(14));
    }

    #[test]
    fn test_parse_citation_key_invalid_start() {
        // A key cannot begin with punctuation.
        assert_eq!(parse_citation_key(".invalid"), None);
        assert_eq!(parse_citation_key(":invalid"), None);
    }

    #[test]
    fn test_parse_citation_key_stops_at_space() {
        assert_eq!(parse_citation_key("key rest"), Some(3));
    }

    // Bare citation parsing tests
    #[test]
    fn test_parse_bare_citation_simple() {
        let result = try_parse_bare_citation("@doe99");
        assert_eq!(result, Some((6, "doe99", false)));
    }

    #[test]
    fn test_parse_bare_citation_with_suppress() {
        let result = try_parse_bare_citation("-@smith04");
        assert_eq!(result, Some((9, "smith04", true)));
    }

    #[test]
    fn test_parse_bare_citation_with_trailing_text() {
        let result = try_parse_bare_citation("@doe99 says");
        assert_eq!(result, Some((6, "doe99", false)));
    }

    #[test]
    fn test_parse_bare_citation_braced_key() {
        // The returned key keeps its braces.
        let result = try_parse_bare_citation("@{https://example.com}");
        assert_eq!(result, Some((22, "{https://example.com}", false)));
    }

    #[test]
    fn test_parse_bare_citation_not_citation() {
        assert_eq!(try_parse_bare_citation("not a citation"), None);
        assert_eq!(try_parse_bare_citation("@"), None);
    }

    // Bracketed citation parsing tests
    #[test]
    fn test_parse_bracketed_citation_simple() {
        let result = try_parse_bracketed_citation("[@doe99]");
        assert_eq!(result, Some((8, "@doe99")));
    }

    #[test]
    fn test_parse_bracketed_citation_multiple() {
        let result = try_parse_bracketed_citation("[@doe99; @smith2000]");
        assert_eq!(result, Some((20, "@doe99; @smith2000")));
    }

    #[test]
    fn test_parse_bracketed_citation_with_prefix() {
        let result = try_parse_bracketed_citation("[see @doe99]");
        assert_eq!(result, Some((12, "see @doe99")));
    }

    #[test]
    fn test_parse_bracketed_citation_with_locator() {
        let result = try_parse_bracketed_citation("[@doe99, pp. 33-35]");
        assert_eq!(result, Some((19, "@doe99, pp. 33-35")));
    }

    #[test]
    fn test_parse_bracketed_citation_complex() {
        let result = try_parse_bracketed_citation("[see @doe99, pp. 33-35 and *passim*]");
        assert_eq!(result, Some((36, "see @doe99, pp. 33-35 and *passim*")));
    }

    #[test]
    fn test_parse_bracketed_citation_with_suppress() {
        let result = try_parse_bracketed_citation("[-@doe99]");
        assert_eq!(result, Some((9, "-@doe99")));
    }

    #[test]
    fn test_parse_bracketed_citation_not_citation() {
        // Regular link should not be parsed as citation
        assert_eq!(try_parse_bracketed_citation("[text](url)"), None);
        assert_eq!(try_parse_bracketed_citation("[just text]"), None);
    }

    #[test]
    fn test_parse_bracketed_citation_nested_brackets() {
        // Inner brackets are balanced and do not end the citation.
        let result = try_parse_bracketed_citation("[see [nested] @doe99]");
        assert_eq!(result, Some((21, "see [nested] @doe99")));
    }

    #[test]
    fn test_parse_bracketed_citation_escaped_bracket() {
        // An escaped `]` does not close the citation.
        let result = try_parse_bracketed_citation(r"[@doe99 with \] escaped]");
        assert_eq!(result, Some((24, r"@doe99 with \] escaped")));
    }

    #[test]
    fn test_parse_bracketed_citation_paren_in_prefix() {
        // Pandoc treats parens in the citation prefix as ordinary text;
        // they must not abort citation detection.
        let result = try_parse_bracketed_citation("[see (Smith 1999) and @doe99]");
        assert_eq!(result, Some((29, "see (Smith 1999) and @doe99")));
    }

    #[test]
    fn test_bracketed_citation_ignores_at_in_code_span() {
        // `@foo` inside a code span is verbatim, so [`@foo`] is a link label,
        // not a citation (matches pandoc).
        assert_eq!(try_parse_bracketed_citation("[`@foo`]"), None);
    }

    #[test]
    fn test_bracketed_citation_code_span_in_prefix() {
        // A code span may appear in the citation prefix; @ inside it is verbatim
        // and the real @key follows.
        assert_eq!(
            try_parse_bracketed_citation("[`x@y` @doe99]"),
            Some((14, "`x@y` @doe99"))
        );
    }

    #[test]
    fn test_bracketed_citation_bracket_in_code_span() {
        // A `]` inside a code span does not terminate the bracket.
        assert_eq!(
            try_parse_bracketed_citation("[`a]b` @doe99]"),
            Some((14, "`a]b` @doe99"))
        );
    }

    #[test]
    fn test_bracketed_citation_unterminated_backtick() {
        // An unterminated backtick run is literal, so @foo is still a citation.
        assert_eq!(
            try_parse_bracketed_citation("[`@foo bar]"),
            Some((11, "`@foo bar"))
        );
    }

    #[test]
    fn test_parse_bracketed_citation_escaped_at_in_prefix() {
        // Pandoc accepts \@ref(label) inside the citation prefix without
        // mistaking it for a citation marker; the actual citation is the
        // unescaped @key that follows.
        let result =
            try_parse_bracketed_citation(r"[see also \@ref(svm) and @bischl_applied_2024]");
        assert_eq!(
            result,
            Some((46, r"see also \@ref(svm) and @bischl_applied_2024"))
        );
    }
}