Skip to main content

panache_parser/parser/inlines/
citations.rs

1//! Citation parsing for Pandoc's citations extension.
2//!
3//! Syntax:
4//! - Bracketed: `[@doe99]`, `[@doe99; @smith2000]`
5//! - With locator: `[see @doe99, pp. 33-35]`
6//! - Suppress author: `[-@doe99]`
7//! - Author-in-text: `@doe99` (bare, without brackets)
8
9use crate::syntax::SyntaxKind;
10use rowan::GreenNodeBuilder;
11
/// Try to parse a bracketed citation at the start of `text`.
///
/// Returns `Some((length, content))` on success, where `length` is the number
/// of bytes consumed (both brackets included) and `content` is the text between
/// the outer brackets. Returns `None` otherwise.
///
/// Bracketed citations have the syntax: `[@key]`, `[@key1; @key2]`,
/// `[see @key, pp. 1-10]`.
///
/// A span is accepted when all of the following hold:
/// - `text` starts with `[`;
/// - a matching `]` exists (nested brackets must balance, and a backslash
///   escapes the byte that follows it);
/// - at least one unescaped `@` occurs before that closing bracket.
///
/// Parentheses inside the brackets do not affect recognition, so a prefix such
/// as `[see (also) @key]` is a citation. A plain link like `[text](url)` is
/// still rejected because its bracketed part holds no `@`.
pub(crate) fn try_parse_bracketed_citation(text: &str) -> Option<(usize, &str)> {
    let bytes = text.as_bytes();

    // Must start with [
    if bytes.first() != Some(&b'[') {
        return None;
    }

    // Depth 1 accounts for the opening bracket already consumed.
    let mut depth: usize = 1;
    let mut saw_marker = false;
    let mut pos = 1;

    while pos < bytes.len() {
        match bytes[pos] {
            b'\\' => {
                // Skip the escaped byte. Landing inside a multi-byte character
                // is harmless: continuation bytes never match an ASCII arm.
                pos += 2;
                continue;
            }
            b'[' => depth += 1,
            b']' => {
                depth -= 1;
                if depth == 0 {
                    // Closing bracket of the outer span. Both slice bounds sit
                    // next to ASCII bytes, so they are valid char boundaries.
                    return if saw_marker {
                        Some((pos + 1, &text[1..pos]))
                    } else {
                        None
                    };
                }
            }
            b'@' => saw_marker = true,
            _ => {}
        }
        pos += 1;
    }

    // No closing bracket found
    None
}
102
103/// Try to parse a bare citation (author-in-text) starting at the current position.
104/// Returns Some((length, key, has_suppress)) if successful, None otherwise.
105///
106/// Bare citations have the syntax: @key or -@key
107pub(crate) fn try_parse_bare_citation(text: &str) -> Option<(usize, &str, bool)> {
108    let bytes = text.as_bytes();
109
110    if bytes.is_empty() {
111        return None;
112    }
113
114    let mut pos = 0;
115    let has_suppress = bytes[pos] == b'-';
116
117    if has_suppress {
118        pos += 1;
119        if pos >= bytes.len() {
120            return None;
121        }
122    }
123
124    // Must have @ next
125    if bytes[pos] != b'@' {
126        return None;
127    }
128    pos += 1;
129
130    if pos >= bytes.len() {
131        return None;
132    }
133
134    // Parse the citation key
135    let key_start = pos;
136    let key_len = parse_citation_key(&text[pos..])?;
137
138    if key_len == 0 {
139        return None;
140    }
141
142    let total_len = pos + key_len;
143    let key = &text[key_start..total_len];
144
145    Some((total_len, key, has_suppress))
146}
147
/// Report whether `key` looks like a Quarto cross-reference key
/// (e.g. `fig-plot`, `eq-energy`).
///
/// The key is split at its first `-`. It qualifies when the part before the
/// dash is one of the known type prefixes (compared ASCII case-insensitively)
/// and the part after the dash is non-empty.
pub fn is_quarto_crossref_key(key: &str) -> bool {
    const PREFIXES: &[&str] = &[
        "fig", "tbl", "lst", "tip", "nte", "wrn", "imp", "cau", "thm", "lem", "cor", "prp",
        "cnj", "def", "exm", "exr", "sol", "rem", "alg", "eq", "sec",
    ];

    match key.split_once('-') {
        Some((prefix, rest)) if !rest.is_empty() => PREFIXES
            .iter()
            .any(|known| prefix.eq_ignore_ascii_case(known)),
        // No dash at all, or nothing after the dash.
        _ => false,
    }
}
182
/// Label-type prefixes accepted by [`is_bookdown_label`].
///
/// NOTE(review): presumably these are the type prefixes bookdown puts before
/// the `:` in labels such as `fig:plot` — confirm against the callers.
pub const BOOKDOWN_LABEL_PREFIXES: &[&str] = &[
    "eq", "fig", "tab", "thm", "lem", "cor", "prp", "cnj", "def", "exm", "exr", "sol", "rem",
    "alg", "sec", "hyp",
];

/// Report whether `label` is exactly one of [`BOOKDOWN_LABEL_PREFIXES`].
///
/// The comparison is case-sensitive and matches the whole string.
pub fn is_bookdown_label(label: &str) -> bool {
    BOOKDOWN_LABEL_PREFIXES
        .iter()
        .any(|&prefix| prefix == label)
}
191
192pub fn has_bookdown_prefix(label: &str) -> bool {
193    let mut parts = label.splitn(2, ':');
194    let prefix = parts.next().unwrap_or("");
195    let rest = parts.next().unwrap_or("");
196    if rest.is_empty() {
197        return false;
198    }
199    is_bookdown_label(prefix)
200}
201
202pub(crate) fn emit_crossref(builder: &mut GreenNodeBuilder, key: &str, has_suppress: bool) {
203    builder.start_node(SyntaxKind::CROSSREF.into());
204
205    if has_suppress {
206        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "-@");
207    } else {
208        builder.token(SyntaxKind::CROSSREF_MARKER.into(), "@");
209    }
210
211    if key.starts_with('{') && key.ends_with('}') {
212        builder.token(SyntaxKind::CROSSREF_BRACE_OPEN.into(), "{");
213        builder.token(SyntaxKind::CROSSREF_KEY.into(), &key[1..key.len() - 1]);
214        builder.token(SyntaxKind::CROSSREF_BRACE_CLOSE.into(), "}");
215    } else {
216        builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
217    }
218
219    builder.finish_node();
220}
221
222pub(crate) fn emit_bookdown_crossref(builder: &mut GreenNodeBuilder, key: &str) {
223    builder.start_node(SyntaxKind::CROSSREF.into());
224    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_OPEN.into(), "\\@ref(");
225    builder.token(SyntaxKind::CROSSREF_KEY.into(), key);
226    builder.token(SyntaxKind::CROSSREF_BOOKDOWN_CLOSE.into(), ")");
227    builder.finish_node();
228}
229
/// Parse a citation key following Pandoc's rules.
///
/// `text` is the input that follows the `@` marker. Returns the byte length of
/// the key at the start of `text`, or `None` if no valid key starts there.
///
/// Citation keys:
/// - Must start with a letter, digit, or `_`
/// - Can contain alphanumerics, `_`, and single internal punctuation:
///   `:.#$%&-+?<>~/`
/// - Keys in braces `{...}` can contain anything. The returned length includes
///   both braces, a backslash escapes the next character, and an unclosed
///   brace yields `None`
/// - Double internal punctuation terminates the key
/// - Trailing punctuation is not included
fn parse_citation_key(text: &str) -> Option<usize> {
    // Braced key: scan for the first unescaped closing brace.
    if text.starts_with('{') {
        let mut escaped = false;

        for (idx, ch) in text.char_indices().skip(1) {
            if escaped {
                escaped = false;
                continue;
            }

            match ch {
                '\\' => escaped = true,
                '}' => return Some(idx + ch.len_utf8()),
                _ => {}
            }
        }

        // No closing brace found
        return None;
    }

    // Regular key: must start with letter, digit, or _ (empty input has no
    // first character and is rejected here as well).
    let mut chars = text.char_indices();
    let (_, first) = chars.next()?;
    if !first.is_alphanumeric() && first != '_' {
        return None;
    }

    // The key always ends right after the last alphanumeric/underscore seen.
    // A punctuation character only stays in the key if another such character
    // follows it.
    let mut key_end = first.len_utf8();
    let mut pending_punct = false;

    for (idx, ch) in chars {
        if ch.is_alphanumeric() || ch == '_' {
            key_end = idx + ch.len_utf8();
            pending_punct = false;
        } else if is_internal_punctuation(ch) && !pending_punct {
            // Tentatively accept a single punctuation character.
            pending_punct = true;
        } else {
            // Second punctuation in a row, or a character that cannot appear
            // in a key: the key stops at `key_end`.
            break;
        }
    }

    Some(key_end)
}

/// Check if a character is valid internal punctuation in citation keys.
fn is_internal_punctuation(ch: char) -> bool {
    matches!(
        ch,
        ':' | '.' | '#' | '$' | '%' | '&' | '-' | '+' | '?' | '<' | '>' | '~' | '/'
    )
}
317
318/// Emit a bracketed citation node to the builder.
319pub(crate) fn emit_bracketed_citation(builder: &mut GreenNodeBuilder, content: &str) {
320    builder.start_node(SyntaxKind::CITATION.into());
321
322    // Opening bracket
323    builder.token(SyntaxKind::LINK_START.into(), "[");
324
325    // Emit prefix + citations + suffix with fine-grained tokens.
326    emit_bracketed_citation_content(builder, content);
327
328    // Closing bracket
329    builder.token(SyntaxKind::LINK_DEST.into(), "]");
330
331    builder.finish_node();
332}
333
334fn emit_bracketed_citation_content(builder: &mut GreenNodeBuilder, content: &str) {
335    let mut text_start = 0;
336    let mut iter = content.char_indices().peekable();
337
338    while let Some((idx, ch)) = iter.next() {
339        if ch == '@' || (ch == '-' && matches!(iter.peek(), Some((_, '@')))) {
340            if idx > text_start {
341                builder.token(
342                    SyntaxKind::CITATION_CONTENT.into(),
343                    &content[text_start..idx],
344                );
345            }
346
347            let mut marker_len = 1;
348            let marker_text = if ch == '-' {
349                iter.next();
350                marker_len = 2;
351                "-@"
352            } else {
353                "@"
354            };
355            builder.token(SyntaxKind::CITATION_MARKER.into(), marker_text);
356
357            let key_start = idx + marker_len;
358            if key_start >= content.len() {
359                text_start = key_start;
360                continue;
361            }
362
363            if let Some(key_len) = parse_citation_key(&content[key_start..]) {
364                let key_end = key_start + key_len;
365                let key = &content[key_start..key_end];
366                if key.starts_with('{') && key.ends_with('}') {
367                    builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
368                    if key.len() > 2 {
369                        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
370                    }
371                    builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
372                } else {
373                    builder.token(SyntaxKind::CITATION_KEY.into(), key);
374                }
375                while matches!(iter.peek(), Some((next_idx, _)) if *next_idx < key_end) {
376                    iter.next();
377                }
378                text_start = key_end;
379                continue;
380            }
381
382            text_start = key_start;
383            continue;
384        }
385
386        if ch == ';' {
387            if idx > text_start {
388                builder.token(
389                    SyntaxKind::CITATION_CONTENT.into(),
390                    &content[text_start..idx],
391                );
392            }
393            builder.token(SyntaxKind::CITATION_SEPARATOR.into(), ";");
394            text_start = idx + ch.len_utf8();
395            continue;
396        }
397    }
398
399    if text_start < content.len() {
400        builder.token(SyntaxKind::CITATION_CONTENT.into(), &content[text_start..]);
401    }
402}
403
404/// Emit a bare citation node to the builder.
405pub(crate) fn emit_bare_citation(builder: &mut GreenNodeBuilder, key: &str, has_suppress: bool) {
406    builder.start_node(SyntaxKind::CITATION.into());
407
408    // Emit marker (@ or -@)
409    if has_suppress {
410        builder.token(SyntaxKind::CITATION_MARKER.into(), "-@");
411    } else {
412        builder.token(SyntaxKind::CITATION_MARKER.into(), "@");
413    }
414
415    // Check if key is braced
416    if key.starts_with('{') && key.ends_with('}') {
417        builder.token(SyntaxKind::CITATION_BRACE_OPEN.into(), "{");
418        builder.token(SyntaxKind::CITATION_KEY.into(), &key[1..key.len() - 1]);
419        builder.token(SyntaxKind::CITATION_BRACE_CLOSE.into(), "}");
420    } else {
421        builder.token(SyntaxKind::CITATION_KEY.into(), key);
422    }
423
424    builder.finish_node();
425}
426
#[cfg(test)]
mod tests {
    use super::*;

    /// Check `parse_citation_key` against a table of `(input, expected)` rows.
    fn check_keys(cases: &[(&str, Option<usize>)]) {
        for &(input, expected) in cases {
            assert_eq!(parse_citation_key(input), expected, "input: {:?}", input);
        }
    }

    /// Check `try_parse_bare_citation` against a table of `(input, expected)` rows.
    fn check_bare(cases: &[(&str, Option<(usize, &str, bool)>)]) {
        for &(input, expected) in cases {
            assert_eq!(
                try_parse_bare_citation(input),
                expected,
                "input: {:?}",
                input
            );
        }
    }

    /// Check `try_parse_bracketed_citation` against a table of `(input, expected)` rows.
    fn check_bracketed(cases: &[(&str, Option<(usize, &str)>)]) {
        for &(input, expected) in cases {
            assert_eq!(
                try_parse_bracketed_citation(input),
                expected,
                "input: {:?}",
                input
            );
        }
    }

    // Citation key parsing tests
    #[test]
    fn test_parse_simple_citation_key() {
        check_keys(&[("doe99", Some(5)), ("smith2000", Some(9))]);
    }

    #[test]
    fn test_parse_citation_key_with_internal_punct() {
        check_keys(&[("Foo_bar.baz", Some(11)), ("author:2020", Some(11))]);
    }

    #[test]
    fn test_parse_citation_key_trailing_punct() {
        // Trailing punctuation should be excluded
        check_keys(&[("Foo_bar.baz.", Some(11)), ("key:value:", Some(9))]);
    }

    #[test]
    fn test_parse_citation_key_double_punct() {
        // Double punctuation terminates key: the key is "Foo_bar"
        check_keys(&[("Foo_bar--baz", Some(7))]);
    }

    #[test]
    fn test_parse_citation_key_with_braces() {
        check_keys(&[
            ("{https://example.com}", Some(21)),
            ("{Foo_bar.baz.}", Some(14)),
        ]);
    }

    #[test]
    fn test_parse_citation_key_invalid_start() {
        check_keys(&[(".invalid", None), (":invalid", None)]);
    }

    #[test]
    fn test_parse_citation_key_stops_at_space() {
        check_keys(&[("key rest", Some(3))]);
    }

    // Bare citation parsing tests
    #[test]
    fn test_parse_bare_citation_simple() {
        check_bare(&[("@doe99", Some((6, "doe99", false)))]);
    }

    #[test]
    fn test_parse_bare_citation_with_suppress() {
        check_bare(&[("-@smith04", Some((9, "smith04", true)))]);
    }

    #[test]
    fn test_parse_bare_citation_with_trailing_text() {
        check_bare(&[("@doe99 says", Some((6, "doe99", false)))]);
    }

    #[test]
    fn test_parse_bare_citation_braced_key() {
        check_bare(&[(
            "@{https://example.com}",
            Some((22, "{https://example.com}", false)),
        )]);
    }

    #[test]
    fn test_parse_bare_citation_not_citation() {
        check_bare(&[("not a citation", None), ("@", None)]);
    }

    // Bracketed citation parsing tests
    #[test]
    fn test_parse_bracketed_citation_simple() {
        check_bracketed(&[("[@doe99]", Some((8, "@doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_multiple() {
        check_bracketed(&[("[@doe99; @smith2000]", Some((20, "@doe99; @smith2000")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_with_prefix() {
        check_bracketed(&[("[see @doe99]", Some((12, "see @doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_with_locator() {
        check_bracketed(&[("[@doe99, pp. 33-35]", Some((19, "@doe99, pp. 33-35")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_complex() {
        check_bracketed(&[(
            "[see @doe99, pp. 33-35 and *passim*]",
            Some((36, "see @doe99, pp. 33-35 and *passim*")),
        )]);
    }

    #[test]
    fn test_parse_bracketed_citation_with_suppress() {
        check_bracketed(&[("[-@doe99]", Some((9, "-@doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_not_citation() {
        // Regular link should not be parsed as citation
        check_bracketed(&[("[text](url)", None), ("[just text]", None)]);
    }

    #[test]
    fn test_parse_bracketed_citation_nested_brackets() {
        check_bracketed(&[("[see [nested] @doe99]", Some((21, "see [nested] @doe99")))]);
    }

    #[test]
    fn test_parse_bracketed_citation_escaped_bracket() {
        check_bracketed(&[(
            r"[@doe99 with \] escaped]",
            Some((24, r"@doe99 with \] escaped")),
        )]);
    }
}