// panache-parser 0.13.0

//! Reference definition and footnote parsing functions.
//!
//! Reference definitions have the form:
//! ```markdown
//! [label]: url "optional title"
//! [label]: url 'optional title'
//! [label]: url (optional title)
//! [label]: <url> "title"
//! ```
//!
//! Footnote definitions have the form:
//! ```markdown
//! [^id]: Footnote content here.
//!     Can continue on multiple lines
//!     as long as they're indented.
//! ```

/// Strict-mode entry point for recognizing a link reference definition at the
/// start of `text`.
///
/// On success the result is `Some((bytes_consumed, label, url, title))`;
/// `None` means `text` does not begin with a reference definition.
///
/// `text` is allowed to cover several lines: both the destination and the
/// title may be preceded by at most one newline (CommonMark §4.7). A blank
/// line ends the definition, so callers are expected to cut the input at the
/// first blank line to keep the parser from crossing it.
///
/// `dialect` toggles a CommonMark-only rule (§4.7): a title sitting on the
/// same line as the destination has to be separated from it by at least one
/// space or tab. Pandoc-markdown also accepts a directly attached title, e.g.
/// `[foo]: <bar>(baz)`.
///
/// Accepted shapes:
/// ```markdown
/// [label]: url "title"
/// [label]: <url> 'title'
/// [label]:
///   url
///   "title"
/// ```
pub fn try_parse_reference_definition(
    text: &str,
    dialect: crate::options::Dialect,
) -> Option<(usize, String, String, Option<String>)> {
    // Strict mode: only space/tab may follow the destination or title on
    // their line.
    let strict_eol = true;
    try_parse_reference_definition_with_mode(text, strict_eol, dialect)
}

/// Lax (Multimarkdown-flavored) entry point: same recognition as
/// [`try_parse_reference_definition`], except that extra content after the
/// title on the same line is tolerated (e.g.
/// `[ref]: /url "title" width=20px ...`). The MMD code path then goes on to
/// collect attribute-continuation lines itself.
pub fn try_parse_reference_definition_lax(
    text: &str,
    dialect: crate::options::Dialect,
) -> Option<(usize, String, String, Option<String>)> {
    // Lax mode: the rest of the line is swallowed whatever it contains.
    let strict_eol = false;
    try_parse_reference_definition_with_mode(text, strict_eol, dialect)
}

/// Shared implementation behind the strict and lax entry points.
///
/// Runs the span walker and then copies the label, destination and title out
/// of `text` as owned strings, dropping the surrounding delimiters (`[`/`]`
/// for the label, `<`/`>` for an angle-bracketed destination, and the quote
/// or paren pair of the title). Returns `None` whenever the walker does.
fn try_parse_reference_definition_with_mode(
    text: &str,
    strict_eol: bool,
    dialect: crate::options::Dialect,
) -> Option<(usize, String, String, Option<String>)> {
    /// Slice `text` to `outer` minus one delimiter byte on each side.
    fn without_delimiters<'t>(text: &'t str, outer: &std::ops::Range<usize>) -> &'t str {
        &text[outer.start + 1..outer.end - 1]
    }

    let found = reference_definition_spans(text, strict_eol, dialect)?;

    // The label sits between `[` (at `indent`) and the closing `]`.
    let label = String::from(&text[found.indent + 1..found.label_close]);

    let destination = if found.url_is_angle {
        without_delimiters(text, &found.url)
    } else {
        &text[found.url.clone()]
    };

    let title = match found.title.as_ref() {
        Some(outer) => Some(String::from(without_delimiters(text, outer))),
        None => None,
    };

    Some((found.consumed, label, String::from(destination), title))
}

/// Byte spans of a recognized reference definition, all relative to the `text`
/// passed to [`reference_definition_spans`].
///
/// This is the single source of truth shared by *detection*
/// (`try_parse_reference_definition`, which extracts the component strings)
/// and *emission* (`emit_reference_definition_lines`, which wraps the same
/// byte ranges in `REFERENCE_URL` / `REFERENCE_TITLE` CST nodes). Keeping both
/// phases on one walker is what prevents the detect/emit drift the dispatcher's
/// doc comment warns about.
///
/// Every offset already includes the leading indentation, so the fields can be
/// used to slice the original `text` directly.
#[derive(Debug, Clone)]
pub(crate) struct ReferenceSpans {
    /// Leading-space count before `[` (0..=3); also the byte index of `[`.
    pub indent: usize,
    /// Byte index of the label-closing `]`. The label text itself is
    /// `text[indent + 1..label_close]`.
    pub label_close: usize,
    /// Byte index of the `:` after the label.
    pub colon: usize,
    /// Destination byte range, *including* `<>` when angle-bracketed.
    pub url: std::ops::Range<usize>,
    /// Whether the destination is `<…>` angle-bracketed.
    pub url_is_angle: bool,
    /// Title byte range, *including* its quote/paren delimiters, when present.
    pub title: Option<std::ops::Range<usize>>,
    /// Total bytes consumed (matches the legacy `bytes_consumed`). Includes
    /// the line ending that terminates the definition when one is present.
    pub consumed: usize,
}

/// Scan a reference definition and record the byte spans of its components.
///
/// The walk is identical to the legacy string-returning parser — it just
/// records offsets instead of allocating component strings, so detection and
/// emission stay byte-for-byte consistent. See [`ReferenceSpans`].
///
/// Parameters:
/// - `text`: candidate input; may span several lines.
/// - `strict_eol`: when `true`, only space/tab may follow the destination or
///   title on its line; when `false`, the rest of that line is consumed
///   regardless of its content.
/// - `dialect`: enables the CommonMark-only "separator before title" rule.
///
/// Returns `None` when `text` does not start with a reference definition
/// (more than three leading spaces, missing `[`, footnote marker `[^`,
/// unterminated or blank label, missing `:`, missing/unterminated destination,
/// or trailing content that the selected mode does not allow).
pub(crate) fn reference_definition_spans(
    text: &str,
    strict_eol: bool,
    dialect: crate::options::Dialect,
) -> Option<ReferenceSpans> {
    // Only ASCII spaces are counted, so the char count equals the byte count.
    let leading_spaces = text.chars().take_while(|&c| c == ' ').count();
    if leading_spaces > 3 {
        return None;
    }
    // All `pos` values below index into `inner`/`bytes`; `leading_spaces` is
    // added back whenever an offset is stored in the result.
    let inner = &text[leading_spaces..];
    let bytes = inner.as_bytes();

    // Must start at beginning of line with [
    if bytes.is_empty() || bytes[0] != b'[' {
        return None;
    }

    // Check if it's a footnote definition [^id]: - not a reference definition
    if bytes.len() >= 2 && bytes[1] == b'^' {
        return None;
    }

    // Find the closing ] for the label. Labels may span lines (CommonMark
    // §4.7) but a blank line inside the label terminates the attempt. We also
    // reject unescaped `[` inside the label per spec.
    let mut pos = 1;
    let mut escape_next = false;

    while pos < bytes.len() {
        // The byte after a backslash is skipped without being interpreted.
        if escape_next {
            escape_next = false;
            pos += 1;
            continue;
        }

        match bytes[pos] {
            b'\\' => {
                escape_next = true;
                pos += 1;
            }
            b']' => {
                break;
            }
            b'[' => {
                return None;
            }
            b'\n' | b'\r' => {
                // Treat `\r\n` as a single line ending.
                let nl_end =
                    if bytes[pos] == b'\r' && pos + 1 < bytes.len() && bytes[pos + 1] == b'\n' {
                        pos + 2
                    } else {
                        pos + 1
                    };
                // Look ahead: if the next line is empty or whitespace-only
                // (or the input ends), the label is broken by a blank line.
                let mut probe = nl_end;
                while probe < bytes.len() && matches!(bytes[probe], b' ' | b'\t') {
                    probe += 1;
                }
                if probe >= bytes.len() || bytes[probe] == b'\n' || bytes[probe] == b'\r' {
                    return None;
                }
                pos = nl_end;
            }
            _ => {
                pos += 1;
            }
        }
    }

    // The loop also exits when the input runs out; that is an unterminated
    // label.
    if pos >= bytes.len() || bytes[pos] != b']' {
        return None;
    }

    // A label made only of whitespace is not a valid label.
    let label = &inner[1..pos];
    if label.trim().is_empty() {
        return None;
    }
    let label_close = leading_spaces + pos;

    pos += 1; // Skip ]

    // Must be followed by :
    if pos >= bytes.len() || bytes[pos] != b':' {
        return None;
    }
    let colon = leading_spaces + pos;
    pos += 1;

    // Skip ws + at most one newline + ws to the URL.
    pos = skip_ws_one_newline(bytes, pos)?;

    // Parse URL
    let url_start = pos;
    let url_is_angle = pos < bytes.len() && bytes[pos] == b'<';

    if url_is_angle {
        // Angle-bracketed destination: runs to the first `>` on the same line.
        pos += 1;
        while pos < bytes.len() && bytes[pos] != b'>' && bytes[pos] != b'\n' && bytes[pos] != b'\r'
        {
            pos += 1;
        }
        if pos >= bytes.len() || bytes[pos] != b'>' {
            return None;
        }
        pos += 1; // Skip >
    } else {
        // Bare destination: runs to the next space, tab or line ending and
        // must not be empty.
        while pos < bytes.len() && !matches!(bytes[pos], b' ' | b'\t' | b'\n' | b'\r') {
            pos += 1;
        }
        if pos == url_start {
            return None;
        }
    }
    let url = (leading_spaces + url_start)..(leading_spaces + pos);

    // After URL, try optional title. If a title attempt is malformed but we
    // had to cross a newline to reach it, fall back to "no title, end of URL
    // line" — the next line is then parsed independently (e.g.
    // `[foo]: /url\n"title" ok\n` → ref def `[foo]: /url`, paragraph
    // `"title" ok`).
    let after_url = pos;
    // End of the destination's own line. In strict mode this is `None` when
    // non-whitespace follows the destination on that line; in lax mode the
    // rest of the line is always accepted.
    let url_line_end = consume_to_eol(bytes, after_url);
    let url_line_end_lax = if strict_eol {
        url_line_end
    } else {
        Some(consume_to_eol_lax(bytes, after_url))
    };

    let mut title: Option<std::ops::Range<usize>> = None;
    let mut end_pos: Option<usize> = None;

    // `None` here means a blank line follows the destination, so no title is
    // attempted at all.
    if let Some(title_start) = skip_ws_one_newline(bytes, after_url) {
        let crossed_newline = bytes[after_url..title_start]
            .iter()
            .any(|&b| b == b'\n' || b == b'\r');
        // CommonMark §4.7: when the title is on the same line as the
        // destination, it must be separated from the destination by at least
        // one space or tab. `<bar>(baz)` (no whitespace between `>` and `(`)
        // is therefore not a valid LRD under CommonMark; Pandoc accepts it.
        let cmark_requires_separator = dialect == crate::options::Dialect::CommonMark
            && !crossed_newline
            && title_start == after_url;
        if cmark_requires_separator {
            // No title is attempted. In strict mode the `?` rejects the whole
            // definition when non-whitespace is glued to the destination; at
            // end of input (or in lax mode) a title-less definition results.
            return Some(ReferenceSpans {
                indent: leading_spaces,
                label_close,
                colon,
                url,
                url_is_angle,
                title: None,
                consumed: leading_spaces + url_line_end_lax?,
            });
        }
        let mut title_pos = title_start;
        match parse_title(bytes, &mut title_pos) {
            // A well-formed title was found; it only counts if the remainder
            // of its line is acceptable for the current mode.
            Some(Some(range)) => {
                let line_end = if strict_eol {
                    consume_to_eol(bytes, title_pos)
                } else {
                    Some(consume_to_eol_lax(bytes, title_pos))
                };
                if let Some(end) = line_end {
                    title = Some((leading_spaces + range.start)..(leading_spaces + range.end));
                    end_pos = Some(end);
                } else if !crossed_newline {
                    // Junk after a same-line title invalidates the definition.
                    return None;
                }
                // Otherwise (title on the next line followed by junk) fall
                // through: the definition ends at the destination's line.
            }
            // A title was opened but never closed.
            None => {
                if !crossed_newline {
                    return None;
                }
            }
            // No title delimiter at `title_start`; the end-of-line check
            // below decides whether the definition is valid.
            Some(None) => {}
        }
    }

    // Without an accepted title the definition ends with the destination's
    // line, which must itself be acceptable for the current mode.
    let end = match end_pos {
        Some(p) => p,
        None => url_line_end_lax?,
    };

    Some(ReferenceSpans {
        indent: leading_spaces,
        label_close,
        colon,
        url,
        url_is_angle,
        title,
        consumed: leading_spaces + end,
    })
}

/// Lenient counterpart of `consume_to_eol`: finds the end of the current line
/// no matter what content precedes the line ending.
///
/// Returns the index just past the first line ending (`\n`, `\r`, or `\r\n`)
/// at or after `pos`, or `bytes.len()` when the input ends first. A `pos`
/// beyond the end of `bytes` is returned unchanged.
fn consume_to_eol_lax(bytes: &[u8], pos: usize) -> usize {
    let tail = match bytes.get(pos..) {
        Some(tail) => tail,
        // Start is already out of range; there is nothing to scan.
        None => return pos,
    };

    match tail.iter().position(|&b| b == b'\n' || b == b'\r') {
        None => bytes.len(),
        Some(offset) => {
            let line_break = pos + offset;
            // `\r\n` counts as a single two-byte line ending.
            let is_crlf = bytes[line_break] == b'\r' && bytes.get(line_break + 1) == Some(&b'\n');
            if is_crlf {
                line_break + 2
            } else {
                line_break + 1
            }
        }
    }
}

/// Strict end-of-line check: after any run of spaces/tabs starting at `pos`,
/// the line must end.
///
/// Returns the index just past the line ending (`\n`, `\r`, or `\r\n`), or the
/// position reached when the input ends first. Returns `None` when some other
/// byte shows up before the line ending.
fn consume_to_eol(bytes: &[u8], pos: usize) -> Option<usize> {
    let mut cursor = pos;
    while let Some(b' ' | b'\t') = bytes.get(cursor) {
        cursor += 1;
    }

    match bytes.get(cursor) {
        // Ran off the end of the input: that counts as end of line.
        None => Some(cursor),
        Some(b'\n') => Some(cursor + 1),
        Some(b'\r') if bytes.get(cursor + 1) == Some(&b'\n') => Some(cursor + 2),
        Some(b'\r') => Some(cursor + 1),
        Some(_) => None,
    }
}

/// Advance over spaces/tabs, then over at most one line ending plus the
/// spaces/tabs that follow it — the "optional spaces or tabs (including up to
/// one [line ending])" rule from CommonMark §4.7.
///
/// Returns the position reached, or `None` when a *second* line ending turns
/// up right after the first one (a blank line), which ends the definition.
fn skip_ws_one_newline(bytes: &[u8], pos: usize) -> Option<usize> {
    /// Length of the run of spaces/tabs beginning at `from` (0 if out of range).
    fn blank_run(bytes: &[u8], from: usize) -> usize {
        bytes.get(from..).map_or(0, |tail| {
            tail.iter()
                .take_while(|&&b| b == b' ' || b == b'\t')
                .count()
        })
    }

    let mut cursor = pos + blank_run(bytes, pos);

    // Size of the line ending at the cursor, if any; otherwise we are done.
    let eol_len = match bytes.get(cursor) {
        Some(b'\r') if bytes.get(cursor + 1) == Some(&b'\n') => 2,
        Some(b'\n') | Some(b'\r') => 1,
        _ => return Some(cursor),
    };
    cursor += eol_len;
    cursor += blank_run(bytes, cursor);

    match bytes.get(cursor) {
        // Another line ending straight away means the line was blank.
        Some(b'\n') | Some(b'\r') => None,
        _ => Some(cursor),
    }
}

/// Decide whether `line` looks like an attribute-continuation line of a
/// Multimarkdown link definition.
///
/// Such a line must begin with a space or tab and, once trimmed, consist
/// solely of `key=value` pairs separated by spaces/tabs. A value is either a
/// non-empty run of non-blank bytes or a non-empty string enclosed in matching
/// `"` or `'` quotes. Returns `false` for unindented lines, blank lines, and
/// lines containing anything that is not such a pair.
pub fn line_is_mmd_link_attribute_continuation(line: &str) -> bool {
    fn is_gap(b: u8) -> bool {
        b == b' ' || b == b'\t'
    }

    if !matches!(line.as_bytes().first(), Some(b' ' | b'\t')) {
        return false;
    }

    // `rest` shrinks from the front as tokens are recognized.
    let mut rest = line.trim().as_bytes();
    if rest.is_empty() {
        return false;
    }

    let mut pairs_seen = 0usize;
    loop {
        // Drop whitespace between pairs; running out of input ends the scan.
        let gap = rest.iter().take_while(|&&b| is_gap(b)).count();
        rest = &rest[gap..];
        if rest.is_empty() {
            break;
        }

        // Key: non-empty, and terminated by `=` rather than whitespace or
        // the end of the line.
        let key_len = rest
            .iter()
            .take_while(|&&b| b != b'=' && !is_gap(b))
            .count();
        if key_len == 0 || rest.get(key_len) != Some(&b'=') {
            return false;
        }
        rest = &rest[key_len + 1..];

        // Value: quoted or bare, never empty.
        match rest.first() {
            None => return false,
            Some(&quote) if quote == b'"' || quote == b'\'' => {
                let quoted = &rest[1..];
                match quoted.iter().position(|&b| b == quote) {
                    // `len + 1` also steps over the closing quote.
                    Some(len) if len > 0 => rest = &quoted[len + 1..],
                    // Empty or unterminated quoted value.
                    _ => return false,
                }
            }
            Some(_) => {
                let len = rest.iter().take_while(|&&b| !is_gap(b)).count();
                if len == 0 {
                    return false;
                }
                rest = &rest[len..];
            }
        }

        pairs_seen += 1;
    }

    pairs_seen > 0
}

/// Parse an optional title after the URL.
/// Titles can be in double quotes, single quotes, or parentheses.
///
/// Returns `Some(Some(range))` with the title's *outer* byte range (delimiters
/// included, relative to `bytes`) when a title is found, `Some(None)` if there
/// is no title, and `None` if a title is started but malformed. On success
/// `*pos` is advanced past the closing delimiter and any trailing space/tab.
fn parse_title(bytes: &[u8], pos: &mut usize) -> Option<Option<std::ops::Range<usize>>> {
    let base_pos = *pos;

    // Skip whitespace (including newlines for multi-line titles)
    while *pos < bytes.len() && matches!(bytes[*pos], b' ' | b'\t' | b'\n' | b'\r') {
        *pos += 1;
    }

    // Check if there's a title
    if *pos >= bytes.len() {
        return Some(None);
    }

    let quote_char = bytes[*pos];
    if !matches!(quote_char, b'"' | b'\'' | b'(') {
        // No title, that's okay
        *pos = base_pos; // Reset position
        return Some(None);
    }

    let closing_char = if quote_char == b'(' { b')' } else { quote_char };

    let open = *pos;
    *pos += 1; // Skip opening quote

    // Find closing quote
    let mut escape_next = false;
    while *pos < bytes.len() {
        if escape_next {
            escape_next = false;
            *pos += 1;
            continue;
        }

        match bytes[*pos] {
            b'\\' => {
                escape_next = true;
                *pos += 1;
            }
            c if c == closing_char => {
                *pos += 1; // Skip closing quote
                let close_end = *pos;

                // Skip trailing whitespace to end of line
                while *pos < bytes.len() && matches!(bytes[*pos], b' ' | b'\t') {
                    *pos += 1;
                }

                return Some(Some(open..close_end));
            }
            b'\n' if quote_char == b'(' => {
                // Parenthetical titles can span lines
                *pos += 1;
            }
            _ => {
                *pos += 1;
            }
        }
    }

    // No closing quote found
    None
}

/// Recognize a footnote definition marker (`[^id]:`) at the very start of
/// `line`.
///
/// Returns `Some((id, content_start))` where `id` is the text between `[^` and
/// `]`, and `content_start` is the byte offset of the first byte after the
/// colon that is neither a space nor a tab (possibly the end of the line).
/// Returns `None` when the line does not start with `[^`, the id is empty or
/// is interrupted by a line ending, or `]` is not immediately followed by `:`.
///
/// Syntax:
/// ```markdown
/// [^id]: Footnote content.
/// ```
pub fn try_parse_footnote_marker(line: &str) -> Option<(String, usize)> {
    let after_open = line.strip_prefix("[^")?;

    // The id runs up to the first `]`; meeting a line ending first means the
    // marker is not closed on this line.
    let id_len = after_open
        .bytes()
        .position(|b| matches!(b, b']' | b'\n' | b'\r'))?;
    if id_len == 0 || after_open.as_bytes()[id_len] != b']' {
        return None;
    }
    let id = &after_open[..id_len];

    // `]` has to be followed directly by `:`.
    let content = after_open[id_len + 1..].strip_prefix(':')?;

    let padding = content
        .bytes()
        .take_while(|b| matches!(b, b' ' | b'\t'))
        .count();

    // `content` is a suffix of `line`, so its start offset is the length
    // difference.
    let content_start = line.len() - content.len() + padding;
    Some((id.to_string(), content_start))
}

// Unit tests for reference-definition and footnote-marker handling. Some go
// through the crate-level `parse` entry point and inspect the resulting tree;
// others call the helpers of this module directly.
#[cfg(test)]
mod tests {
    use super::{line_is_mmd_link_attribute_continuation, try_parse_reference_definition};
    use crate::syntax::SyntaxKind;

    // A footnote whose body starts on the following (indented) line must
    // round-trip: the tree's text equals the input.
    #[test]
    fn test_footnote_definition_body_layout_is_lossless() {
        let input = "[^note-on-refs]:\n    Note that if `--file-scope` is used,\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input);
    }

    // The `[^id]:` marker is expected to be split into four structural tokens
    // that are direct children of the FOOTNOTE_DEFINITION node.
    #[test]
    fn test_footnote_definition_marker_emits_structural_tokens() {
        let input = "[^note-on-refs]: body\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        let def = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::FOOTNOTE_DEFINITION)
            .expect("footnote definition");
        let token_kinds: Vec<_> = def
            .children_with_tokens()
            .filter_map(|e| e.into_token())
            .map(|t| t.kind())
            .collect();
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_START));
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_ID));
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_END));
        assert!(token_kinds.contains(&SyntaxKind::FOOTNOTE_LABEL_COLON));
    }

    // `$$ … $$` spread over indented footnote lines must come out as
    // DISPLAY_MATH, even though the math body contains `\begin{…}`/`\end{…}`.
    #[test]
    fn footnote_multiline_dollar_math_parses_as_display_math_not_tex_block() {
        let input = "[^note]: Intro line before math:\n    $$\n    \\begin{aligned} a &= b \\\\ c &= d \\end{aligned}\n    $$\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));

        let def = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::FOOTNOTE_DEFINITION)
            .expect("footnote definition");

        let has_display_math = def
            .descendants()
            .any(|n| n.kind() == SyntaxKind::DISPLAY_MATH);
        let has_tex_block = def.descendants().any(|n| n.kind() == SyntaxKind::TEX_BLOCK);

        assert!(
            has_display_math,
            "Expected DISPLAY_MATH in footnote definition, got:\n{}",
            tree
        );
        assert!(
            !has_tex_block,
            "Did not expect TEX_BLOCK in footnote definition for $$...$$ math, got:\n{}",
            tree
        );
    }

    // Up to three leading spaces are allowed before `[`; four are rejected.
    #[test]
    fn test_reference_definition_with_up_to_three_leading_spaces() {
        let d = crate::options::Dialect::Pandoc;
        assert!(try_parse_reference_definition("   [foo]: #bar", d).is_some());
        assert!(try_parse_reference_definition("    [foo]: #bar", d).is_none());
    }

    // Dialect split for a title glued directly onto the destination.
    #[test]
    fn test_reference_definition_commonmark_requires_separator_before_title() {
        // Pandoc: title `(baz)` directly attached after `<bar>` is accepted.
        let pandoc =
            try_parse_reference_definition("[foo]: <bar>(baz)\n", crate::options::Dialect::Pandoc);
        assert_eq!(
            pandoc
                .as_ref()
                .map(|(_, _, url, title)| (url.as_str(), title.as_deref())),
            Some(("bar", Some("baz")))
        );

        // CommonMark: same input is not a valid LRD because the title `(baz)`
        // is not space-separated from the destination; the parser rejects the
        // candidate so the dispatcher falls back to a paragraph.
        let cmark = try_parse_reference_definition(
            "[foo]: <bar>(baz)\n",
            crate::options::Dialect::CommonMark,
        );
        assert!(cmark.is_none());

        // CommonMark with a space before the title does parse as an LRD with a
        // title.
        let cmark_ok = try_parse_reference_definition(
            "[foo]: <bar> (baz)\n",
            crate::options::Dialect::CommonMark,
        );
        assert_eq!(
            cmark_ok
                .as_ref()
                .map(|(_, _, url, title)| (url.as_str(), title.as_deref())),
            Some(("bar", Some("baz")))
        );
    }

    // The destination and title get their own child nodes, delimiters
    // included, and the tree stays lossless.
    #[test]
    fn test_reference_definition_emits_structured_url_and_title() {
        let input = "[ref]: <https://example.com> \"The Title\"\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input, "must stay lossless");

        let def = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::REFERENCE_DEFINITION)
            .expect("reference definition");

        let url = def
            .children()
            .find(|n| n.kind() == SyntaxKind::REFERENCE_URL)
            .expect("REFERENCE_URL node");
        assert_eq!(url.text().to_string(), "<https://example.com>");
        // Angle brackets are kept inside the node as their own delimiter tokens.
        assert!(
            url.children_with_tokens()
                .any(|e| e.kind() == SyntaxKind::LINK_DEST_START)
        );
        assert!(
            url.children_with_tokens()
                .any(|e| e.kind() == SyntaxKind::LINK_DEST_END)
        );

        let title = def
            .children()
            .find(|n| n.kind() == SyntaxKind::REFERENCE_TITLE)
            .expect("REFERENCE_TITLE node");
        assert_eq!(title.text().to_string(), "\"The Title\"");
    }

    // A definition without a title has a URL node but no title node.
    #[test]
    fn test_reference_definition_without_title_omits_title_node() {
        let input = "[ref]: /url\n";
        let tree = crate::parse(input, Some(crate::ParserOptions::default()));
        assert_eq!(tree.text().to_string(), input, "must stay lossless");

        let def = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::REFERENCE_DEFINITION)
            .expect("reference definition");

        let url = def
            .children()
            .find(|n| n.kind() == SyntaxKind::REFERENCE_URL)
            .expect("REFERENCE_URL node");
        assert_eq!(url.text().to_string(), "/url");
        assert!(
            !def.children()
                .any(|n| n.kind() == SyntaxKind::REFERENCE_TITLE),
            "no title => no REFERENCE_TITLE node"
        );
    }

    // Indented lines made only of `key=value` pairs (bare or quoted values)
    // are accepted.
    #[test]
    fn mmd_link_attribute_continuation_detects_valid_tokens() {
        assert!(line_is_mmd_link_attribute_continuation(
            "    width=20px height=30px id=myId"
        ));
        assert!(line_is_mmd_link_attribute_continuation(
            "\tclass=\"myClass1 myClass2\""
        ));
    }

    // Unindented lines, and lines with a token that is not `key=value`, are
    // rejected.
    #[test]
    fn mmd_link_attribute_continuation_rejects_non_attribute_lines() {
        assert!(!line_is_mmd_link_attribute_continuation(
            "not-indented width=20px"
        ));
        assert!(!line_is_mmd_link_attribute_continuation(
            "    not-an-attr token"
        ));
    }
}