use std::fmt;
/// One lexical unit produced by [`parse`].
///
/// `Clone` is derived so callers can duplicate or cache a parsed token stream.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A run of input that is copied through unchanged.
    Text(String),
    /// A `{{< name key="value" >}}` shortcode.
    Shortcode {
        /// Identifier that follows the open marker.
        name: String,
        /// `key="value"` pairs in the order they were written.
        args: Vec<(String, String)>,
        /// Raw text between the open tag and its `{{< /name >}}` close tag,
        /// or `None` when no close tag was found (self-closing form).
        body: Option<String>,
    },
}
/// Error returned when shortcode syntax in the input cannot be parsed.
///
/// `Clone` is derived so the error can be stored or reported more than once.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ShortcodeError {
    /// Byte offset into the input at which the problem was detected.
    pub position: usize,
    /// Human-readable description of what was expected or what went wrong.
    pub message: String,
}

impl fmt::Display for ShortcodeError {
    /// Formats the error as `shortcode parse error at byte <position>: <message>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "shortcode parse error at byte {}: {}",
            self.position, self.message
        )
    }
}

impl std::error::Error for ShortcodeError {}
/// Splits `input` into plain-text runs and shortcodes.
///
/// Open markers (`{{<`) that start inside a code region reported by
/// `find_code_regions` are left as ordinary text. Every other open marker
/// must begin a well-formed header. An open tag with a matching close tag
/// becomes a block shortcode with `body: Some(..)`; one without becomes a
/// self-closing shortcode with `body: None`.
///
/// # Errors
///
/// Returns a [`ShortcodeError`] when a header is malformed, when a close tag
/// appears with no open tag before it, or when the search for a matching
/// close tag fails.
pub fn parse(input: &str) -> Result<Vec<Token>, ShortcodeError> {
    let raw = input.as_bytes();
    let protected = find_code_regions(input);
    let mut tokens = Vec::new();
    // Start of the text run that has not been emitted yet.
    let mut pending_text = 0usize;
    let mut pos = 0usize;

    while pos < raw.len() {
        if !at_open_marker(raw, pos) {
            pos += 1;
            continue;
        }
        // A marker inside code is literal text: jump to the end of the region.
        if let Some(region_end) = code_region_containing(pos, &protected) {
            pos = region_end;
            continue;
        }
        if pending_text < pos {
            tokens.push(Token::Text(input[pending_text..pos].to_string()));
        }

        let marker_at = pos;
        let (name, args) = match parse_header(input, &mut pos)? {
            Header::Open { name, args } => (name, args),
            Header::Close(name) => {
                return Err(ShortcodeError {
                    position: marker_at,
                    message: format!("unmatched close tag for {name:?}"),
                });
            }
        };

        // With a matching close tag the cursor moves past it; otherwise it
        // stays right after the open tag and the shortcode is self-closing.
        let body = match find_matching_close(input, pos, &name, &protected)? {
            Some((inner, after_close)) => {
                pos = after_close;
                Some(inner)
            }
            None => None,
        };
        tokens.push(Token::Shortcode { name, args, body });
        pending_text = pos;
    }

    if pending_text < raw.len() {
        tokens.push(Token::Text(input[pending_text..].to_string()));
    }
    Ok(tokens)
}
/// Collects the `(start, end)` source ranges of every code block and inline
/// code span that `pulldown_cmark` reports for `input`.
///
/// A code block's range runs from the start of its `Start` event to the end
/// of its `End` event; an inline code span uses the range of its `Code` event.
/// Ranges are returned in the order the parser emits them.
fn find_code_regions(input: &str) -> Vec<(usize, usize)> {
    use pulldown_cmark::{Event, Parser, Tag, TagEnd};

    let mut spans = Vec::new();
    // Start offset of the code block currently open, if any.
    let mut pending_block_start = None;

    for (event, span) in Parser::new(input).into_offset_iter() {
        if let Event::Code(_) = event {
            spans.push((span.start, span.end));
        } else if let Event::Start(Tag::CodeBlock(_)) = event {
            pending_block_start = Some(span.start);
        } else if let Event::End(TagEnd::CodeBlock) = event {
            // `take` clears the marker so an unpaired end event adds nothing.
            if let Some(block_start) = pending_block_start.take() {
                spans.push((block_start, span.end));
            }
        }
    }
    spans
}
/// Returns the end offset of the first region whose half-open range
/// `[start, end)` contains `pos`, or `None` when no region contains it.
fn code_region_containing(pos: usize, regions: &[(usize, usize)]) -> Option<usize> {
    for &(begin, end) in regions {
        if (begin..end).contains(&pos) {
            return Some(end);
        }
    }
    None
}
/// Result of parsing one `{{< ... >}}` tag.
enum Header {
/// An opening tag: `{{< name key="value" ... >}}`.
Open {
// Identifier that follows the open marker.
name: String,
// `key="value"` pairs in the order they were written.
args: Vec<(String, String)>,
},
/// A closing tag: `{{< /name >}}`; carries the name after the slash.
Close(String),
}
/// Parses one `{{< ... >}}` tag starting at `*cursor`, which must sit on an
/// open marker.
///
/// On success `*cursor` is left just past the `>}}` close marker and the
/// result is either `Header::Close(name)` for `{{< /name >}}` or
/// `Header::Open { name, args }` for `{{< name key="value" ... >}}`.
///
/// # Errors
///
/// Returns a [`ShortcodeError`] when an identifier, `=`, quoted value, or the
/// close marker is missing where one is required.
fn parse_header(input: &str, cursor: &mut usize) -> Result<Header, ShortcodeError> {
    debug_assert!(at_open_marker(input.as_bytes(), *cursor));
    // Step over the three-byte `{{<` marker.
    *cursor += 3;
    skip_ws(input, cursor);

    // Close tag: `/`, a name, then the close marker. No arguments allowed.
    if let Some('/') = peek_char(input, *cursor) {
        *cursor += 1;
        skip_ws(input, cursor);
        let closed = parse_ident(input, cursor)?;
        skip_ws(input, cursor);
        return expect_close_marker(input, cursor).map(|()| Header::Close(closed));
    }

    // Open tag: a name followed by zero or more `key="value"` pairs.
    let name = parse_ident(input, cursor)?;
    let mut args = Vec::new();
    skip_ws(input, cursor);
    while !at_close_marker(input.as_bytes(), *cursor) {
        let key = parse_ident(input, cursor)?;
        skip_ws(input, cursor);
        expect_char(input, cursor, '=')?;
        skip_ws(input, cursor);
        let value = parse_qstring(input, cursor)?;
        args.push((key, value));
        skip_ws(input, cursor);
    }
    // Step over the three-byte `>}}` marker.
    *cursor += 3;
    Ok(Header::Open { name, args })
}
/// Parses an identifier at `*cursor`: one ASCII letter followed by any number
/// of ASCII letters, digits, `_` or `-`.
///
/// On success `*cursor` is advanced past the identifier and the identifier is
/// returned as an owned string.
///
/// # Errors
///
/// Returns "expected identifier" at the current position, leaving `*cursor`
/// untouched, when no identifier starts there.
fn parse_ident(input: &str, cursor: &mut usize) -> Result<String, ShortcodeError> {
    let begin = *cursor;
    // Count the leading bytes that form a valid identifier; the first byte
    // has a stricter rule than the rest.
    let len = input.as_bytes().get(begin..).map_or(0, |rest| {
        rest.iter()
            .enumerate()
            .take_while(|&(offset, &byte)| {
                if offset == 0 {
                    byte.is_ascii_alphabetic()
                } else {
                    byte.is_ascii_alphanumeric() || byte == b'_' || byte == b'-'
                }
            })
            .count()
    });

    if len == 0 {
        return Err(ShortcodeError {
            position: begin,
            message: "expected identifier".to_string(),
        });
    }

    let stop = begin + len;
    *cursor = stop;
    Ok(input[begin..stop].to_string())
}
/// Parses a double-quoted value at `*cursor` and returns the text between the
/// quotes. No escape sequences are interpreted: the value ends at the next
/// `"` byte.
///
/// On success `*cursor` is left just past the closing quote.
///
/// # Errors
///
/// Returns an error when `*cursor` is not on an opening `"`, or
/// "unterminated quoted value" (positioned at the opening quote) when the
/// input ends before a closing quote.
fn parse_qstring(input: &str, cursor: &mut usize) -> Result<String, ShortcodeError> {
    let quote_at = *cursor;
    expect_char(input, cursor, '"')?;
    let content_start = *cursor;

    let closing = input.as_bytes()[content_start..]
        .iter()
        .position(|&byte| byte == b'"');
    match closing {
        Some(len) => {
            let content_end = content_start + len;
            *cursor = content_end + 1;
            Ok(input[content_start..content_end].to_string())
        }
        None => {
            // The scan consumed everything up to the end of the input.
            *cursor = input.len();
            Err(ShortcodeError {
                position: quote_at,
                message: "unterminated quoted value".to_string(),
            })
        }
    }
}
/// Consumes the character `c` at `*cursor`.
///
/// # Errors
///
/// Returns an "expected <c>" error at the current position, without moving
/// `*cursor`, when the next character is anything else or the input has ended.
fn expect_char(input: &str, cursor: &mut usize, c: char) -> Result<(), ShortcodeError> {
    match peek_char(input, *cursor) {
        Some(found) if found == c => {
            *cursor += c.len_utf8();
            Ok(())
        }
        _ => Err(ShortcodeError {
            position: *cursor,
            message: format!("expected {c:?}"),
        }),
    }
}
/// Consumes the three-byte `>}}` close marker at `*cursor`.
///
/// # Errors
///
/// Returns an error at the current position, without moving `*cursor`, when
/// the close marker is not there.
fn expect_close_marker(input: &str, cursor: &mut usize) -> Result<(), ShortcodeError> {
    if !at_close_marker(input.as_bytes(), *cursor) {
        return Err(ShortcodeError {
            position: *cursor,
            message: "expected '>}}'".to_string(),
        });
    }
    *cursor += 3;
    Ok(())
}
/// Scans forward from `start` for the `{{< /name >}}` tag that closes the
/// shortcode `name`.
///
/// Returns `Ok(Some((body, end)))` when a close tag is found: `body` is the
/// raw text between `start` and the close tag, and `end` is the byte offset
/// just past the close tag. Returns `Ok(None)` when the rest of the input
/// holds no close tag for `name`, which callers treat as the self-closing form.
///
/// Open markers that start inside one of `code_regions` are skipped.
///
/// Tags met during the scan are handled as follows:
///
/// * An open tag for a different name is remembered, so that its own close
///   tag, if it has one, is recognised as belonging to it.
/// * An open tag for `name` itself is remembered but is only an error if a
///   close tag for `name` follows. Without one, both tags are self-closing
///   siblings and nothing is nested.
///
/// # Errors
///
/// Returns a [`ShortcodeError`] when
///
/// * a tag met during the scan fails to parse;
/// * a close tag for a different name appears that pairs with no open tag
///   seen earlier in the scan;
/// * a close tag for `name` is found after another open tag for `name`. The
///   error is positioned at that nested open tag.
fn find_matching_close(
    input: &str,
    start: usize,
    name: &str,
    code_regions: &[(usize, usize)],
) -> Result<Option<(String, usize)>, ShortcodeError> {
    let bytes = input.as_bytes();
    // Other-name shortcodes opened since `start` and not yet closed.
    let mut open_others: Vec<String> = Vec::new();
    // Offset of the first same-name open tag seen since `start`.
    let mut same_name_open: Option<usize> = None;
    let mut i = start;

    while i < bytes.len() {
        if !at_open_marker(bytes, i) {
            i += 1;
            continue;
        }
        if let Some(end) = code_region_containing(i, code_regions) {
            i = end;
            continue;
        }

        let header_start = i;
        let mut probe = i;
        match parse_header(input, &mut probe)? {
            Header::Close(close_name) if close_name == name => {
                // Only now is it certain that `name` is a block, so only now
                // is an earlier same-name open tag truly nested.
                if let Some(position) = same_name_open {
                    return Err(ShortcodeError {
                        position,
                        message: format!("nested {name:?} inside its own block is not allowed"),
                    });
                }
                return Ok(Some((input[start..header_start].to_string(), probe)));
            }
            Header::Close(other) => {
                match open_others.iter().rposition(|open| *open == other) {
                    // Pairs with an earlier open tag. Anything opened after
                    // that tag was self-closing, so drop it as well.
                    Some(depth) => open_others.truncate(depth),
                    None => {
                        return Err(ShortcodeError {
                            position: header_start,
                            message: format!(
                                "unmatched close tag {other:?} inside {name:?} block"
                            ),
                        });
                    }
                }
            }
            Header::Open {
                name: nested_name, ..
            } if nested_name == name => {
                if same_name_open.is_none() {
                    same_name_open = Some(header_start);
                }
            }
            Header::Open {
                name: nested_name, ..
            } => open_others.push(nested_name),
        }
        i = probe;
    }
    Ok(None)
}
/// Reports whether the three bytes starting at index `i` are the `{{<` open
/// marker. Returns `false` when fewer than three bytes remain.
fn at_open_marker(bytes: &[u8], i: usize) -> bool {
    bytes
        .get(i..)
        .map_or(false, |tail| tail.starts_with(b"{{<"))
}
/// Reports whether the three bytes starting at index `i` are the `>}}` close
/// marker. Returns `false` when fewer than three bytes remain.
fn at_close_marker(bytes: &[u8], i: usize) -> bool {
    bytes
        .get(i..)
        .map_or(false, |tail| tail.starts_with(b">}}"))
}
/// Returns the character that starts at byte offset `cursor`, without
/// consuming it.
///
/// Returns `None` at the end of the input, and also when `cursor` is past the
/// end or not on a character boundary. A bad offset therefore surfaces as an
/// ordinary "expected ..." parse error in the callers instead of a panic.
fn peek_char(input: &str, cursor: usize) -> Option<char> {
    input.get(cursor..)?.chars().next()
}
/// Advances `*cursor` past any run of ASCII whitespace bytes. Leaves it
/// unchanged when it is already on a non-whitespace byte or at/after the end
/// of the input.
fn skip_ws(input: &str, cursor: &mut usize) {
    let blank_run = input.as_bytes().get(*cursor..).map_or(0, |rest| {
        rest.iter()
            .take_while(|byte| byte.is_ascii_whitespace())
            .count()
    });
    *cursor += blank_run;
}
// Unit tests for the shortcode tokenizer. They cover the self-closing and
// block forms, each syntax error the parser reports, and the rule that
// shortcodes inside Markdown code are left as plain text.
#[cfg(test)]
mod tests {
use super::*;
// Builds the expected `Token::Shortcode` from borrowed pieces so the
// assertions below stay short.
fn shortcode(name: &str, args: &[(&str, &str)], body: Option<&str>) -> Token {
Token::Shortcode {
name: name.to_string(),
args: args
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect(),
body: body.map(str::to_string),
}
}
// Empty input produces an empty token list, not an empty `Text` token.
#[test]
fn empty_input_yields_no_tokens() {
assert_eq!(parse("").unwrap(), Vec::<Token>::new());
}
// Input with no open marker comes back as one `Text` token.
#[test]
fn plain_text_is_a_single_text_token() {
assert_eq!(
parse("hello world").unwrap(),
vec![Token::Text("hello world".to_string())]
);
}
// Text before and after a shortcode is kept, including the adjacent spaces.
#[test]
fn self_closing_with_one_arg() {
let input = r#"see {{< adr-ref id="0017" >}} for details"#;
assert_eq!(
parse(input).unwrap(),
vec![
Token::Text("see ".to_string()),
shortcode("adr-ref", &[("id", "0017")], None),
Token::Text(" for details".to_string()),
]
);
}
// Arguments are returned in the order they were written.
#[test]
fn self_closing_with_multiple_args() {
let input = r#"{{< callout type="warning" emoji="!" >}}"#;
assert_eq!(
parse(input).unwrap(),
vec![shortcode(
"callout",
&[("type", "warning"), ("emoji", "!")],
None
)]
);
}
// A matching close tag turns the shortcode into a block whose body is the
// raw text between the two tags.
#[test]
fn block_form_captures_body_verbatim() {
let input = r#"{{< callout type="warning" >}}Be careful.{{< /callout >}}"#;
assert_eq!(
parse(input).unwrap(),
vec![shortcode(
"callout",
&[("type", "warning")],
Some("Be careful.")
)]
);
}
// Markdown inside a block body is not interpreted.
#[test]
fn block_body_is_opaque_to_inner_markup() {
let input = r#"{{< note >}}see [link](x.md){{< /note >}}"#;
assert_eq!(
parse(input).unwrap(),
vec![shortcode("note", &[], Some("see [link](x.md)"))]
);
}
// A close tag with no open tag before it is rejected; the error points at
// the close tag's open marker (byte 5, right after "oops ").
#[test]
fn unmatched_close_tag_is_an_error() {
let err = parse("oops {{< /callout >}}").unwrap_err();
assert_eq!(err.position, 5);
assert!(err.message.contains("unmatched close"));
}
// Input that ends before the `>}}` close marker is rejected.
#[test]
fn unterminated_open_marker_is_an_error() {
let err = parse(r#"{{< adr-ref id="0017" "#).unwrap_err();
assert!(
err.message.contains("expected") || err.message.contains("identifier"),
"got: {}",
err.message
);
}
// A value whose closing quote is missing is rejected.
#[test]
fn unterminated_quoted_value_is_an_error() {
let err = parse(r#"{{< adr-ref id="0017 >}}"#).unwrap_err();
assert!(err.message.contains("unterminated"));
}
// Argument values must be double-quoted.
#[test]
fn bare_value_without_quotes_is_an_error() {
let err = parse(r#"{{< adr-ref id=0017 >}}"#).unwrap_err();
assert!(
err.message.contains("expected '\"'") || err.message.contains("\""),
"got: {}",
err.message
);
}
// Every argument needs a `key=` prefix; positional values are rejected.
#[test]
fn positional_argument_without_key_is_an_error() {
let err = parse(r#"{{< adr-ref "0017" >}}"#).unwrap_err();
assert!(err.message.contains("identifier"), "got: {}", err.message);
}
// A block may not contain another shortcode with its own name.
#[test]
fn nested_same_name_block_is_an_error() {
let err =
parse(r#"{{< note >}}inner {{< note >}}x{{< /note >}}{{< /note >}}"#).unwrap_err();
assert!(err.message.contains("nested"), "got: {}", err.message);
}
// Shortcode syntax inside a backtick code span stays plain text.
#[test]
fn shortcode_inside_inline_code_span_is_ignored() {
let input = r#"Use `{{< adr-ref id="0017" >}}` to link an ADR."#;
assert_eq!(parse(input).unwrap(), vec![Token::Text(input.to_string())]);
}
// Shortcode syntax inside a fenced code block stays plain text.
#[test]
fn shortcode_inside_fenced_code_block_is_ignored() {
let input = "```\n{{< adr-ref id=\"0017\" >}}\n```";
assert_eq!(parse(input).unwrap(), vec![Token::Text(input.to_string())]);
}
// An example inside a code span must not stop a real shortcode later in
// the same input from being parsed.
#[test]
fn shortcode_outside_code_still_parses_when_an_example_appears_in_code() {
let input = r#"Example: `{{< adr-ref id="9999" >}}`. Real: {{< adr-ref id="0017" >}}."#;
let tokens = parse(input).unwrap();
assert_eq!(tokens.len(), 3);
match &tokens[1] {
Token::Shortcode { name, args, body } => {
assert_eq!(name, "adr-ref");
assert_eq!(args, &vec![("id".to_string(), "0017".to_string())]);
assert!(body.is_none());
}
_ => panic!("expected shortcode at index 1, got: {tokens:?}"),
}
}
// An open tag with no close tag is self-closing; the text after it is an
// ordinary `Text` token.
#[test]
fn block_without_close_is_self_closing() {
let input = r#"{{< note >}}body without close"#;
assert_eq!(
parse(input).unwrap(),
vec![
shortcode("note", &[], None),
Token::Text("body without close".to_string()),
]
);
}
}