/// Presentation attributes collected from a `<span ...>` opening tag.
///
/// Every field holds the attribute value verbatim as a string. The default
/// value has every field set to `None`; the attribute parser in this file
/// starts from that default and fills in only the attributes it recognises.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct SpanAttributes {
/// Value of the `font_family` attribute.
pub font_family: Option<String>,
/// Value of the `size` attribute.
pub size: Option<String>,
/// Value of the `foreground` attribute (the `color` attribute is an alias).
pub foreground: Option<String>,
/// Value of the `background` attribute.
pub background: Option<String>,
/// Value of the `weight` attribute.
pub weight: Option<String>,
/// Value of the `style` attribute.
pub style: Option<String>,
}
/// A node in the tree produced by parsing inline markup.
///
/// `Plain` is a leaf holding literal text; every other variant is a container
/// whose children are themselves `TextSpan`s, so markup can nest.
#[derive(Debug, Clone, PartialEq)]
pub enum TextSpan {
/// Literal text with no markup applied at this level.
Plain(String),
/// Content of a `<b>` / `<bold>` element.
Bold(Vec<TextSpan>),
/// Content of an `<i>` / `<italic>` element.
Italic(Vec<TextSpan>),
/// Content of a `<highlight>` element.
Highlight(Vec<TextSpan>),
/// Content of a `<comment>` element.
Comment(Vec<TextSpan>),
/// Content of a `<span ...>` element together with its parsed attributes.
Span(SpanAttributes, Vec<TextSpan>),
}
impl TextSpan {
    /// Returns the text content of this span with all markup removed.
    ///
    /// The span tree is walked in document order and the text of every
    /// [`TextSpan::Plain`] leaf is concatenated; container variants contribute
    /// only the text of their children.
    #[must_use]
    pub fn plain_text(&self) -> String {
        let mut out = String::new();
        // Explicit work stack instead of recursion. Children are pushed in
        // reverse so that popping visits them left to right.
        let mut pending: Vec<&TextSpan> = vec![self];
        while let Some(node) = pending.pop() {
            match node {
                TextSpan::Plain(text) => out.push_str(text),
                TextSpan::Bold(kids)
                | TextSpan::Italic(kids)
                | TextSpan::Highlight(kids)
                | TextSpan::Comment(kids)
                | TextSpan::Span(_, kids) => pending.extend(kids.iter().rev()),
            }
        }
        out
    }
}
/// Reports whether `text` contains at least one recognised markup tag.
///
/// Every `<` in `text` is examined. The function returns `true` as soon as
/// one of them is followed by a known tag name (`<b>`, `<span ...>`, ...) or
/// by `/` and a known tag name (`</b>`, `</span>`, ...). Names are matched
/// case-insensitively. A `<` that starts nothing recognisable, as in
/// `"x < y"`, does not count.
#[must_use]
pub fn has_inline_markup(text: &str) -> bool {
    text.match_indices('<').any(|(lt_index, _)| {
        // `<` is a single byte, so the tail starts on a char boundary.
        let tail = &text[lt_index + 1..];
        if tag_name_at_start(tail).is_some() || span_tag_at_start(tail).is_some() {
            return true;
        }
        match tail.strip_prefix('/') {
            Some(name) => {
                tag_name_at_start(name).is_some()
                    || name
                        .get(..5)
                        .is_some_and(|head| head.eq_ignore_ascii_case("span>"))
            }
            None => false,
        }
    })
}
/// Parses `text` into a tree of [`TextSpan`]s.
///
/// * Empty input yields an empty vector.
/// * Input without any recognised tag yields a single [`TextSpan::Plain`]
///   holding the whole text.
/// * Otherwise the text is run through the markup parser and the result is
///   normalised so that neighbouring plain spans are merged.
#[must_use]
pub fn parse_inline_markup(text: &str) -> Vec<TextSpan> {
    if text.is_empty() {
        return Vec::new();
    }
    let whole_text_as_plain = || vec![TextSpan::Plain(text.to_string())];
    if !has_inline_markup(text) {
        return whole_text_as_plain();
    }
    let parsed = InlineMarkupParser::new(text).parse_spans(&[]);
    if parsed.is_empty() {
        // Defensive: never drop non-empty input, even if parsing yields nothing.
        whole_text_as_plain()
    } else {
        normalize_spans(parsed)
    }
}
#[must_use]
pub fn spans_to_plain_text(spans: &[TextSpan]) -> String {
spans.iter().map(TextSpan::plain_text).collect()
}
/// The kind of markup tag recognised by the parser.
///
/// Used both for tags found in the input and for the list of currently open
/// elements whose closing tags the parser is waiting for.
#[derive(Debug, Clone, PartialEq, Eq)]
enum TagType {
/// `<b>` or `<bold>`.
Bold,
/// `<i>` or `<italic>`.
Italic,
/// `<highlight>`.
Highlight,
/// `<comment>`.
Comment,
/// `<span ...>`, carrying the attributes parsed from the opening tag.
Span(SpanAttributes),
}
/// Recognises a simple (attribute-less) tag name, including its closing `>`,
/// at the very start of `s`.
///
/// `s` is the text that follows `<` or `</`. On a match the tag kind is
/// returned together with the number of bytes the name and `>` occupy.
/// Matching is ASCII case-insensitive. `span` is not handled here.
fn tag_name_at_start(s: &str) -> Option<(TagType, usize)> {
    let known = [
        ("highlight>", TagType::Highlight),
        ("comment>", TagType::Comment),
        ("italic>", TagType::Italic),
        ("bold>", TagType::Bold),
        ("b>", TagType::Bold),
        ("i>", TagType::Italic),
    ];
    known.iter().find_map(|(name, kind)| {
        let width = name.len();
        // `get` returns `None` both when `s` is too short and when `width`
        // falls inside a multi-byte character, so this never panics.
        let head = s.get(..width)?;
        if head.eq_ignore_ascii_case(name) {
            Some((kind.clone(), width))
        } else {
            None
        }
    })
}
/// Recognises a `span` opening tag at the very start of `s`.
///
/// `s` is the text that follows `<`. Accepted forms are `span>` (no
/// attributes) and `span` followed by ASCII whitespace, an attribute list and
/// `>`. The name is matched ASCII case-insensitively. On success the parsed
/// attributes are returned with the number of bytes consumed, including the
/// closing `>`.
///
/// The tag ends at the first `>` in `s`, even if that `>` sits inside a
/// quoted attribute value.
fn span_tag_at_start(s: &str) -> Option<(SpanAttributes, usize)> {
    const NAME_LEN: usize = 4;

    let name = s.get(..NAME_LEN)?;
    if !name.eq_ignore_ascii_case("span") {
        return None;
    }
    match s[NAME_LEN..].chars().next() {
        Some('>') => Some((SpanAttributes::default(), NAME_LEN + 1)),
        Some(c) if c.is_ascii_whitespace() => {
            let tag_end = s.find('>')?;
            let attrs = parse_span_attributes(s[NAME_LEN..tag_end].trim());
            Some((attrs, tag_end + 1))
        }
        // Something like `spanner>`: a longer name, not a span tag.
        _ => None,
    }
}
/// Parses the attribute list of a `<span>` tag into [`SpanAttributes`].
///
/// `s` is the text between the tag name and the closing `>`. Attributes have
/// the form `key="value"` or `key='value'`, optionally with whitespace around
/// `=`. Keys are matched ASCII case-insensitively and `color` is an alias for
/// `foreground`. Unknown keys are skipped; if a key occurs more than once the
/// last value wins.
///
/// Parsing stops at the first malformed attribute (no `=`, an unquoted value
/// or a missing closing quote). Attributes read before that point are kept.
fn parse_span_attributes(s: &str) -> SpanAttributes {
    let mut attrs = SpanAttributes::default();
    let mut rest = s;

    // Each pass consumes exactly one `key = "value"` pair from the front.
    while let Some((raw_key, after_eq)) = rest.split_once('=') {
        let after_eq = after_eq.trim_start();
        let quote = match after_eq.chars().next() {
            Some(q @ ('"' | '\'')) => q,
            _ => break,
        };
        // Both quote characters are a single byte, so slicing at 1 is safe.
        let Some((value, tail)) = after_eq[1..].split_once(quote) else {
            break;
        };

        let slot = match raw_key.trim().to_ascii_lowercase().as_str() {
            "font_family" => Some(&mut attrs.font_family),
            "size" => Some(&mut attrs.size),
            "foreground" | "color" => Some(&mut attrs.foreground),
            "background" => Some(&mut attrs.background),
            "weight" => Some(&mut attrs.weight),
            "style" => Some(&mut attrs.style),
            _ => None,
        };
        if let Some(slot) = slot {
            *slot = Some(value.to_string());
        }

        rest = tail;
    }
    attrs
}
/// Recognises the name of a closing tag at the very start of `s`.
///
/// `s` is the text that follows `</`. `span>` (ASCII case-insensitive) yields
/// a [`TagType::Span`] with default attributes, because a closing tag carries
/// none. Any other name is resolved by `tag_name_at_start`. The returned
/// length covers the name and the closing `>`.
fn closing_tag_at_start(s: &str) -> Option<(TagType, usize)> {
    const SPAN_CLOSE: &str = "span>";

    match s.get(..SPAN_CLOSE.len()) {
        Some(head) if head.eq_ignore_ascii_case(SPAN_CLOSE) => Some((
            TagType::Span(SpanAttributes::default()),
            SPAN_CLOSE.len(),
        )),
        _ => tag_name_at_start(s),
    }
}
/// Wraps `children` in the [`TextSpan`] container that corresponds to
/// `tag_type`; span attributes are moved into the resulting node.
fn tag_type_to_span(tag_type: TagType, children: Vec<TextSpan>) -> TextSpan {
    // Every attribute-less tag maps onto a one-argument variant constructor.
    let wrap: fn(Vec<TextSpan>) -> TextSpan = match tag_type {
        TagType::Span(attrs) => return TextSpan::Span(attrs, children),
        TagType::Bold => TextSpan::Bold,
        TagType::Italic => TextSpan::Italic,
        TagType::Highlight => TextSpan::Highlight,
        TagType::Comment => TextSpan::Comment,
    };
    wrap(children)
}
fn closers_contain(closers: &[TagType], tag: &TagType) -> bool {
closers.iter().any(|c| match (c, tag) {
(TagType::Span(_), TagType::Span(_)) => true,
(a, b) => a == b,
})
}
/// Maximum number of elements that may be open at the same time.
///
/// Once the parser's list of expected closers reaches this length, further
/// opening tags are not parsed as elements and stay in the output as literal
/// text, which bounds the recursion depth of `parse_spans`.
const MAX_NESTING_DEPTH: usize = 32;
/// Cursor-based parser over a borrowed markup string.
struct InlineMarkupParser<'a> {
/// The complete text being parsed.
input: &'a str,
/// Byte offset into `input` of the next unread character.
pos: usize,
}
impl<'a> InlineMarkupParser<'a> {
    /// Creates a parser positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        Self { input, pos: 0 }
    }

    /// Returns the part of the input that has not been consumed yet.
    fn remaining(&self) -> &'a str {
        &self.input[self.pos..]
    }

    /// Appends `input[plain_start..self.pos]` to `spans` as plain text,
    /// unless that range is empty.
    fn flush_plain(&self, plain_start: usize, spans: &mut Vec<TextSpan>) {
        if plain_start < self.pos {
            spans.push(TextSpan::Plain(
                self.input[plain_start..self.pos].to_string(),
            ));
        }
    }

    /// Parses spans from the current position until the input ends or the
    /// current element is closed.
    ///
    /// `expected_closers` lists the elements that are currently open,
    /// outermost first; the last entry is the element whose children are
    /// being collected. The top-level call passes an empty slice.
    ///
    /// Closing tags are handled as follows:
    /// * one that matches the innermost open element is consumed and ends
    ///   this call;
    /// * one that matches an element further out ends this call without
    ///   being consumed, so the element is closed implicitly and the frame
    ///   that owns the tag consumes it. This keeps text after `</b>` in
    ///   `<b><i>x</b>y` outside the bold element;
    /// * any other closing tag, and anything that merely looks like a tag,
    ///   stays in the output as literal text.
    ///
    /// An element still open when the input ends wraps the rest of the text.
    /// Once `MAX_NESTING_DEPTH` elements are open, further opening tags are
    /// kept as literal text.
    fn parse_spans(&mut self, expected_closers: &[TagType]) -> Vec<TextSpan> {
        let mut spans: Vec<TextSpan> = Vec::new();
        let mut plain_start = self.pos;

        while self.pos < self.input.len() {
            let remaining = self.remaining();
            // Jump straight to the next `<`; everything before it is text.
            let after_lt = match remaining.find('<') {
                Some(0) => &remaining[1..],
                Some(offset) => {
                    self.pos += offset;
                    continue;
                }
                None => {
                    self.pos = self.input.len();
                    break;
                }
            };

            if let Some(after_slash) = after_lt.strip_prefix('/') {
                if let Some((tag_type, name_len)) = closing_tag_at_start(after_slash) {
                    // Empty when nothing is open, so the check is then false.
                    let innermost = expected_closers.len().saturating_sub(1);
                    if closers_contain(&expected_closers[innermost..], &tag_type) {
                        self.flush_plain(plain_start, &mut spans);
                        // Skip `</`, the tag name and `>`.
                        self.pos += 2 + name_len;
                        return spans;
                    }
                    if closers_contain(expected_closers, &tag_type) {
                        // The tag belongs to an outer element: end this one
                        // and leave the tag in place for its owner.
                        self.flush_plain(plain_start, &mut spans);
                        return spans;
                    }
                }
                // Stray or unknown closing tag: keep it as literal text.
                self.pos += 1;
                continue;
            }

            if expected_closers.len() >= MAX_NESTING_DEPTH {
                self.pos += 1;
                continue;
            }

            let opened = span_tag_at_start(after_lt)
                .map(|(attrs, tag_len)| (TagType::Span(attrs), tag_len))
                .or_else(|| tag_name_at_start(after_lt));
            let Some((tag_type, tag_len)) = opened else {
                // A `<` that does not start a known tag is ordinary text.
                self.pos += 1;
                continue;
            };

            self.flush_plain(plain_start, &mut spans);
            // Skip `<` plus the tag itself.
            self.pos += 1 + tag_len;
            let mut closers = expected_closers.to_vec();
            closers.push(tag_type.clone());
            let children = self.parse_spans(&closers);
            spans.push(tag_type_to_span(tag_type, children));
            plain_start = self.pos;
        }

        self.flush_plain(plain_start, &mut spans);
        spans
    }
}
/// Merges runs of adjacent [`TextSpan::Plain`] siblings into one span,
/// recursively at every level of the tree.
///
/// Container spans are kept as they are, including empty ones; only their
/// children are normalised.
fn normalize_spans(spans: Vec<TextSpan>) -> Vec<TextSpan> {
    let mut merged: Vec<TextSpan> = Vec::new();
    for span in spans {
        let rebuilt = match span {
            TextSpan::Plain(text) => {
                if let Some(TextSpan::Plain(tail)) = merged.last_mut() {
                    // Extend the preceding plain span rather than adding one.
                    tail.push_str(&text);
                    continue;
                }
                TextSpan::Plain(text)
            }
            TextSpan::Bold(kids) => TextSpan::Bold(normalize_spans(kids)),
            TextSpan::Italic(kids) => TextSpan::Italic(normalize_spans(kids)),
            TextSpan::Highlight(kids) => TextSpan::Highlight(normalize_spans(kids)),
            TextSpan::Comment(kids) => TextSpan::Comment(normalize_spans(kids)),
            TextSpan::Span(attrs, kids) => TextSpan::Span(attrs, normalize_spans(kids)),
        };
        merged.push(rebuilt);
    }
    merged
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a `TextSpan::Plain` leaf.
    fn plain(text: &str) -> TextSpan {
        TextSpan::Plain(text.to_string())
    }

    /// Shorthand for `SpanAttributes` with only `foreground` set.
    fn fg(color: &str) -> SpanAttributes {
        SpanAttributes {
            foreground: Some(color.to_string()),
            ..Default::default()
        }
    }

    // ----- has_inline_markup: simple tags -----

    #[test]
    fn no_markup_plain_text() {
        assert!(!has_inline_markup("Hello world"));
    }

    #[test]
    fn no_markup_with_angle_bracket() {
        assert!(!has_inline_markup("x < y"));
    }

    #[test]
    fn has_bold_tag() {
        assert!(has_inline_markup("<b>bold</b>"));
    }

    #[test]
    fn has_italic_tag() {
        assert!(has_inline_markup("<i>italic</i>"));
    }

    #[test]
    fn has_long_bold_tag() {
        assert!(has_inline_markup("<bold>text</bold>"));
    }

    #[test]
    fn has_long_italic_tag() {
        assert!(has_inline_markup("<italic>text</italic>"));
    }

    #[test]
    fn has_highlight_tag() {
        assert!(has_inline_markup("<highlight>text</highlight>"));
    }

    #[test]
    fn has_comment_tag() {
        assert!(has_inline_markup("<comment>text</comment>"));
    }

    // ----- parse_inline_markup: simple tags -----

    #[test]
    fn parse_plain_text() {
        assert_eq!(parse_inline_markup("Hello world"), vec![plain("Hello world")]);
    }

    #[test]
    fn parse_empty_text() {
        assert_eq!(parse_inline_markup(""), Vec::<TextSpan>::new());
    }

    #[test]
    fn parse_bold_short() {
        assert_eq!(
            parse_inline_markup("<b>bold</b>"),
            vec![TextSpan::Bold(vec![plain("bold")])]
        );
    }

    #[test]
    fn parse_bold_long() {
        assert_eq!(
            parse_inline_markup("<bold>bold</bold>"),
            vec![TextSpan::Bold(vec![plain("bold")])]
        );
    }

    #[test]
    fn parse_italic_short() {
        assert_eq!(
            parse_inline_markup("<i>italic</i>"),
            vec![TextSpan::Italic(vec![plain("italic")])]
        );
    }

    #[test]
    fn parse_italic_long() {
        assert_eq!(
            parse_inline_markup("<italic>italic</italic>"),
            vec![TextSpan::Italic(vec![plain("italic")])]
        );
    }

    #[test]
    fn parse_highlight() {
        assert_eq!(
            parse_inline_markup("<highlight>highlighted</highlight>"),
            vec![TextSpan::Highlight(vec![plain("highlighted")])]
        );
    }

    #[test]
    fn parse_comment() {
        assert_eq!(
            parse_inline_markup("<comment>commented</comment>"),
            vec![TextSpan::Comment(vec![plain("commented")])]
        );
    }

    #[test]
    fn parse_text_before_and_after_tag() {
        assert_eq!(
            parse_inline_markup("Hello <b>world</b> foo"),
            vec![
                plain("Hello "),
                TextSpan::Bold(vec![plain("world")]),
                plain(" foo"),
            ]
        );
    }

    #[test]
    fn parse_multiple_tags() {
        assert_eq!(
            parse_inline_markup("<b>bold</b> and <i>italic</i>"),
            vec![
                TextSpan::Bold(vec![plain("bold")]),
                plain(" and "),
                TextSpan::Italic(vec![plain("italic")]),
            ]
        );
    }

    #[test]
    fn parse_nested_bold_italic() {
        assert_eq!(
            parse_inline_markup("<b><i>both</i></b>"),
            vec![TextSpan::Bold(vec![TextSpan::Italic(vec![plain("both")])])]
        );
    }

    #[test]
    fn parse_nested_with_surrounding_text() {
        assert_eq!(
            parse_inline_markup("<b>bold <i>and italic</i> text</b>"),
            vec![TextSpan::Bold(vec![
                plain("bold "),
                TextSpan::Italic(vec![plain("and italic")]),
                plain(" text"),
            ])]
        );
    }

    #[test]
    fn parse_case_insensitive_tags() {
        assert_eq!(
            parse_inline_markup("<B>bold</B>"),
            vec![TextSpan::Bold(vec![plain("bold")])]
        );
    }

    #[test]
    fn parse_mixed_case_tags() {
        assert_eq!(
            parse_inline_markup("<Bold>text</Bold>"),
            vec![TextSpan::Bold(vec![plain("text")])]
        );
    }

    // ----- malformed or unusual input -----

    #[test]
    fn unclosed_tag_wraps_remaining_text() {
        assert_eq!(
            parse_inline_markup("<b>unclosed"),
            vec![TextSpan::Bold(vec![plain("unclosed")])]
        );
    }

    #[test]
    fn depth_limit_prevents_stack_overflow() {
        let open_tags: String = "<b>".repeat(MAX_NESTING_DEPTH + 1);
        let close_tags: String = "</b>".repeat(MAX_NESTING_DEPTH + 1);
        let input = format!("{}text{}", open_tags, close_tags);
        assert!(!parse_inline_markup(&input).is_empty());
    }

    #[test]
    fn unrecognized_tag_treated_as_plain() {
        assert_eq!(
            parse_inline_markup("<unknown>text</unknown>"),
            vec![plain("<unknown>text</unknown>")]
        );
    }

    #[test]
    fn lone_angle_bracket_is_plain() {
        assert_eq!(parse_inline_markup("x < y"), vec![plain("x < y")]);
    }

    #[test]
    fn stray_closing_tag_is_plain() {
        assert_eq!(
            parse_inline_markup("text </b> more"),
            vec![plain("text </b> more")]
        );
    }

    // ----- span tags -----

    #[test]
    fn has_span_tag_no_attrs() {
        assert!(has_inline_markup("<span>text</span>"));
    }

    #[test]
    fn has_span_tag_with_attrs() {
        assert!(has_inline_markup(r#"<span foreground="red">text</span>"#));
    }

    #[test]
    fn has_span_closing_tag_only() {
        assert!(has_inline_markup("text </span> more"));
    }

    #[test]
    fn parse_span_no_attrs() {
        assert_eq!(
            parse_inline_markup("<span>styled</span>"),
            vec![TextSpan::Span(
                SpanAttributes::default(),
                vec![plain("styled")]
            )]
        );
    }

    #[test]
    fn parse_span_single_attr() {
        assert_eq!(
            parse_inline_markup(r#"<span foreground="red">text</span>"#),
            vec![TextSpan::Span(fg("red"), vec![plain("text")])]
        );
    }

    #[test]
    fn parse_span_multiple_attrs() {
        let spans = parse_inline_markup(
            r#"<span font_family="Serif" size="12" foreground="blue" background="yellow" weight="bold" style="italic">text</span>"#,
        );
        let expected_attrs = SpanAttributes {
            font_family: Some("Serif".to_string()),
            size: Some("12".to_string()),
            foreground: Some("blue".to_string()),
            background: Some("yellow".to_string()),
            weight: Some("bold".to_string()),
            style: Some("italic".to_string()),
        };
        assert_eq!(
            spans,
            vec![TextSpan::Span(expected_attrs, vec![plain("text")])]
        );
    }

    #[test]
    fn parse_span_single_quoted_attrs() {
        assert_eq!(
            parse_inline_markup("<span foreground='green'>text</span>"),
            vec![TextSpan::Span(fg("green"), vec![plain("text")])]
        );
    }

    #[test]
    fn parse_span_color_alias() {
        assert_eq!(
            parse_inline_markup(r#"<span color="red">text</span>"#),
            vec![TextSpan::Span(fg("red"), vec![plain("text")])]
        );
    }

    #[test]
    fn parse_span_case_insensitive() {
        assert_eq!(
            parse_inline_markup(r#"<SPAN Foreground="red">text</SPAN>"#),
            vec![TextSpan::Span(fg("red"), vec![plain("text")])]
        );
    }

    #[test]
    fn parse_span_nested_inside_bold() {
        assert_eq!(
            parse_inline_markup(r#"<b><span foreground="red">text</span></b>"#),
            vec![TextSpan::Bold(vec![TextSpan::Span(
                fg("red"),
                vec![plain("text")]
            )])]
        );
    }

    #[test]
    fn parse_bold_nested_inside_span() {
        assert_eq!(
            parse_inline_markup(r#"<span foreground="red"><b>text</b></span>"#),
            vec![TextSpan::Span(
                fg("red"),
                vec![TextSpan::Bold(vec![plain("text")])]
            )]
        );
    }

    #[test]
    fn parse_span_with_surrounding_text() {
        assert_eq!(
            parse_inline_markup(r#"Hello <span foreground="red">world</span> foo"#),
            vec![
                plain("Hello "),
                TextSpan::Span(fg("red"), vec![plain("world")]),
                plain(" foo"),
            ]
        );
    }

    #[test]
    fn parse_span_unclosed_wraps_remaining() {
        assert_eq!(
            parse_inline_markup(r#"<span foreground="red">unclosed"#),
            vec![TextSpan::Span(fg("red"), vec![plain("unclosed")])]
        );
    }

    #[test]
    fn parse_span_unknown_attrs_ignored() {
        assert_eq!(
            parse_inline_markup(r#"<span unknown="val" foreground="red">text</span>"#),
            vec![TextSpan::Span(fg("red"), vec![plain("text")])]
        );
    }

    // ----- plain-text extraction -----

    #[test]
    fn plain_text_extraction_simple() {
        let spans = vec![plain("hello")];
        assert_eq!(spans_to_plain_text(&spans), "hello");
    }

    #[test]
    fn plain_text_extraction_with_markup() {
        let spans = vec![plain("Hello "), TextSpan::Bold(vec![plain("world")])];
        assert_eq!(spans_to_plain_text(&spans), "Hello world");
    }

    #[test]
    fn plain_text_extraction_nested() {
        let spans = vec![TextSpan::Bold(vec![TextSpan::Italic(vec![plain(
            "nested",
        )])])];
        assert_eq!(spans_to_plain_text(&spans), "nested");
    }

    #[test]
    fn plain_text_extraction_span() {
        let spans = vec![TextSpan::Span(fg("red"), vec![plain("colored")])];
        assert_eq!(spans_to_plain_text(&spans), "colored");
    }

    // ----- multi-byte characters next to `<` must not cause a slicing panic -----

    #[test]
    fn has_inline_markup_does_not_panic_on_multibyte_after_lt() {
        assert!(!has_inline_markup("<abc\u{0300}xyz"));
    }

    #[test]
    fn has_inline_markup_does_not_panic_on_multibyte_after_slash() {
        assert!(!has_inline_markup("</abc\u{0300}xyz"));
    }

    #[test]
    fn has_inline_markup_does_not_panic_on_emoji_after_lt() {
        assert!(!has_inline_markup("<abc🎸xyz"));
    }

    #[test]
    fn has_inline_markup_does_not_panic_on_cjk_after_slash() {
        assert!(!has_inline_markup("</こんにちは"));
    }

    #[test]
    fn parse_inline_markup_does_not_panic_on_multibyte_adjacent_to_lt() {
        assert_eq!(
            parse_inline_markup("<abc\u{0300}xyz"),
            vec![plain("<abc\u{0300}xyz")]
        );
    }

    #[test]
    fn parse_inline_markup_does_not_panic_with_real_tag_and_multibyte() {
        let spans = parse_inline_markup("<b>漢字<abc\u{0300}xyz</b>");
        assert!(matches!(spans.as_slice(), [TextSpan::Bold(_)]));
    }
}