pub mod entities;
pub mod inline;
pub mod tokenizer;
pub use entities::decode_html_entities;
pub use inline::{format_line, InlineElement, InlineParser};
pub use tokenizer::{cjk_count, is_cjk, not_text, Token, Tokenizer};
use regex::Regex;
use std::sync::LazyLock;
use streamdown_core::{BlockType, Code, ListType, ParseState};
/// Opening code fence: three or more backticks or tildes, or a literal `<pre>`,
/// optionally followed by a language tag. Group 1 is the fence, group 2 the tag.
static CODE_FENCE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*(```+|~~~+|<pre>)\s*([^\s]*)\s*$").unwrap());

/// Closing code fence: backticks, tildes or `</pre>` alone on a line. Group 1 is the fence.
static CODE_FENCE_END_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*(```+|~~~+|</pre>)\s*$").unwrap());

/// First line of a space-indented code block: four leading spaces, optional extra
/// whitespace, then a first visible character that is not `*`.
///
/// The indent is written as ` {4}` so that it agrees with the four-column handling
/// in the parser (`indent: 4`, stripping the first four bytes, and ending the block
/// when a line is indented by fewer than four) and cannot be silently altered by
/// whitespace normalisation of the source.
static SPACE_CODE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^ {4}\s*[^\s*]").unwrap());

/// ATX heading: one to six `#`, whitespace, then the text. Group 1 is the hashes,
/// group 2 the heading content.
static HEADING_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(#{1,6})\s+(.*)$").unwrap());

/// List item. Group 1 is the leading whitespace, group 2 the bullet
/// (`+`, `*`, `-`, `+` followed by dashes, or digits followed by `.`),
/// group 3 the item text.
static LIST_ITEM_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(\s*)([+*-]|\+-+|\d+\.)\s+(.*)$").unwrap());

/// Blockquote or think-block marker. Group 1 is the marker (a run of `>` or a
/// think tag in ASCII or `◁…▷` form), group 3 the rest of the line.
static BLOCK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*((>\s*)+|[◁<].?think[>▷]|</?.?think[>▷]?)(.*)$").unwrap());

/// Horizontal rule: three or more `-`, `*` or `_`, optionally followed by whitespace.
static HR_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(---+|\*\*\*+|___+)\s*$").unwrap());

/// Table row: a line that starts and ends with `|`. Group 1 is everything between
/// the outer pipes.
static TABLE_ROW_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^\s*\|(.+)\|\s*$").unwrap());

/// Table separator content: only whitespace, `|`, `:` and `-`.
static TABLE_SEP_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^[\s|:-]+$").unwrap());
/// The marker that introduces a list item.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ListBullet {
    /// A `-` bullet.
    Dash,
    /// A `*` bullet.
    Asterisk,
    /// A `+` bullet.
    Plus,
    /// A `+` followed by one or more `-` (for example `+--`).
    PlusExpand,
    /// A decimal number followed by `.`; carries the number.
    Ordered(usize),
}

impl ListBullet {
    /// Parses a bullet marker, ignoring surrounding whitespace.
    ///
    /// Returns `None` when `s` is not one of the recognised markers. An ordered
    /// bullet must be ASCII digits followed by exactly one `.`; inputs such as
    /// `"1.."` or `"+5."` are rejected, as are numbers that do not fit in `usize`.
    pub fn parse(s: &str) -> Option<Self> {
        let s = s.trim();
        match s {
            "-" => return Some(Self::Dash),
            "*" => return Some(Self::Asterisk),
            "+" => return Some(Self::Plus),
            _ => {}
        }

        if let Some(dashes) = s.strip_prefix('+') {
            if !dashes.is_empty() && dashes.chars().all(|c| c == '-') {
                return Some(Self::PlusExpand);
            }
        }

        // Strip exactly one trailing dot; `trim_end_matches` would accept "1...".
        let digits = s.strip_suffix('.')?;
        // `usize::from_str` accepts a leading '+', so check the digits explicitly.
        if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
            return None;
        }
        digits.parse().ok().map(Self::Ordered)
    }

    /// Returns `true` for [`ListBullet::Ordered`].
    pub fn is_ordered(&self) -> bool {
        matches!(self, Self::Ordered(_))
    }
}
/// Position of the parser inside a table.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TableState {
    /// Rows seen so far precede the separator line.
    Header,
    /// The separator line has been seen; further rows are body rows.
    Body,
}
/// An event produced while parsing Markdown line by line.
///
/// The first group of variants mirrors the inline elements produced by the
/// inline parser; the remaining variants describe block structure.
#[derive(Debug, Clone, PartialEq)]
pub enum ParseEvent {
    /// Plain text.
    Text(String),
    /// Inline code span.
    InlineCode(String),
    /// Bold text.
    Bold(String),
    /// Italic text.
    Italic(String),
    /// Underlined text.
    Underline(String),
    /// Struck-out text.
    Strikeout(String),
    /// Text that is both bold and italic.
    BoldItalic(String),
    /// Hyperlink with its display text and target.
    Link { text: String, url: String },
    /// Image with its alt text and source.
    Image { alt: String, url: String },
    /// Footnote reference.
    Footnote(String),
    /// Heading; `level` is the number of leading `#` characters (1 to 6).
    Heading { level: u8, content: String },
    /// Start of a code block; `indent` is the width of the leading whitespace
    /// of the fence line, or 4 for space-indented code.
    CodeBlockStart { language: Option<String>, indent: usize },
    /// One line of code inside a code block.
    CodeBlockLine(String),
    /// End of a code block.
    CodeBlockEnd,
    /// One list item; `indent` is the width of its leading whitespace.
    ListItem { indent: usize, bullet: ListBullet, content: String },
    /// End of the current list (all nesting levels).
    ListEnd,
    /// Table row that precedes the separator line.
    TableHeader(Vec<String>),
    /// Table row that follows the separator line.
    TableRow(Vec<String>),
    /// The separator line between table header and body.
    TableSeparator,
    /// End of the current table.
    TableEnd,
    /// Blockquote nesting increased to `depth`.
    BlockquoteStart { depth: usize },
    /// One line of blockquote content with the `>` markers removed.
    BlockquoteLine(String),
    /// End of the blockquote (all nesting levels).
    BlockquoteEnd,
    /// Start of a think block.
    ThinkBlockStart,
    /// One line inside a think block.
    ThinkBlockLine(String),
    /// End of a think block.
    ThinkBlockEnd,
    /// Horizontal rule.
    HorizontalRule,
    /// An empty line; consecutive empty lines produce only one event.
    EmptyLine,
    /// End of a line of inline content.
    Newline,
    // NOTE(review): not emitted anywhere in this file; presumably produced by callers — confirm.
    Prompt(String),
    // NOTE(review): not emitted anywhere in this file; presumably a pre-parsed inline run — confirm.
    InlineElements(Vec<InlineElement>),
}
impl ParseEvent {
pub fn is_block(&self) -> bool {
!self.is_inline()
}
pub fn is_inline(&self) -> bool {
matches!(
self,
ParseEvent::Text(_)
| ParseEvent::InlineCode(_)
| ParseEvent::Bold(_)
| ParseEvent::Italic(_)
| ParseEvent::Underline(_)
| ParseEvent::Strikeout(_)
| ParseEvent::BoldItalic(_)
| ParseEvent::Link { .. }
| ParseEvent::Image { .. }
| ParseEvent::Footnote(_)
)
}
}
/// Line-oriented Markdown parser that turns each input line into a list of
/// [`ParseEvent`]s while tracking block context between lines.
#[derive(Debug)]
pub struct Parser {
    /// Shared parse state (code/list/table/blockquote context and settings).
    state: ParseState,
    /// Parser used for inline content of ordinary text lines.
    inline_parser: InlineParser,
    /// Fence text that opened the current fenced code block, if any.
    code_fence: Option<String>,
    /// Where we are inside a table, or `None` outside of one.
    table_state: Option<TableState>,
    /// Events collected for the line currently being parsed.
    events: Vec<ParseEvent>,
    /// Whether the previous line handled outside code and think blocks was empty;
    /// used to collapse runs of empty lines and to gate space-indented code.
    prev_was_empty: bool,
}
impl Default for Parser {
fn default() -> Self {
Self::new()
}
}
impl Parser {
/// Creates a parser with a fresh [`ParseState`] and a default inline parser.
pub fn new() -> Self {
    let state = ParseState::new();
    let inline_parser = InlineParser::new();
    Self {
        state,
        inline_parser,
        events: Vec::new(),
        code_fence: None,
        table_state: None,
        prev_was_empty: false,
    }
}
/// Creates a parser that starts from an existing [`ParseState`].
///
/// The inline parser is configured from the state's `links` and `images` flags.
pub fn with_state(state: ParseState) -> Self {
    let (links, images) = (state.links, state.images);
    Self {
        inline_parser: InlineParser::with_settings(links, images),
        state,
        events: Vec::new(),
        code_fence: None,
        table_state: None,
        prev_was_empty: false,
    }
}
pub fn state(&self) -> &ParseState { &self.state }
pub fn state_mut(&mut self) -> &mut ParseState { &mut self.state }
/// Enables or disables link processing, keeping the parse state and the
/// inline parser in agreement.
pub fn set_process_links(&mut self, enabled: bool) {
    self.inline_parser.process_links = enabled;
    self.state.links = enabled;
}
/// Enables or disables image processing, keeping the parse state and the
/// inline parser in agreement.
pub fn set_process_images(&mut self, enabled: bool) {
    self.inline_parser.process_images = enabled;
    self.state.images = enabled;
}
/// Enables or disables recognition of space-indented code blocks.
pub fn set_code_spaces(&mut self, enabled: bool) {
    let state = &mut self.state;
    state.code_spaces = enabled;
}
/// Parses one line (without its line terminator) and returns the events it produced.
///
/// Lines inside a code block or think block are routed to the dedicated handlers.
/// Otherwise the line is tried, in order, as: empty line, space-indented code,
/// code fence, blockquote/think marker, heading, horizontal rule, list item and
/// table row. A line that matches none of these ends any open list or table and
/// is parsed as inline content.
pub fn parse_line(&mut self, line: &str) -> Vec<ParseEvent> {
    self.events.clear();

    // Verbatim contexts take the raw line and bypass all other detection.
    if self.state.is_in_code() {
        self.parse_in_code_block(line);
        return self.take_events();
    }
    if self.state.block_type == Some(BlockType::Think) {
        self.parse_in_think_block(line);
        return self.take_events();
    }

    if line.trim().is_empty() {
        return self.handle_empty_line();
    }

    // Remember whether the previous line was empty, then clear the flags.
    let was_prev_empty = std::mem::replace(&mut self.prev_was_empty, false);
    self.state.last_line_empty = false;

    // Space-indented code is checked on the raw line, before indent stripping.
    if self.try_parse_space_code(line, was_prev_empty) {
        return self.take_events();
    }

    let stripped = self.strip_first_indent(line);

    // `||` short-circuits, so the first recogniser that accepts the line wins.
    let recognised = self.try_parse_code_fence(&stripped)
        || self.try_parse_block(&stripped)
        || self.try_parse_heading(&stripped)
        || self.try_parse_hr(&stripped)
        || self.try_parse_list_item(&stripped)
        || self.try_parse_table(&stripped);

    if !recognised {
        self.exit_block_contexts();
        self.parse_inline_content(&stripped);
    }
    self.take_events()
}
/// Moves the collected events out, leaving an empty buffer behind.
fn take_events(&mut self) -> Vec<ParseEvent> {
    let buffer = &mut self.events;
    std::mem::take(buffer)
}
/// Removes the document's base indentation from `line`.
///
/// The base indentation is the byte width of the leading whitespace of the first
/// non-blank line seen; it is recorded in `state.first_indent` on that line. A
/// line is shortened only when it is indented at least that much, otherwise it
/// is returned unchanged.
fn strip_first_indent(&mut self, line: &str) -> String {
    let current_indent = line.len() - line.trim_start().len();

    if self.state.first_indent.is_none() && !line.trim().is_empty() {
        self.state.first_indent = Some(current_indent);
    }

    match self.state.first_indent {
        Some(first_indent) if first_indent > 0 && current_indent >= first_indent => {
            // Widths are in bytes and leading whitespace may be multi-byte
            // (e.g. U+3000), so the offset can fall inside a character. `get`
            // returns `None` there instead of panicking; keep the line as is.
            line.get(first_indent..).unwrap_or(line).to_string()
        }
        _ => line.to_string(),
    }
}
/// Handles an empty (whitespace-only) line outside code and think blocks.
///
/// A second consecutive empty line produces no events. Otherwise any open
/// blockquote, list and table are closed, in that order, and an
/// [`ParseEvent::EmptyLine`] is emitted last.
fn handle_empty_line(&mut self) -> Vec<ParseEvent> {
    if self.prev_was_empty {
        // Collapse runs of empty lines into a single event.
        return Vec::new();
    }
    self.prev_was_empty = true;
    self.state.last_line_empty = true;

    let in_quote =
        self.state.block_depth > 0 && self.state.block_type == Some(BlockType::Quote);
    if in_quote {
        while self.state.block_depth > 0 {
            self.state.exit_block();
        }
        self.events.push(ParseEvent::BlockquoteEnd);
    }

    if self.state.in_list {
        self.exit_list_context();
    }

    // `take` clears the table state and tells us whether a table was open.
    if self.table_state.take().is_some() {
        self.state.in_table = None;
        self.events.push(ParseEvent::TableEnd);
    }

    self.events.push(ParseEvent::EmptyLine);
    self.take_events()
}
/// Closes any open list and table, emitting `ListEnd` and then `TableEnd`.
fn exit_block_contexts(&mut self) {
    if self.state.in_list {
        self.exit_list_context();
    }
    // `take` clears the table state and tells us whether a table was open.
    if self.table_state.take().is_some() {
        self.state.in_table = None;
        self.events.push(ParseEvent::TableEnd);
    }
}
fn parse_in_code_block(&mut self, line: &str) {
if let Some(ref fence) = self.code_fence.clone() {
if let Some(caps) = CODE_FENCE_END_RE.captures(line) {
let end_fence = caps.get(1).map(|m| m.as_str()).unwrap_or("");
let matches = (fence.starts_with('`') && end_fence.starts_with('`'))
|| (fence.starts_with('~') && end_fence.starts_with('~'))
|| (fence == "<pre>" && end_fence == "</pre>");
if matches {
self.events.push(ParseEvent::CodeBlockEnd);
self.state.exit_code_block();
self.code_fence = None;
return;
}
}
}
if self.state.in_code == Some(Code::Spaces) {
let indent = line.len() - line.trim_start().len();
if indent < 4 && !line.trim().is_empty() {
self.events.push(ParseEvent::CodeBlockEnd);
self.state.exit_code_block();
self.parse_inline_content(line);
return;
}
}
let code_line = if self.state.in_code == Some(Code::Spaces) {
if line.len() >= 4 {
line[4..].to_string()
} else {
line.to_string()
}
} else {
line.to_string()
};
self.events.push(ParseEvent::CodeBlockLine(code_line));
}
/// Recognises an opening code fence and enters a code block.
///
/// Records the fence text and the fence line's leading-whitespace width, and
/// emits `CodeBlockStart`. The parse state receives the language tag, or
/// `"text"` when none was given; the event carries `None` in that case.
/// Returns `true` when the line was a fence.
fn try_parse_code_fence(&mut self, line: &str) -> bool {
    let Some(caps) = CODE_FENCE_RE.captures(line) else {
        return false;
    };

    let fence = caps.get(1).map_or("```", |m| m.as_str());
    let language: Option<String> = caps
        .get(2)
        .map(|m| m.as_str())
        .filter(|tag| !tag.is_empty())
        .map(str::to_string);
    let indent = line.len() - line.trim_start().len();

    self.code_fence = Some(fence.to_string());
    self.state.code_indent = indent;

    let state_language = language.clone().unwrap_or_else(|| "text".to_string());
    self.state
        .enter_code_block(Code::Backtick, Some(state_language));

    self.events
        .push(ParseEvent::CodeBlockStart { language, indent });
    true
}
/// Recognises the first line of a space-indented code block.
///
/// Only active when `code_spaces` is enabled, the previous line was empty and
/// no list is open. On a match the parser enters a `Code::Spaces` block with
/// language `"text"` and emits `CodeBlockStart` followed by the first
/// `CodeBlockLine`, with the first four bytes of the line removed when that is
/// possible. Returns `true` when the line was consumed.
fn try_parse_space_code(&mut self, line: &str, was_prev_empty: bool) -> bool {
    if !self.state.code_spaces || !was_prev_empty || self.state.in_list {
        return false;
    }
    if !SPACE_CODE_RE.is_match(line) {
        return false;
    }

    self.state
        .enter_code_block(Code::Spaces, Some("text".to_string()));
    self.events.push(ParseEvent::CodeBlockStart {
        language: Some("text".to_string()),
        indent: 4,
    });

    // `get` yields `None` when the line is shorter than four bytes or when byte 4
    // falls inside a multi-byte character; plain slicing would panic in the
    // latter case. Keep the line unchanged then.
    let code_line = line.get(4..).unwrap_or(line);
    self.events
        .push(ParseEvent::CodeBlockLine(code_line.to_string()));
    true
}
/// Handles a line while a think block is open: a closing tag alone on the line
/// ends the block, anything else is emitted verbatim as a `ThinkBlockLine`.
fn parse_in_think_block(&mut self, line: &str) {
    let is_closing_tag = matches!(line.trim(), "</think>" | "</think▷" | "◁/think▷");
    if !is_closing_tag {
        self.events
            .push(ParseEvent::ThinkBlockLine(line.to_string()));
        return;
    }
    self.events.push(ParseEvent::ThinkBlockEnd);
    self.state.exit_block();
}
/// Recognises think-block tags and blockquote lines.
///
/// * An opening think tag enters a think block; text after the tag on the same
///   line becomes the first `ThinkBlockLine`.
/// * A closing think tag ends the think block if one is open; the line is
///   consumed either way.
/// * A `>` prefix adjusts the blockquote nesting to the number of `>` characters
///   (emitting `BlockquoteStart` only when the depth grows) and emits the rest
///   of the line as a `BlockquoteLine`.
///
/// For any other line an open blockquote is closed and `false` is returned so
/// that the remaining recognisers can run.
fn try_parse_block(&mut self, line: &str) -> bool {
    if let Some(caps) = BLOCK_RE.captures(line) {
        let marker = caps.get(1).map_or("", |m| m.as_str());
        let rest = caps.get(3).map_or("", |m| m.as_str());

        if marker.contains("think") {
            let is_closing = marker.contains('/');
            if !is_closing {
                self.state.enter_block(BlockType::Think);
                self.events.push(ParseEvent::ThinkBlockStart);
                if !rest.trim().is_empty() {
                    self.events
                        .push(ParseEvent::ThinkBlockLine(rest.to_string()));
                }
            } else if self.state.block_type == Some(BlockType::Think) {
                self.events.push(ParseEvent::ThinkBlockEnd);
                self.state.exit_block();
            }
            return true;
        }

        let depth = marker.matches('>').count();
        if depth > 0 {
            // Snapshot the depth: the loops below change it as they run.
            let current = self.state.block_depth;
            if depth > current {
                for _ in current..depth {
                    self.state.enter_block(BlockType::Quote);
                }
                self.events.push(ParseEvent::BlockquoteStart { depth });
            } else if depth < current {
                for _ in depth..current {
                    self.state.exit_block();
                }
            }
            self.events
                .push(ParseEvent::BlockquoteLine(rest.to_string()));
            return true;
        }
    }

    // Not a block line: leave any blockquote we were in.
    let in_quote =
        self.state.block_depth > 0 && self.state.block_type == Some(BlockType::Quote);
    if in_quote {
        while self.state.block_depth > 0 {
            self.state.exit_block();
        }
        self.events.push(ParseEvent::BlockquoteEnd);
    }
    false
}
/// Recognises an ATX heading and emits a `Heading` event whose level is the
/// number of leading `#` characters (capped at 6). Returns `true` on a match.
fn try_parse_heading(&mut self, line: &str) -> bool {
    let Some(caps) = HEADING_RE.captures(line) else {
        return false;
    };

    let hash_count = caps.get(1).map_or(0, |m| m.as_str().len());
    let text = caps.get(2).map_or("", |m| m.as_str());

    self.events.push(ParseEvent::Heading {
        level: hash_count.min(6) as u8,
        content: text.to_string(),
    });
    true
}
/// Recognises a horizontal rule (after trimming the line) and emits
/// `HorizontalRule`. Returns `true` on a match.
fn try_parse_hr(&mut self, line: &str) -> bool {
    let is_rule = HR_RE.is_match(line.trim());
    if is_rule {
        self.events.push(ParseEvent::HorizontalRule);
    }
    is_rule
}
/// Recognises a list item and emits a `ListItem` event.
///
/// The list stack in the parse state is adjusted first: levels indented deeper
/// than this item are popped, and a new level is pushed when the item is
/// indented deeper than the current top (or the stack is empty). Ordered items
/// take their number from the parse state rather than from the source text,
/// falling back to 1. Returns `true` on a match.
fn try_parse_list_item(&mut self, line: &str) -> bool {
    let Some(caps) = LIST_ITEM_RE.captures(line) else {
        return false;
    };

    let indent = caps.get(1).map_or(0, |m| m.as_str().len());
    let bullet_text = caps.get(2).map_or("", |m| m.as_str());
    let text = caps.get(3).map_or("", |m| m.as_str());

    // An unparseable bullet is treated as a dash.
    let parsed = ListBullet::parse(bullet_text).unwrap_or(ListBullet::Dash);
    self.state.list_indent_text = bullet_text.len();

    let list_type = match parsed {
        ListBullet::Ordered(_) => ListType::Ordered,
        _ => ListType::Bullet,
    };

    // Leave every list level that is indented deeper than this item.
    while self
        .state
        .list_item_stack
        .last()
        .is_some_and(|(top, _)| *top > indent)
    {
        self.state.pop_list();
    }

    // Open a new level when this item is nested deeper than the current one.
    let opens_level = self
        .state
        .list_item_stack
        .last()
        .map_or(true, |(top, _)| indent > *top);
    if opens_level {
        self.state.push_list(indent, list_type);
    }

    let bullet = if parsed.is_ordered() {
        ListBullet::Ordered(self.state.next_list_number().unwrap_or(1))
    } else {
        parsed
    };

    self.events.push(ParseEvent::ListItem {
        indent,
        bullet,
        content: text.to_string(),
    });
    true
}
/// Pops list levels until the parse state reports no open list, then emits a
/// single `ListEnd`.
fn exit_list_context(&mut self) {
    loop {
        if !self.state.in_list {
            break;
        }
        self.state.pop_list();
    }
    self.events.push(ParseEvent::ListEnd);
}
/// Recognises a table row.
///
/// The first row of a table is emitted as `TableHeader`. A separator line that
/// directly follows header rows switches the table to its body and is emitted
/// as `TableSeparator`; rows after it are emitted as `TableRow`. A line that is
/// not a table row closes an open table (`TableEnd`) and returns `false`.
fn try_parse_table(&mut self, line: &str) -> bool {
    let Some(caps) = TABLE_ROW_RE.captures(line) else {
        // `take` clears the table state and tells us whether a table was open.
        if self.table_state.take().is_some() {
            self.state.in_table = None;
            self.events.push(ParseEvent::TableEnd);
        }
        return false;
    };
    let inner = caps.get(1).map_or("", |m| m.as_str());

    // A separator only counts while we are still in the header.
    if self.table_state == Some(TableState::Header) && TABLE_SEP_RE.is_match(inner) {
        self.table_state = Some(TableState::Body);
        self.state.in_table = Some(Code::Body);
        self.events.push(ParseEvent::TableSeparator);
        return true;
    }

    let cells: Vec<String> = inner
        .split('|')
        .map(|cell| cell.trim().to_string())
        .collect();

    let event = match self.table_state {
        None => {
            self.table_state = Some(TableState::Header);
            self.state.in_table = Some(Code::Header);
            ParseEvent::TableHeader(cells)
        }
        Some(TableState::Header) => ParseEvent::TableHeader(cells),
        Some(TableState::Body) => ParseEvent::TableRow(cells),
    };
    self.events.push(event);
    true
}
fn parse_inline_content(&mut self, line: &str) {
let elements = self.inline_parser.parse(line);
for element in elements {
let event = match element {
InlineElement::Text(s) => ParseEvent::Text(s),
InlineElement::Bold(s) => ParseEvent::Bold(s),
InlineElement::Italic(s) => ParseEvent::Italic(s),
InlineElement::BoldItalic(s) => ParseEvent::BoldItalic(s),
InlineElement::Underline(s) => ParseEvent::Underline(s),
InlineElement::Strikeout(s) => ParseEvent::Strikeout(s),
InlineElement::Code(s) => ParseEvent::InlineCode(s),
InlineElement::Link { text, url } => ParseEvent::Link { text, url },
InlineElement::Image { alt, url } => ParseEvent::Image { alt, url },
InlineElement::Footnote(s) => ParseEvent::Footnote(s),
};
self.events.push(event);
}
self.events.push(ParseEvent::Newline);
}
/// Parses a whole document: every line of `content` is fed through
/// [`Parser::parse_line`], then [`Parser::finalize`] closes whatever is still
/// open. Returns all events in order.
pub fn parse_document(&mut self, content: &str) -> Vec<ParseEvent> {
    let mut all_events: Vec<ParseEvent> = content
        .lines()
        .flat_map(|line| self.parse_line(line))
        .collect();
    all_events.extend(self.finalize());
    all_events
}
/// Closes every context that is still open at the end of the input and returns
/// the closing events, in this order: code block, think block, blockquote,
/// list, table.
pub fn finalize(&mut self) -> Vec<ParseEvent> {
    self.events.clear();

    if self.state.is_in_code() {
        self.events.push(ParseEvent::CodeBlockEnd);
        self.state.exit_code_block();
        self.code_fence = None;
    }

    let in_think = self.state.block_type == Some(BlockType::Think);
    if in_think {
        self.events.push(ParseEvent::ThinkBlockEnd);
        self.state.exit_block();
    }

    // Any depth that remains is unwound as a blockquote.
    if self.state.block_depth > 0 {
        self.events.push(ParseEvent::BlockquoteEnd);
        while self.state.block_depth > 0 {
            self.state.exit_block();
        }
    }

    if self.state.in_list {
        self.exit_list_context();
    }

    // `take` clears the table state and tells us whether a table was open.
    if self.table_state.take().is_some() {
        self.state.in_table = None;
        self.events.push(ParseEvent::TableEnd);
    }

    self.take_events()
}
/// Returns the parser to its initial condition: fresh parse state, reset
/// inline parser, no open fence or table, and no pending events.
// NOTE(review): the parse state is replaced wholesale while the inline parser is
// only `reset()`; whether link/image settings stay in sync afterwards depends on
// `InlineParser::reset` — confirm.
pub fn reset(&mut self) {
    self.events.clear();
    self.code_fence = None;
    self.table_state = None;
    self.prev_was_empty = false;
    self.inline_parser.reset();
    self.state = ParseState::new();
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A `#` heading yields a level-1 `Heading` with its text.
    #[test]
    fn test_parse_heading() {
        let mut parser = Parser::new();
        let events = parser.parse_line("# Hello World");
        assert!(events.iter().any(|e| matches!(
            e, ParseEvent::Heading { level: 1, content } if content == "Hello World"
        )));
    }

    /// A backtick fence opens a block, passes lines through, and closes again.
    #[test]
    fn test_parse_code_block() {
        let mut parser = Parser::new();
        let e1 = parser.parse_line("```rust");
        assert!(e1.iter().any(|e| matches!(
            e, ParseEvent::CodeBlockStart { language: Some(l), .. } if l == "rust"
        )));
        let e2 = parser.parse_line("let x = 1;");
        assert!(e2
            .iter()
            .any(|e| matches!(e, ParseEvent::CodeBlockLine(s) if s == "let x = 1;")));
        let e3 = parser.parse_line("```");
        assert!(e3.iter().any(|e| matches!(e, ParseEvent::CodeBlockEnd)));
    }

    /// `<pre>` / `</pre>` behave like a code fence.
    #[test]
    fn test_parse_pre_tag() {
        let mut parser = Parser::new();
        let e1 = parser.parse_line("<pre>");
        assert!(e1.iter().any(|e| matches!(e, ParseEvent::CodeBlockStart { .. })));
        let e2 = parser.parse_line("code");
        assert!(e2.iter().any(|e| matches!(e, ParseEvent::CodeBlockLine(_))));
        let e3 = parser.parse_line("</pre>");
        assert!(e3.iter().any(|e| matches!(e, ParseEvent::CodeBlockEnd)));
    }

    /// After an empty line, a line indented by four spaces starts a code block
    /// and the four-space indent is removed from the emitted code line.
    #[test]
    fn test_space_indented_code() {
        let mut parser = Parser::new();
        parser.set_code_spaces(true);
        parser.parse_line("");
        // Four spaces: the parser strips exactly four bytes from the code line.
        let events = parser.parse_line("    let x = 1;");
        assert!(events.iter().any(|e| matches!(e, ParseEvent::CodeBlockStart { .. })));
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::CodeBlockLine(s) if s == "let x = 1;")));
    }

    /// Consecutive empty lines collapse into a single `EmptyLine` event.
    #[test]
    fn test_empty_line_collapsing() {
        let mut parser = Parser::new();
        let e1 = parser.parse_line("");
        assert!(e1.iter().any(|e| matches!(e, ParseEvent::EmptyLine)));
        let e2 = parser.parse_line("");
        assert!(e2.is_empty());
        let e3 = parser.parse_line("text");
        assert!(!e3.is_empty());
        let e4 = parser.parse_line("");
        assert!(e4.iter().any(|e| matches!(e, ParseEvent::EmptyLine)));
    }

    /// The `◁think▷` form opens a think block.
    #[test]
    fn test_parse_think_block_unicode() {
        let mut parser = Parser::new();
        let e1 = parser.parse_line("◁think▷");
        assert!(e1.iter().any(|e| matches!(e, ParseEvent::ThinkBlockStart)));
    }

    /// A dash bullet yields a `ListItem` with the item text.
    #[test]
    fn test_parse_list() {
        let mut parser = Parser::new();
        let events = parser.parse_line("- Item one");
        assert!(events.iter().any(|e| matches!(
            e, ParseEvent::ListItem { bullet: ListBullet::Dash, content, .. } if content == "Item one"
        )));
    }

    /// A nested item reports the width of its own leading whitespace.
    #[test]
    fn test_parse_nested_list() {
        let mut parser = Parser::new();
        parser.parse_line("- Item 1");
        // Two spaces of indent, matching the `indent: 2` expectation below.
        let e2 = parser.parse_line("  - Nested");
        assert!(e2
            .iter()
            .any(|e| matches!(e, ParseEvent::ListItem { indent: 2, .. })));
    }

    /// Ordered items are numbered consecutively.
    #[test]
    fn test_parse_ordered_list_numbering() {
        let mut parser = Parser::new();
        parser.parse_line("1. First");
        let e2 = parser.parse_line("2. Second");
        assert!(e2.iter().any(|e| matches!(
            e,
            ParseEvent::ListItem { bullet: ListBullet::Ordered(2), .. }
        )));
    }

    /// A `>` line yields a `BlockquoteLine` without the marker.
    #[test]
    fn test_parse_blockquote() {
        let mut parser = Parser::new();
        let events = parser.parse_line("> Quote text");
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::BlockquoteLine(s) if s == "Quote text")));
    }

    /// `>>` opens a blockquote at depth 2.
    #[test]
    fn test_parse_nested_blockquote() {
        let mut parser = Parser::new();
        let events = parser.parse_line(">> Nested quote");
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::BlockquoteStart { depth: 2 })));
    }

    /// All three rule characters are recognised.
    #[test]
    fn test_parse_hr() {
        let mut parser = Parser::new();
        assert!(parser
            .parse_line("---")
            .iter()
            .any(|e| matches!(e, ParseEvent::HorizontalRule)));
        assert!(parser
            .parse_line("***")
            .iter()
            .any(|e| matches!(e, ParseEvent::HorizontalRule)));
        assert!(parser
            .parse_line("___")
            .iter()
            .any(|e| matches!(e, ParseEvent::HorizontalRule)));
    }

    /// Header row, separator and body row produce their respective events.
    #[test]
    fn test_parse_table() {
        let mut parser = Parser::new();
        let e1 = parser.parse_line("| A | B | C |");
        assert!(e1.iter().any(|e| matches!(e, ParseEvent::TableHeader(_))));
        let e2 = parser.parse_line("|---|---|---|");
        assert!(e2.iter().any(|e| matches!(e, ParseEvent::TableSeparator)));
        let e3 = parser.parse_line("| 1 | 2 | 3 |");
        assert!(e3.iter().any(|e| matches!(e, ParseEvent::TableRow(_))));
    }

    /// `<think>` … `</think>` opens, carries and closes a think block.
    #[test]
    fn test_parse_think_block() {
        let mut parser = Parser::new();
        let e1 = parser.parse_line("<think>");
        assert!(e1.iter().any(|e| matches!(e, ParseEvent::ThinkBlockStart)));
        let e2 = parser.parse_line("Thinking...");
        assert!(e2
            .iter()
            .any(|e| matches!(e, ParseEvent::ThinkBlockLine(s) if s == "Thinking...")));
        let e3 = parser.parse_line("</think>");
        assert!(e3.iter().any(|e| matches!(e, ParseEvent::ThinkBlockEnd)));
    }

    /// Indentation of the first non-empty line is stripped before block detection.
    #[test]
    fn test_first_indent_stripping() {
        let mut parser = Parser::new();
        let e1 = parser.parse_line(" # Hello");
        assert!(e1.iter().any(|e| matches!(
            e, ParseEvent::Heading { level: 1, content } if content == "Hello"
        )));
    }

    /// `parse_document` handles a multi-line document end to end.
    #[test]
    fn test_parse_document() {
        let mut parser = Parser::new();
        let doc = "# Title\n\nSome text.\n\n```\ncode\n```";
        let events = parser.parse_document(doc);
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::Heading { level: 1, .. })));
        assert!(events
            .iter()
            .any(|e| matches!(e, ParseEvent::CodeBlockStart { .. })));
        assert!(events.iter().any(|e| matches!(e, ParseEvent::CodeBlockEnd)));
    }

    /// `finalize` closes a code block left open at end of input.
    #[test]
    fn test_finalize_closes_blocks() {
        let mut parser = Parser::new();
        parser.parse_line("```");
        parser.parse_line("code");
        let events = parser.finalize();
        assert!(events.iter().any(|e| matches!(e, ParseEvent::CodeBlockEnd)));
    }

    /// Spot-checks the block/inline classification of events.
    #[test]
    fn test_is_block_is_inline() {
        assert!(ParseEvent::Heading { level: 1, content: "x".to_string() }.is_block());
        assert!(ParseEvent::CodeBlockStart { language: None, indent: 0 }.is_block());
        assert!(ParseEvent::Text("x".to_string()).is_inline());
        assert!(ParseEvent::Bold("x".to_string()).is_inline());
    }
}