use regex::Regex;
use std::sync::LazyLock;
/// Splits inline text into markdown marker candidates and plain-text runs.
///
/// Alternatives are ordered longest-first within each family so that e.g.
/// `***` wins over `**` (the regex crate's alternation prefers the branch
/// listed first). The lone `~` alternative catches a single tilde that is
/// not part of `~~`, and the final character class swallows runs of
/// ordinary text; together they make the matches cover every character,
/// so nothing is silently dropped during tokenization. (Previously a
/// single `~` matched no branch and vanished from the token stream.)
static INLINE_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(```+|~~|\*\*\*|\*\*_|_\*\*|\*\*|\*|___|__|_|`+|~|[^~_*`]+)").unwrap()
});
/// Matches an inline link `[text](url)`; group 1 = link text, group 2 = url.
/// Note: this also matches the `[alt](url)` tail of an image — callers must
/// check for a preceding `!` (as `tokenize_with_extractions` does).
static LINK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^\)]+)\)").unwrap());
/// Matches an image `![alt](url)`; group 1 = alt text (may be empty),
/// group 2 = url.
static IMAGE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^\)]+)\)").unwrap());
/// Matches a footnote reference `[^N]` or a definition marker `[^N]:`
/// (the optional trailing colon is consumed); group 1 = the numeric label.
static FOOTNOTE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\^(\d+)\]:?").unwrap());
/// Matches inline code spans delimited by double or single backticks, so
/// that link/image/footnote syntax inside them can be left untouched.
/// Note the span contents may not contain backticks in either form.
static CODE_SPAN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"``[^`]+``|`[^`]+`").unwrap());
/// Returns the byte ranges `(start, end)` of every inline code span in
/// `line`, left to right, as found by `CODE_SPAN_RE`.
fn find_code_regions(line: &str) -> Vec<(usize, usize)> {
    let mut regions = Vec::new();
    for span in CODE_SPAN_RE.find_iter(line) {
        regions.push((span.start(), span.end()));
    }
    regions
}
/// One lexical unit of a markdown line.
///
/// Marker variants carry no payload and correspond to a literal delimiter
/// string; content variants (`Text`, `Link`, `Image`, `Footnote`) carry
/// extracted data.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A run of characters with no markdown meaning.
    Text(String),
    /// `***`
    TripleAsterisk,
    /// `**`
    DoubleAsterisk,
    /// `*`
    Asterisk,
    /// `___`
    TripleUnderscore,
    /// `__`
    DoubleUnderscore,
    /// `_`
    Underscore,
    /// `**_`
    DoubleAsteriskUnderscore,
    /// `_**`
    UnderscoreDoubleAsterisk,
    /// `~~` (strikethrough)
    DoubleTilde,
    /// A run of backticks; the payload is the run length.
    Backticks(usize),
    /// `[text](url)`
    Link { text: String, url: String },
    /// `![alt](url)`
    Image { alt: String, url: String },
    /// `[^N]` reference or `[^N]:` definition marker; payload is N.
    Footnote(u32),
}
impl Token {
    /// True for formatting markers (emphasis, strikethrough, backticks);
    /// false for content tokens (text, links, images, footnotes).
    pub fn is_marker(&self) -> bool {
        !matches!(
            self,
            Token::Text(_) | Token::Link { .. } | Token::Image { .. } | Token::Footnote(_)
        )
    }
    /// The literal markdown text of a fixed-width marker token.
    ///
    /// Returns `None` for content tokens and for `Backticks`, whose
    /// rendered form depends on the run length and therefore has no
    /// `&'static str` representation.
    pub fn marker_str(&self) -> Option<&'static str> {
        // Exhaustive match (no `_` arm, which also made the explicit
        // `Backticks` arm redundant): adding a new variant becomes a
        // compile error here instead of silently returning `None`.
        match self {
            Token::TripleAsterisk => Some("***"),
            Token::DoubleAsterisk => Some("**"),
            Token::Asterisk => Some("*"),
            Token::TripleUnderscore => Some("___"),
            Token::DoubleUnderscore => Some("__"),
            Token::Underscore => Some("_"),
            Token::DoubleAsteriskUnderscore => Some("**_"),
            Token::UnderscoreDoubleAsterisk => Some("_**"),
            Token::DoubleTilde => Some("~~"),
            Token::Backticks(_)
            | Token::Text(_)
            | Token::Link { .. }
            | Token::Image { .. }
            | Token::Footnote(_) => None,
        }
    }
}
/// Configurable tokenizer for one line of markdown.
///
/// NOTE(review): the derived `Default` leaves both flags `false`, while
/// `Tokenizer::new()` enables both — confirm that asymmetry is intended.
#[derive(Debug, Default)]
pub struct Tokenizer {
    // When true, `[text](url)` spans become `Token::Link`.
    pub process_links: bool,
    // When true, `![alt](url)` spans become `Token::Image`.
    pub process_images: bool,
}
impl Tokenizer {
    /// Creates a tokenizer with link and image extraction both enabled.
    pub fn new() -> Self {
        Self {
            process_links: true,
            process_images: true,
        }
    }
    /// Creates a tokenizer with explicit per-feature settings.
    pub fn with_settings(process_links: bool, process_images: bool) -> Self {
        Self {
            process_links,
            process_images,
        }
    }
    /// Tokenizes one line of markdown into a fresh token vector, running
    /// the full extraction pipeline (images/links per settings, footnotes
    /// always) before inline marker tokenization.
    pub fn tokenize(&self, line: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        self.tokenize_with_extractions(line, &mut tokens);
        tokens
    }
    /// Splits `text` on inline markers (emphasis, strikethrough,
    /// backticks) and appends the resulting tokens to `tokens`.
    /// No link/image/footnote extraction happens here; any characters
    /// `INLINE_TOKEN_RE` does not match are skipped entirely.
    pub fn tokenize_inline(&self, text: &str, tokens: &mut Vec<Token>) {
        for cap in INLINE_TOKEN_RE.find_iter(text) {
            let s = cap.as_str();
            let token = match s {
                "***" => Token::TripleAsterisk,
                "**" => Token::DoubleAsterisk,
                "*" => Token::Asterisk,
                "___" => Token::TripleUnderscore,
                "__" => Token::DoubleUnderscore,
                "_" => Token::Underscore,
                "**_" => Token::DoubleAsteriskUnderscore,
                "_**" => Token::UnderscoreDoubleAsterisk,
                "~~" => Token::DoubleTilde,
                // A pure run of backticks collapses into one token
                // carrying the run length.
                _ if s.chars().all(|c| c == '`') => Token::Backticks(s.len()),
                _ => Token::Text(s.to_string()),
            };
            tokens.push(token);
        }
    }
    /// Core pipeline: collect image/link/footnote spans as `(start byte,
    /// end byte, Token)` triples, drop spans inside inline code, resolve
    /// overlaps, then tokenize the remaining text stretches inline.
    fn tokenize_with_extractions(&self, line: &str, tokens: &mut Vec<Token>) {
        // Defensive: this method owns the output buffer's contents.
        tokens.clear();
        let mut last_end = 0;
        let mut extractions: Vec<(usize, usize, Token)> = Vec::new();
        // Images are collected before links so that, on an equal start
        // position, the stable sort below keeps the image first.
        if self.process_images {
            for cap in IMAGE_RE.captures_iter(line) {
                let m = cap.get(0).unwrap();
                let alt = cap.get(1).map(|m| m.as_str()).unwrap_or("");
                let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
                extractions.push((
                    m.start(),
                    m.end(),
                    Token::Image {
                        alt: alt.to_string(),
                        url: url.to_string(),
                    },
                ));
            }
        }
        if self.process_links {
            for cap in LINK_RE.captures_iter(line) {
                let m = cap.get(0).unwrap();
                // LINK_RE also matches the `[alt](url)` tail of an image;
                // a preceding `!` means this span belongs to an image.
                if m.start() > 0 && line.as_bytes().get(m.start() - 1) == Some(&b'!') {
                    continue;
                }
                let text = cap.get(1).map(|m| m.as_str()).unwrap_or("");
                let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
                extractions.push((
                    m.start(),
                    m.end(),
                    Token::Link {
                        text: text.to_string(),
                        url: url.to_string(),
                    },
                ));
            }
        }
        // Footnotes are not gated by any setting; non-numeric labels are
        // silently ignored (the regex only captures digits anyway).
        for cap in FOOTNOTE_RE.captures_iter(line) {
            let m = cap.get(0).unwrap();
            if let Some(num_match) = cap.get(1) {
                if let Ok(num) = num_match.as_str().parse::<u32>() {
                    extractions.push((m.start(), m.end(), Token::Footnote(num)));
                }
            }
        }
        // Drop extractions lying entirely within an inline code span so
        // that e.g. `[text](url)` inside backticks stays literal text.
        let code_regions = find_code_regions(line);
        extractions.retain(|(start, end, _)| {
            !code_regions
                .iter()
                .any(|(cs, ce)| *start >= *cs && *end <= *ce)
        });
        // Greedily keep non-overlapping extractions in start order.
        // `sort_by_key` is stable, so on equal starts the earlier-pushed
        // kind (image, then link, then footnote) wins.
        extractions.sort_by_key(|(start, _, _)| *start);
        let mut filtered: Vec<(usize, usize, Token)> = Vec::new();
        for ext in extractions {
            if filtered.is_empty() || ext.0 >= filtered.last().unwrap().1 {
                filtered.push(ext);
            }
        }
        // Emit extracted tokens in order, inline-tokenizing the plain
        // stretches between them. Offsets come from regex matches, so
        // they are valid char boundaries for slicing.
        for (start, end, token) in filtered {
            if start > last_end {
                self.tokenize_inline(&line[last_end..start], tokens);
            }
            tokens.push(token);
            last_end = end;
        }
        if last_end < line.len() {
            self.tokenize_inline(&line[last_end..], tokens);
        }
    }
    /// Removes all image syntax from `text` (currently unused).
    #[allow(dead_code)]
    fn extract_images(&self, text: &str) -> String {
        IMAGE_RE.replace_all(text, "").to_string()
    }
    /// Removes all link syntax from `text` (currently unused).
    /// NOTE(review): unlike the tokenizer path, this does not spare the
    /// `[alt](url)` tail of images — the leading `!` would survive.
    #[allow(dead_code)]
    fn extract_links(&self, text: &str) -> String {
        LINK_RE.replace_all(text, "").to_string()
    }
    /// Removes all footnote references/definition markers (currently unused).
    #[allow(dead_code)]
    fn extract_footnotes(&self, text: &str) -> String {
        FOOTNOTE_RE.replace_all(text, "").to_string()
    }
}
/// Whether `c` falls in one of the CJK Unicode ranges recognized here:
/// Han ideographs (and extensions/compatibility), CJK punctuation, kana,
/// hangul, and half/fullwidth forms.
pub fn is_cjk(c: char) -> bool {
    match c {
        // CJK Unified Ideographs, Extension A, and Extensions B–D
        '\u{4E00}'..='\u{9FFF}'
        | '\u{3400}'..='\u{4DBF}'
        | '\u{20000}'..='\u{2A6DF}'
        | '\u{2A700}'..='\u{2B73F}'
        | '\u{2B740}'..='\u{2B81F}'
        // CJK Compatibility Ideographs
        | '\u{F900}'..='\u{FAFF}'
        // CJK symbols/punctuation, hiragana, katakana (+ phonetic ext.)
        | '\u{3000}'..='\u{303F}'
        | '\u{3040}'..='\u{309F}'
        | '\u{30A0}'..='\u{30FF}'
        | '\u{31F0}'..='\u{31FF}'
        // Hangul syllables and jamo
        | '\u{AC00}'..='\u{D7AF}'
        | '\u{1100}'..='\u{11FF}'
        // Halfwidth and fullwidth forms
        | '\u{FF00}'..='\u{FFEF}' => true,
        _ => false,
    }
}
/// Number of CJK characters (per [`is_cjk`]) in `s`.
pub fn cjk_count(s: &str) -> usize {
    s.chars().fold(0, |acc, c| acc + usize::from(is_cjk(c)))
}
/// True when `token` is not "plain text": it contains at least one CJK
/// character, or any character other than alphanumerics, backslash, or
/// double quote. The empty string counts as plain text (returns false).
pub fn not_text(token: &str) -> bool {
    if cjk_count(token) > 0 {
        return true;
    }
    token
        .chars()
        .any(|c| !(c.is_alphanumeric() || c == '\\' || c == '"'))
}
// Unit tests for the inline tokenizer and the CJK text helpers.
#[cfg(test)]
mod tests {
    use super::*;
    // Marker-free input yields a single Text token.
    #[test]
    fn test_tokenize_plain_text() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Hello world");
        assert_eq!(tokens, vec![Token::Text("Hello world".to_string())]);
    }
    #[test]
    fn test_tokenize_bold() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("**bold**", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::DoubleAsterisk,
                Token::Text("bold".to_string()),
                Token::DoubleAsterisk,
            ]
        );
    }
    #[test]
    fn test_tokenize_italic() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("*italic*", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Asterisk,
                Token::Text("italic".to_string()),
                Token::Asterisk,
            ]
        );
    }
    // `***` must win over `**`/`*` in the alternation.
    #[test]
    fn test_tokenize_triple_asterisk() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("***bold italic***", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::TripleAsterisk,
                Token::Text("bold italic".to_string()),
                Token::TripleAsterisk,
            ]
        );
    }
    #[test]
    fn test_tokenize_strikethrough() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("~~strike~~", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::DoubleTilde,
                Token::Text("strike".to_string()),
                Token::DoubleTilde,
            ]
        );
    }
    #[test]
    fn test_tokenize_backticks() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("`code`", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Backticks(1),
                Token::Text("code".to_string()),
                Token::Backticks(1),
            ]
        );
    }
    // Backtick runs keep their length in the token payload.
    #[test]
    fn test_tokenize_double_backticks() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("`` `code` ``", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Backticks(2),
                Token::Text(" ".to_string()),
                Token::Backticks(1),
                Token::Text("code".to_string()),
                Token::Backticks(1),
                Token::Text(" ".to_string()),
                Token::Backticks(2),
            ]
        );
    }
    #[test]
    fn test_tokenize_link() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Check [this](http://example.com) out");
        assert!(tokens.iter().any(|t| matches!(t, Token::Link { .. })));
    }
    #[test]
    fn test_tokenize_image() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("See  here");
        assert!(tokens.iter().any(|t| matches!(t, Token::Image { .. })));
    }
    #[test]
    fn test_tokenize_footnote() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Some text[^1] here");
        assert!(tokens.iter().any(|t| matches!(t, Token::Footnote(1))));
    }
    #[test]
    fn test_is_cjk() {
        assert!(is_cjk('中'));
        assert!(is_cjk('日'));
        assert!(is_cjk('한'));
        assert!(is_cjk('あ'));
        assert!(!is_cjk('A'));
        assert!(!is_cjk('1'));
    }
    #[test]
    fn test_cjk_count() {
        assert_eq!(cjk_count("Hello"), 0);
        assert_eq!(cjk_count("中文"), 2);
        assert_eq!(cjk_count("Hello世界"), 2);
    }
    #[test]
    fn test_not_text() {
        assert!(!not_text("hello"));
        assert!(!not_text("Hello123"));
        assert!(not_text("**"));
        assert!(not_text("*"));
        assert!(not_text("中文"));
    }
    // Link syntax inside an inline code span must stay literal.
    #[test]
    fn test_link_inside_code_not_extracted() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("`[text](url)`");
        assert!(!tokens.iter().any(|t| matches!(t, Token::Link { .. })));
    }
}