1use regex::Regex;
7use std::sync::LazyLock;
8
9static INLINE_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
12 Regex::new(r"(```+|~~|\*\*\*|\*\*_|_\*\*|\*\*|\*|___|__|_|`+|[^~_*`]+)").unwrap()
15});
16
/// Matches an inline link `[text](url)`; capture 1 is the link text,
/// capture 2 the destination. Note it also matches the `[..](..)` tail
/// of an image — callers must skip matches preceded by `!`.
static LINK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^\)]+)\)").unwrap());
20
/// Matches an inline image `![alt](url)`; capture 1 is the alt text
/// (may be empty, hence `*`), capture 2 the destination.
static IMAGE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^\)]+)\)").unwrap());
24
/// Matches a footnote reference `[^N]` or definition prefix `[^N]:`;
/// capture 1 is the numeric label.
static FOOTNOTE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\^(\d+)\]:?").unwrap());
27
/// Matches inline code spans; the double-backtick alternative is tried
/// first so a backtick-free ``..`` span is captured whole.
static CODE_SPAN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"``[^`]+``|`[^`]+`").unwrap());
30
31fn find_code_regions(line: &str) -> Vec<(usize, usize)> {
33 CODE_SPAN_RE
34 .find_iter(line)
35 .map(|m| (m.start(), m.end()))
36 .collect()
37}
38
/// One lexical unit of an inline markdown line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Literal text with no markdown meaning.
    Text(String),

    /// `***` — bold + italic marker.
    TripleAsterisk,

    /// `**` — bold marker.
    DoubleAsterisk,

    /// `*` — italic marker.
    Asterisk,

    /// `___` — bold + italic marker (underscore form).
    TripleUnderscore,

    /// `__` — bold marker (underscore form).
    DoubleUnderscore,

    /// `_` — italic marker (underscore form).
    Underscore,

    /// `**_` — combined bold + italic marker.
    DoubleAsteriskUnderscore,

    /// `_**` — combined bold + italic marker.
    UnderscoreDoubleAsterisk,

    /// `~~` — strikethrough marker.
    DoubleTilde,

    /// A run of backticks; the payload is the run length.
    Backticks(usize),

    /// An inline link `[text](url)`.
    Link { text: String, url: String },

    /// An inline image `![alt](url)`.
    Image { alt: String, url: String },

    /// A footnote reference `[^N]`; the payload is N.
    Footnote(u32),
}
84
85impl Token {
86 pub fn is_marker(&self) -> bool {
88 !matches!(
89 self,
90 Token::Text(_) | Token::Link { .. } | Token::Image { .. } | Token::Footnote(_)
91 )
92 }
93
94 pub fn marker_str(&self) -> Option<&'static str> {
96 match self {
97 Token::TripleAsterisk => Some("***"),
98 Token::DoubleAsterisk => Some("**"),
99 Token::Asterisk => Some("*"),
100 Token::TripleUnderscore => Some("___"),
101 Token::DoubleUnderscore => Some("__"),
102 Token::Underscore => Some("_"),
103 Token::DoubleAsteriskUnderscore => Some("**_"),
104 Token::UnderscoreDoubleAsterisk => Some("_**"),
105 Token::DoubleTilde => Some("~~"),
106 Token::Backticks(_) => {
107 None
109 }
110 _ => None,
111 }
112 }
113}
114
/// Splits a markdown line into [`Token`]s.
#[derive(Debug, Default)]
pub struct Tokenizer {
    /// When true, extract `[text](url)` spans as [`Token::Link`].
    pub process_links: bool,
    /// When true, extract `![alt](url)` spans as [`Token::Image`].
    pub process_images: bool,
}
123
124impl Tokenizer {
125 pub fn new() -> Self {
127 Self {
128 process_links: true,
129 process_images: true,
130 }
131 }
132
133 pub fn with_settings(process_links: bool, process_images: bool) -> Self {
135 Self {
136 process_links,
137 process_images,
138 }
139 }
140
141 pub fn tokenize(&self, line: &str) -> Vec<Token> {
145 let mut tokens = Vec::new();
146 self.tokenize_with_extractions(line, &mut tokens);
147 tokens
148 }
149
150 pub fn tokenize_inline(&self, text: &str, tokens: &mut Vec<Token>) {
152 for cap in INLINE_TOKEN_RE.find_iter(text) {
153 let s = cap.as_str();
154 let token = match s {
155 "***" => Token::TripleAsterisk,
156 "**" => Token::DoubleAsterisk,
157 "*" => Token::Asterisk,
158 "___" => Token::TripleUnderscore,
159 "__" => Token::DoubleUnderscore,
160 "_" => Token::Underscore,
161 "**_" => Token::DoubleAsteriskUnderscore,
162 "_**" => Token::UnderscoreDoubleAsterisk,
163 "~~" => Token::DoubleTilde,
164 _ if s.chars().all(|c| c == '`') => Token::Backticks(s.len()),
165 _ => Token::Text(s.to_string()),
166 };
167 tokens.push(token);
168 }
169 }
170
171 fn tokenize_with_extractions(&self, line: &str, tokens: &mut Vec<Token>) {
173 tokens.clear();
174
175 let mut last_end = 0;
177 let mut extractions: Vec<(usize, usize, Token)> = Vec::new();
178
179 if self.process_images {
181 for cap in IMAGE_RE.captures_iter(line) {
182 let m = cap.get(0).unwrap();
183 let alt = cap.get(1).map(|m| m.as_str()).unwrap_or("");
184 let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
185 extractions.push((
186 m.start(),
187 m.end(),
188 Token::Image {
189 alt: alt.to_string(),
190 url: url.to_string(),
191 },
192 ));
193 }
194 }
195
196 if self.process_links {
198 for cap in LINK_RE.captures_iter(line) {
199 let m = cap.get(0).unwrap();
200 if m.start() > 0 && line.as_bytes().get(m.start() - 1) == Some(&b'!') {
202 continue;
203 }
204 let text = cap.get(1).map(|m| m.as_str()).unwrap_or("");
205 let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
206 extractions.push((
207 m.start(),
208 m.end(),
209 Token::Link {
210 text: text.to_string(),
211 url: url.to_string(),
212 },
213 ));
214 }
215 }
216
217 for cap in FOOTNOTE_RE.captures_iter(line) {
219 let m = cap.get(0).unwrap();
220 if let Some(num_match) = cap.get(1) {
221 if let Ok(num) = num_match.as_str().parse::<u32>() {
222 extractions.push((m.start(), m.end(), Token::Footnote(num)));
223 }
224 }
225 }
226
227 let code_regions = find_code_regions(line);
229 extractions.retain(|(start, end, _)| {
230 !code_regions
231 .iter()
232 .any(|(cs, ce)| *start >= *cs && *end <= *ce)
233 });
234
235 extractions.sort_by_key(|(start, _, _)| *start);
237
238 let mut filtered: Vec<(usize, usize, Token)> = Vec::new();
240 for ext in extractions {
241 if filtered.is_empty() || ext.0 >= filtered.last().unwrap().1 {
242 filtered.push(ext);
243 }
244 }
245
246 for (start, end, token) in filtered {
248 if start > last_end {
250 self.tokenize_inline(&line[last_end..start], tokens);
251 }
252 tokens.push(token);
253 last_end = end;
254 }
255
256 if last_end < line.len() {
258 self.tokenize_inline(&line[last_end..], tokens);
259 }
260 }
261
262 #[allow(dead_code)]
266 fn extract_images(&self, text: &str) -> String {
267 IMAGE_RE.replace_all(text, "").to_string()
268 }
269
270 #[allow(dead_code)]
271 fn extract_links(&self, text: &str) -> String {
272 LINK_RE.replace_all(text, "").to_string()
273 }
274
275 #[allow(dead_code)]
276 fn extract_footnotes(&self, text: &str) -> String {
277 FOOTNOTE_RE.replace_all(text, "").to_string()
278 }
279}
280
/// Reports whether `c` lies in one of the CJK-related Unicode blocks
/// this module cares about: Han ideographs (plus extensions A–D and
/// compatibility forms), kana, Hangul, CJK punctuation, and
/// half/full-width forms.
pub fn is_cjk(c: char) -> bool {
    // Inclusive (start, end) pairs, one per covered block.
    const RANGES: &[(char, char)] = &[
        ('\u{1100}', '\u{11FF}'),   // Hangul Jamo
        ('\u{3000}', '\u{303F}'),   // CJK symbols & punctuation
        ('\u{3040}', '\u{309F}'),   // Hiragana
        ('\u{30A0}', '\u{30FF}'),   // Katakana
        ('\u{31F0}', '\u{31FF}'),   // Katakana phonetic extensions
        ('\u{3400}', '\u{4DBF}'),   // CJK unified ideographs ext. A
        ('\u{4E00}', '\u{9FFF}'),   // CJK unified ideographs
        ('\u{AC00}', '\u{D7AF}'),   // Hangul syllables
        ('\u{F900}', '\u{FAFF}'),   // CJK compatibility ideographs
        ('\u{FF00}', '\u{FFEF}'),   // Half-width and full-width forms
        ('\u{20000}', '\u{2A6DF}'), // CJK unified ideographs ext. B
        ('\u{2A700}', '\u{2B73F}'), // CJK unified ideographs ext. C
        ('\u{2B740}', '\u{2B81F}'), // CJK unified ideographs ext. D
    ];
    RANGES.iter().any(|&(lo, hi)| (lo..=hi).contains(&c))
}
302
303pub fn cjk_count(s: &str) -> usize {
305 s.chars().filter(|&c| is_cjk(c)).count()
306}
307
308pub fn not_text(token: &str) -> bool {
313 if cjk_count(token) > 0 {
314 return true;
315 }
316
317 !token
318 .chars()
319 .all(|c| c.is_alphanumeric() || c == '\\' || c == '"')
320}
321
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_plain_text() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Hello world");
        assert_eq!(tokens, vec![Token::Text("Hello world".to_string())]);
    }

    #[test]
    fn test_tokenize_bold() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("**bold**", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::DoubleAsterisk,
                Token::Text("bold".to_string()),
                Token::DoubleAsterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_italic() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("*italic*", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Asterisk,
                Token::Text("italic".to_string()),
                Token::Asterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_triple_asterisk() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("***bold italic***", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::TripleAsterisk,
                Token::Text("bold italic".to_string()),
                Token::TripleAsterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_strikethrough() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("~~strike~~", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::DoubleTilde,
                Token::Text("strike".to_string()),
                Token::DoubleTilde,
            ]
        );
    }

    #[test]
    fn test_tokenize_backticks() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("`code`", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Backticks(1),
                Token::Text("code".to_string()),
                Token::Backticks(1),
            ]
        );
    }

    #[test]
    fn test_tokenize_double_backticks() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("`` `code` ``", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Backticks(2),
                Token::Text(" ".to_string()),
                Token::Backticks(1),
                Token::Text("code".to_string()),
                Token::Backticks(1),
                Token::Text(" ".to_string()),
                Token::Backticks(2),
            ]
        );
    }

    #[test]
    fn test_tokenize_link() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Check [this](http://example.com) out");

        assert!(tokens.iter().any(|t| matches!(t, Token::Link { .. })));
    }

    #[test]
    fn test_tokenize_image() {
        let tokenizer = Tokenizer::new();
        // The input must actually contain image markup; the previous
        // fixture ("See  here") had lost its `![alt](url)` span and the
        // assertion below could never pass.
        let tokens = tokenizer.tokenize("See ![diagram](http://example.com/d.png) here");

        assert!(tokens.iter().any(|t| matches!(t, Token::Image { .. })));
    }

    #[test]
    fn test_tokenize_footnote() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Some text[^1] here");

        assert!(tokens.iter().any(|t| matches!(t, Token::Footnote(1))));
    }

    #[test]
    fn test_is_cjk() {
        assert!(is_cjk('中'));
        assert!(is_cjk('日'));
        assert!(is_cjk('한'));
        assert!(is_cjk('あ'));
        assert!(!is_cjk('A'));
        assert!(!is_cjk('1'));
    }

    #[test]
    fn test_cjk_count() {
        assert_eq!(cjk_count("Hello"), 0);
        assert_eq!(cjk_count("中文"), 2);
        assert_eq!(cjk_count("Hello世界"), 2);
    }

    #[test]
    fn test_not_text() {
        assert!(!not_text("hello"));
        assert!(!not_text("Hello123"));
        assert!(not_text("**"));
        assert!(not_text("*"));
        assert!(not_text("中文"));
    }

    #[test]
    fn test_link_inside_code_not_extracted() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("`[text](url)`");
        assert!(!tokens.iter().any(|t| matches!(t, Token::Link { .. })));
    }
}