1use regex::Regex;
7use std::sync::LazyLock;
8
9static INLINE_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
12 Regex::new(r"(```+|~~|\*\*\*|\*\*_|_\*\*|\*\*|\*|___|__|_|`+|[^~_*`]+)").unwrap()
15});
16
/// Matches an inline Markdown link: `[text](url)`.
/// Capture 1 = link text (non-empty), capture 2 = URL.
static LINK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^\)]+)\)").unwrap());
20
/// Matches a Markdown image: `![alt](url)`.
/// Capture 1 = alt text (may be empty), capture 2 = URL.
static IMAGE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^\)]+)\)").unwrap());
24
/// Matches a numeric footnote marker — reference (`[^1]`) or definition
/// (`[^1]:`). Capture 1 = the footnote number (digits only).
static FOOTNOTE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[\^(\d+)\]:?").unwrap());
28
/// One lexical unit of inline Markdown.
///
/// Marker variants (asterisks, underscores, tildes, backticks) delimit
/// styled spans; `Text`, `Link`, `Image` and `Footnote` carry content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// A run of plain text.
    Text(String),

    /// `***` — bold + italic delimiter.
    TripleAsterisk,

    /// `**` — bold delimiter.
    DoubleAsterisk,

    /// `*` — italic delimiter.
    Asterisk,

    /// `___` — bold + italic delimiter (underscore form).
    TripleUnderscore,

    /// `__` — bold delimiter (underscore form).
    DoubleUnderscore,

    /// `_` — italic delimiter (underscore form).
    Underscore,

    /// `**_` — mixed bold/italic opener.
    DoubleAsteriskUnderscore,

    /// `_**` — mixed bold/italic opener.
    UnderscoreDoubleAsterisk,

    /// `~~` — strikethrough delimiter.
    DoubleTilde,

    /// A run of backticks; the payload is the run length.
    Backticks(usize),

    /// An inline link `[text](url)`.
    Link { text: String, url: String },

    /// An image `![alt](url)`.
    Image { alt: String, url: String },

    /// A numeric footnote marker `[^n]`.
    Footnote(u32),
}

impl Token {
    /// Returns `true` for delimiter tokens, `false` for content-carrying
    /// tokens (`Text`, `Link`, `Image`, `Footnote`).
    pub fn is_marker(&self) -> bool {
        match self {
            Token::Text(_) | Token::Link { .. } | Token::Image { .. } | Token::Footnote(_) => {
                false
            }
            _ => true,
        }
    }

    /// The literal source string of a fixed-width marker, or `None` for
    /// content tokens and `Backticks` (whose width is variable).
    pub fn marker_str(&self) -> Option<&'static str> {
        let literal = match self {
            Token::TripleAsterisk => "***",
            Token::DoubleAsterisk => "**",
            Token::Asterisk => "*",
            Token::TripleUnderscore => "___",
            Token::DoubleUnderscore => "__",
            Token::Underscore => "_",
            Token::DoubleAsteriskUnderscore => "**_",
            Token::UnderscoreDoubleAsterisk => "_**",
            Token::DoubleTilde => "~~",
            // Backticks(_) has no single fixed spelling; content tokens
            // have no marker at all.
            _ => return None,
        };
        Some(literal)
    }
}
101
/// Configuration for the inline Markdown tokenizer.
///
/// NOTE(review): the derived `Default` yields `false` for both flags,
/// while `Tokenizer::new()` enables both — confirm this asymmetry is
/// intended before relying on `Tokenizer::default()`.
#[derive(Debug, Default)]
pub struct Tokenizer {
    // When true, `[text](url)` spans are extracted as `Token::Link`.
    pub process_links: bool,
    // When true, `![alt](url)` spans are extracted as `Token::Image`.
    pub process_images: bool,
}
110
111impl Tokenizer {
112 pub fn new() -> Self {
114 Self {
115 process_links: true,
116 process_images: true,
117 }
118 }
119
120 pub fn with_settings(process_links: bool, process_images: bool) -> Self {
122 Self {
123 process_links,
124 process_images,
125 }
126 }
127
128 pub fn tokenize(&self, line: &str) -> Vec<Token> {
132 let mut tokens = Vec::new();
133 self.tokenize_with_extractions(line, &mut tokens);
134 tokens
135 }
136
137 pub fn tokenize_inline(&self, text: &str, tokens: &mut Vec<Token>) {
139 for cap in INLINE_TOKEN_RE.find_iter(text) {
140 let s = cap.as_str();
141 let token = match s {
142 "***" => Token::TripleAsterisk,
143 "**" => Token::DoubleAsterisk,
144 "*" => Token::Asterisk,
145 "___" => Token::TripleUnderscore,
146 "__" => Token::DoubleUnderscore,
147 "_" => Token::Underscore,
148 "**_" => Token::DoubleAsteriskUnderscore,
149 "_**" => Token::UnderscoreDoubleAsterisk,
150 "~~" => Token::DoubleTilde,
151 _ if s.chars().all(|c| c == '`') => Token::Backticks(s.len()),
152 _ => Token::Text(s.to_string()),
153 };
154 tokens.push(token);
155 }
156 }
157
158 fn tokenize_with_extractions(&self, line: &str, tokens: &mut Vec<Token>) {
160 tokens.clear();
161
162 let mut last_end = 0;
164 let mut extractions: Vec<(usize, usize, Token)> = Vec::new();
165
166 if self.process_images {
168 for cap in IMAGE_RE.captures_iter(line) {
169 let m = cap.get(0).unwrap();
170 let alt = cap.get(1).map(|m| m.as_str()).unwrap_or("");
171 let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
172 extractions.push((
173 m.start(),
174 m.end(),
175 Token::Image {
176 alt: alt.to_string(),
177 url: url.to_string(),
178 },
179 ));
180 }
181 }
182
183 if self.process_links {
185 for cap in LINK_RE.captures_iter(line) {
186 let m = cap.get(0).unwrap();
187 if m.start() > 0 && line.as_bytes().get(m.start() - 1) == Some(&b'!') {
189 continue;
190 }
191 let text = cap.get(1).map(|m| m.as_str()).unwrap_or("");
192 let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
193 extractions.push((
194 m.start(),
195 m.end(),
196 Token::Link {
197 text: text.to_string(),
198 url: url.to_string(),
199 },
200 ));
201 }
202 }
203
204 for cap in FOOTNOTE_RE.captures_iter(line) {
206 let m = cap.get(0).unwrap();
207 if let Some(num_match) = cap.get(1) {
208 if let Ok(num) = num_match.as_str().parse::<u32>() {
209 extractions.push((m.start(), m.end(), Token::Footnote(num)));
210 }
211 }
212 }
213
214 extractions.sort_by_key(|(start, _, _)| *start);
216
217 let mut filtered: Vec<(usize, usize, Token)> = Vec::new();
219 for ext in extractions {
220 if filtered.is_empty() || ext.0 >= filtered.last().unwrap().1 {
221 filtered.push(ext);
222 }
223 }
224
225 for (start, end, token) in filtered {
227 if start > last_end {
229 self.tokenize_inline(&line[last_end..start], tokens);
230 }
231 tokens.push(token);
232 last_end = end;
233 }
234
235 if last_end < line.len() {
237 self.tokenize_inline(&line[last_end..], tokens);
238 }
239 }
240
241 #[allow(dead_code)]
245 fn extract_images(&self, text: &str) -> String {
246 IMAGE_RE.replace_all(text, "").to_string()
247 }
248
249 #[allow(dead_code)]
250 fn extract_links(&self, text: &str) -> String {
251 LINK_RE.replace_all(text, "").to_string()
252 }
253
254 #[allow(dead_code)]
255 fn extract_footnotes(&self, text: &str) -> String {
256 FOOTNOTE_RE.replace_all(text, "").to_string()
257 }
258}
259
/// Reports whether `c` falls in one of the CJK-related Unicode blocks
/// recognized by this tokenizer (ideographs, kana, hangul, CJK punctuation,
/// and half/full-width forms).
pub fn is_cjk(c: char) -> bool {
    // Inclusive (start, end) code-point ranges.
    const CJK_RANGES: [(u32, u32); 13] = [
        (0x4E00, 0x9FFF),   // CJK Unified Ideographs
        (0x3400, 0x4DBF),   // CJK Extension A
        (0x20000, 0x2A6DF), // CJK Extension B
        (0x2A700, 0x2B73F), // CJK Extension C
        (0x2B740, 0x2B81F), // CJK Extension D
        (0xF900, 0xFAFF),   // CJK Compatibility Ideographs
        (0x3000, 0x303F),   // CJK Symbols and Punctuation
        (0x3040, 0x309F),   // Hiragana
        (0x30A0, 0x30FF),   // Katakana
        (0x31F0, 0x31FF),   // Katakana Phonetic Extensions
        (0xAC00, 0xD7AF),   // Hangul Syllables
        (0x1100, 0x11FF),   // Hangul Jamo
        (0xFF00, 0xFFEF),   // Halfwidth and Fullwidth Forms
    ];
    let cp = c as u32;
    CJK_RANGES.iter().any(|&(lo, hi)| (lo..=hi).contains(&cp))
}
281
282pub fn cjk_count(s: &str) -> usize {
284 s.chars().filter(|&c| is_cjk(c)).count()
285}
286
287pub fn not_text(token: &str) -> bool {
292 if cjk_count(token) > 0 {
293 return true;
294 }
295
296 !token.chars().all(|c| c.is_alphanumeric() || c == '\\' || c == '"')
297}
298
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_tokenize_plain_text() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Hello world");
        assert_eq!(tokens, vec![Token::Text("Hello world".to_string())]);
    }

    #[test]
    fn test_tokenize_bold() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("**bold**", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::DoubleAsterisk,
                Token::Text("bold".to_string()),
                Token::DoubleAsterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_italic() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("*italic*", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Asterisk,
                Token::Text("italic".to_string()),
                Token::Asterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_triple_asterisk() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("***bold italic***", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::TripleAsterisk,
                Token::Text("bold italic".to_string()),
                Token::TripleAsterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_strikethrough() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("~~strike~~", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::DoubleTilde,
                Token::Text("strike".to_string()),
                Token::DoubleTilde,
            ]
        );
    }

    #[test]
    fn test_tokenize_backticks() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("`code`", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Backticks(1),
                Token::Text("code".to_string()),
                Token::Backticks(1),
            ]
        );
    }

    #[test]
    fn test_tokenize_double_backticks() {
        let tokenizer = Tokenizer::new();
        let mut tokens = Vec::new();
        tokenizer.tokenize_inline("`` `code` ``", &mut tokens);
        assert_eq!(
            tokens,
            vec![
                Token::Backticks(2),
                Token::Text(" ".to_string()),
                Token::Backticks(1),
                Token::Text("code".to_string()),
                Token::Backticks(1),
                Token::Text(" ".to_string()),
                Token::Backticks(2),
            ]
        );
    }

    #[test]
    fn test_tokenize_link() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Check [this](http://example.com) out");

        assert!(tokens.iter().any(|t| matches!(t, Token::Link { .. })));
    }

    #[test]
    fn test_tokenize_image() {
        let tokenizer = Tokenizer::new();
        // Fix: the input must actually contain an image literal for the
        // assertion below to hold (the previous input "See  here" had none).
        let tokens = tokenizer.tokenize("See ![alt](http://example.com/img.png) here");

        assert!(tokens.iter().any(|t| matches!(t, Token::Image { .. })));
    }

    #[test]
    fn test_tokenize_footnote() {
        let tokenizer = Tokenizer::new();
        let tokens = tokenizer.tokenize("Some text[^1] here");

        assert!(tokens.iter().any(|t| matches!(t, Token::Footnote(1))));
    }

    #[test]
    fn test_is_cjk() {
        let tokenizer = Tokenizer::new();
        let _ = &tokenizer; // silence unused warning pattern kept minimal
        assert!(is_cjk('中'));
        assert!(is_cjk('日'));
        assert!(is_cjk('한'));
        assert!(is_cjk('あ'));
        assert!(!is_cjk('A'));
        assert!(!is_cjk('1'));
    }

    #[test]
    fn test_cjk_count() {
        assert_eq!(cjk_count("Hello"), 0);
        assert_eq!(cjk_count("中文"), 2);
        assert_eq!(cjk_count("Hello世界"), 2);
    }

    #[test]
    fn test_not_text() {
        assert!(!not_text("hello"));
        assert!(!not_text("Hello123"));
        assert!(not_text("**"));
        assert!(not_text("*"));
        assert!(not_text("中文"));
    }
}