// streamdown_parser/tokenizer.rs

//! Tokenizer for inline markdown content.
//!
//! This module provides tokenization of markdown inline content,
//! breaking text into tokens for formatting markers, text, and special elements.

use regex::Regex;
use std::sync::LazyLock;
8
9/// Regex for tokenizing inline markdown content.
10/// Matches formatting markers (**, *, _, ~~, `) and regular text.
11static INLINE_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
12    // Match formatting markers OR runs of non-marker text
13    // Order matters: longer markers first
14    Regex::new(r"(```+|~~|\*\*\*|\*\*_|_\*\*|\*\*|\*|___|__|_|`+|[^~_*`]+)").unwrap()
15});
16
/// Regex for matching links: [text](url)
///
/// `text` is one or more non-`]` characters; `url` is one or more non-`)`
/// characters. NOTE(review): nested brackets/parens and markdown link
/// titles (`[t](url "title")`) are not handled by this pattern — confirm
/// that is acceptable for the inputs this parser sees.
static LINK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^\)]+)\)").unwrap());
20
/// Regex for matching images: ![alt](url)
///
/// Unlike [`LINK_RE`]'s text group, the alt group uses `*`, so an empty
/// alt text (`![](url)`) still matches.
static IMAGE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^\)]+)\)").unwrap());
24
/// Regex for matching footnotes: [^1] or [^1]:
///
/// The optional trailing `:` means footnote *definition* markers match as
/// well as references; only the digits are captured (group 1).
static FOOTNOTE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[\^(\d+)\]:?").unwrap());
28
/// Token types for inline markdown content.
///
/// The tokenizer only splits input on markers; pairing opening and closing
/// markers is left to downstream parsing (see the note in the
/// double-backtick test).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Plain text content
    Text(String),

    /// Triple asterisk: ***
    TripleAsterisk,

    /// Double asterisk: **
    DoubleAsterisk,

    /// Single asterisk: *
    Asterisk,

    /// Triple underscore: ___
    TripleUnderscore,

    /// Double underscore: __
    DoubleUnderscore,

    /// Single underscore: _
    Underscore,

    /// **_ combination (bold + italic start)
    DoubleAsteriskUnderscore,

    /// _** combination (italic + bold end)
    UnderscoreDoubleAsterisk,

    /// Tilde pair: ~~ (strikethrough marker)
    DoubleTilde,

    /// Backticks; the payload is the number of consecutive backticks.
    Backticks(usize),

    /// A link: [text](url)
    Link { text: String, url: String },

    /// An image: ![alt](url)
    Image { alt: String, url: String },

    /// A footnote reference: [^1] (the definition form `[^1]:` also
    /// produces this token, since the regex accepts an optional colon)
    Footnote(u32),
}
74
75impl Token {
76    /// Check if this token is a formatting marker.
77    pub fn is_marker(&self) -> bool {
78        !matches!(self, Token::Text(_) | Token::Link { .. } | Token::Image { .. } | Token::Footnote(_))
79    }
80
81    /// Get the marker string for formatting tokens.
82    pub fn marker_str(&self) -> Option<&'static str> {
83        match self {
84            Token::TripleAsterisk => Some("***"),
85            Token::DoubleAsterisk => Some("**"),
86            Token::Asterisk => Some("*"),
87            Token::TripleUnderscore => Some("___"),
88            Token::DoubleUnderscore => Some("__"),
89            Token::Underscore => Some("_"),
90            Token::DoubleAsteriskUnderscore => Some("**_"),
91            Token::UnderscoreDoubleAsterisk => Some("_**"),
92            Token::DoubleTilde => Some("~~"),
93            Token::Backticks(_) => {
94                // Can't return dynamic string as static
95                None
96            }
97            _ => None,
98        }
99    }
100}
101
/// Tokenizer for inline markdown content.
#[derive(Debug)]
pub struct Tokenizer {
    /// Whether to process links
    pub process_links: bool,
    /// Whether to process images
    pub process_images: bool,
}

impl Default for Tokenizer {
    /// Default settings enable both link and image processing, matching
    /// `Tokenizer::new()`.
    ///
    /// Previously `#[derive(Default)]` produced `false`/`false`, which
    /// contradicted `new()`'s documented "default settings" and silently
    /// disabled link/image extraction for `Tokenizer::default()` users.
    fn default() -> Self {
        Self {
            process_links: true,
            process_images: true,
        }
    }
}
110
impl Tokenizer {
    /// Create a new tokenizer with default settings.
    ///
    /// Both link and image processing are enabled.
    pub fn new() -> Self {
        Self {
            process_links: true,
            process_images: true,
        }
    }

    /// Create a tokenizer with specific settings.
    ///
    /// * `process_links` - emit [`Token::Link`] for `[text](url)` spans
    /// * `process_images` - emit [`Token::Image`] for `![alt](url)` spans
    pub fn with_settings(process_links: bool, process_images: bool) -> Self {
        Self {
            process_links,
            process_images,
        }
    }

    /// Tokenize a line of markdown content.
    ///
    /// This extracts links, images, footnotes, and inline formatting markers,
    /// returning all tokens in source order.
    pub fn tokenize(&self, line: &str) -> Vec<Token> {
        let mut tokens = Vec::new();
        self.tokenize_with_extractions(line, &mut tokens);
        tokens
    }

    /// Tokenize inline content for formatting markers.
    ///
    /// Appends one token per regex match to `tokens`. Characters that match
    /// no regex alternative produce no token at all — `find_iter` simply
    /// skips over them.
    pub fn tokenize_inline(&self, text: &str, tokens: &mut Vec<Token>) {
        for cap in INLINE_TOKEN_RE.find_iter(text) {
            let s = cap.as_str();
            // Arms compare by string equality, so arm order is irrelevant
            // here (unlike the regex alternation, where order decides what
            // gets matched).
            let token = match s {
                "***" => Token::TripleAsterisk,
                "**" => Token::DoubleAsterisk,
                "*" => Token::Asterisk,
                "___" => Token::TripleUnderscore,
                "__" => Token::DoubleUnderscore,
                "_" => Token::Underscore,
                "**_" => Token::DoubleAsteriskUnderscore,
                "_**" => Token::UnderscoreDoubleAsterisk,
                "~~" => Token::DoubleTilde,
                // Any uniform run of backticks, whatever its length.
                _ if s.chars().all(|c| c == '`') => Token::Backticks(s.len()),
                _ => Token::Text(s.to_string()),
            };
            tokens.push(token);
        }
    }

    /// Tokenize with links, images, and footnotes already extracted.
    ///
    /// Three passes record the byte spans of images, links, and footnotes.
    /// Overlapping spans are resolved in favor of the one that starts
    /// first; ties go to the earlier pass (images, then links, then
    /// footnotes) because the sort by start position is stable. The text
    /// between kept spans is handed to [`Self::tokenize_inline`].
    fn tokenize_with_extractions(&self, line: &str, tokens: &mut Vec<Token>) {
        tokens.clear();

        // We need to process the line while preserving extracted elements.
        // `last_end` tracks how far into `line` we have already emitted.
        let mut last_end = 0;
        // (start byte, end byte, replacement token) for each extraction.
        let mut extractions: Vec<(usize, usize, Token)> = Vec::new();

        // Find all images
        if self.process_images {
            for cap in IMAGE_RE.captures_iter(line) {
                let m = cap.get(0).unwrap();
                let alt = cap.get(1).map(|m| m.as_str()).unwrap_or("");
                let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
                extractions.push((
                    m.start(),
                    m.end(),
                    Token::Image {
                        alt: alt.to_string(),
                        url: url.to_string(),
                    },
                ));
            }
        }

        // Find all links (that aren't part of images)
        if self.process_links {
            for cap in LINK_RE.captures_iter(line) {
                let m = cap.get(0).unwrap();
                // Check if this is part of an image (preceded by !) —
                // the image pass above already claimed that span.
                if m.start() > 0 && line.as_bytes().get(m.start() - 1) == Some(&b'!') {
                    continue;
                }
                let text = cap.get(1).map(|m| m.as_str()).unwrap_or("");
                let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
                extractions.push((
                    m.start(),
                    m.end(),
                    Token::Link {
                        text: text.to_string(),
                        url: url.to_string(),
                    },
                ));
            }
        }

        // Find all footnotes (always on; there is no toggle for these).
        // A number too large for u32 fails the parse and is skipped.
        for cap in FOOTNOTE_RE.captures_iter(line) {
            let m = cap.get(0).unwrap();
            if let Some(num_match) = cap.get(1) {
                if let Ok(num) = num_match.as_str().parse::<u32>() {
                    extractions.push((m.start(), m.end(), Token::Footnote(num)));
                }
            }
        }

        // Sort extractions by start position (stable: preserves pass order
        // for spans that start at the same byte).
        extractions.sort_by_key(|(start, _, _)| *start);

        // Remove overlapping extractions (keep first)
        let mut filtered: Vec<(usize, usize, Token)> = Vec::new();
        for ext in extractions {
            if filtered.is_empty() || ext.0 >= filtered.last().unwrap().1 {
                filtered.push(ext);
            }
        }

        // Now tokenize, inserting extracted tokens at the right places.
        // Regex offsets are byte offsets on char boundaries, so the string
        // slicing below cannot panic mid-character.
        for (start, end, token) in filtered {
            // Tokenize text before this extraction
            if start > last_end {
                self.tokenize_inline(&line[last_end..start], tokens);
            }
            tokens.push(token);
            last_end = end;
        }

        // Tokenize remaining text
        if last_end < line.len() {
            self.tokenize_inline(&line[last_end..], tokens);
        }
    }

    // Note: These extraction methods are kept for potential future use
    // when we need to process links/images/footnotes separately.

    /// Return `text` with all image spans removed.
    #[allow(dead_code)]
    fn extract_images(&self, text: &str) -> String {
        IMAGE_RE.replace_all(text, "").to_string()
    }

    /// Return `text` with all link spans removed.
    #[allow(dead_code)]
    fn extract_links(&self, text: &str) -> String {
        LINK_RE.replace_all(text, "").to_string()
    }

    /// Return `text` with all footnote markers removed.
    #[allow(dead_code)]
    fn extract_footnotes(&self, text: &str) -> String {
        FOOTNOTE_RE.replace_all(text, "").to_string()
    }
}
259
/// Check if a character is CJK (Chinese, Japanese, Korean).
///
/// CJK characters don't use spaces as word separators, so we need
/// special handling for them.
pub fn is_cjk(c: char) -> bool {
    /// Inclusive (start, end) code-point ranges treated as CJK.
    const CJK_RANGES: [(u32, u32); 13] = [
        (0x4E00, 0x9FFF),   // CJK Unified Ideographs
        (0x3400, 0x4DBF),   // CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF), // CJK Unified Ideographs Extension B
        (0x2A700, 0x2B73F), // CJK Unified Ideographs Extension C
        (0x2B740, 0x2B81F), // CJK Unified Ideographs Extension D
        (0xF900, 0xFAFF),   // CJK Compatibility Ideographs
        (0x3000, 0x303F),   // CJK Punctuation
        (0x3040, 0x309F),   // Hiragana
        (0x30A0, 0x30FF),   // Katakana
        (0x31F0, 0x31FF),   // Katakana Phonetic Extensions
        (0xAC00, 0xD7AF),   // Hangul Syllables
        (0x1100, 0x11FF),   // Hangul Jamo
        (0xFF00, 0xFFEF),   // Fullwidth Forms
    ];

    let cp = c as u32;
    CJK_RANGES.iter().any(|&(lo, hi)| (lo..=hi).contains(&cp))
}
281
282/// Count CJK characters in a string.
283pub fn cjk_count(s: &str) -> usize {
284    s.chars().filter(|&c| is_cjk(c)).count()
285}
286
287/// Check if a token string is "not text" (is a potential marker boundary).
288///
289/// Returns true if the token is not alphanumeric and not a quote/backslash,
290/// OR if it contains CJK characters.
291pub fn not_text(token: &str) -> bool {
292    if cjk_count(token) > 0 {
293        return true;
294    }
295
296    !token.chars().all(|c| c.is_alphanumeric() || c == '\\' || c == '"')
297}
298
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `tokenize_inline` on `input` with a fresh tokenizer and
    /// return the resulting tokens.
    fn inline(input: &str) -> Vec<Token> {
        let mut toks = Vec::new();
        Tokenizer::new().tokenize_inline(input, &mut toks);
        toks
    }

    /// Shorthand for building a `Token::Text`.
    fn text(s: &str) -> Token {
        Token::Text(s.to_owned())
    }

    #[test]
    fn test_tokenize_plain_text() {
        assert_eq!(
            Tokenizer::new().tokenize("Hello world"),
            vec![text("Hello world")]
        );
    }

    #[test]
    fn test_tokenize_bold() {
        assert_eq!(
            inline("**bold**"),
            vec![Token::DoubleAsterisk, text("bold"), Token::DoubleAsterisk]
        );
    }

    #[test]
    fn test_tokenize_italic() {
        assert_eq!(
            inline("*italic*"),
            vec![Token::Asterisk, text("italic"), Token::Asterisk]
        );
    }

    #[test]
    fn test_tokenize_triple_asterisk() {
        assert_eq!(
            inline("***bold italic***"),
            vec![
                Token::TripleAsterisk,
                text("bold italic"),
                Token::TripleAsterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_strikethrough() {
        assert_eq!(
            inline("~~strike~~"),
            vec![Token::DoubleTilde, text("strike"), Token::DoubleTilde]
        );
    }

    #[test]
    fn test_tokenize_backticks() {
        assert_eq!(
            inline("`code`"),
            vec![Token::Backticks(1), text("code"), Token::Backticks(1)]
        );
    }

    #[test]
    fn test_tokenize_double_backticks() {
        // The tokenizer just splits on markers; the InlineParser handles
        // pairing them up.
        assert_eq!(
            inline("`` `code` ``"),
            vec![
                Token::Backticks(2),
                text(" "),
                Token::Backticks(1),
                text("code"),
                Token::Backticks(1),
                text(" "),
                Token::Backticks(2),
            ]
        );
    }

    #[test]
    fn test_tokenize_link() {
        let toks = Tokenizer::new().tokenize("Check [this](http://example.com) out");
        assert!(toks.iter().any(|t| matches!(t, Token::Link { .. })));
    }

    #[test]
    fn test_tokenize_image() {
        let toks = Tokenizer::new().tokenize("See ![alt](http://img.png) here");
        assert!(toks.iter().any(|t| matches!(t, Token::Image { .. })));
    }

    #[test]
    fn test_tokenize_footnote() {
        let toks = Tokenizer::new().tokenize("Some text[^1] here");
        assert!(toks.iter().any(|t| matches!(t, Token::Footnote(1))));
    }

    #[test]
    fn test_is_cjk() {
        for c in ['中', '日', '한', 'あ'] {
            assert!(is_cjk(c), "{c} should be CJK");
        }
        for c in ['A', '1'] {
            assert!(!is_cjk(c), "{c} should not be CJK");
        }
    }

    #[test]
    fn test_cjk_count() {
        assert_eq!(cjk_count("Hello"), 0);
        assert_eq!(cjk_count("中文"), 2);
        assert_eq!(cjk_count("Hello世界"), 2);
    }

    #[test]
    fn test_not_text() {
        assert!(!not_text("hello"));
        assert!(!not_text("Hello123"));
        assert!(not_text("**"));
        assert!(not_text("*"));
        assert!(not_text("中文")); // CJK counts as "not text"
    }
}
457}