//! Tokenizer for inline markdown content.
//!
//! This module provides tokenization of markdown inline content,
//! breaking text into tokens for formatting markers, text, and special elements.

6use regex::Regex;
7use std::sync::LazyLock;
8
/// Regex for tokenizing inline markdown content.
///
/// Matches a single formatting marker (`**`, `*`, `_`, `~~`, backtick runs,
/// and the `**_` / `_**` combinations) or a maximal run of non-marker text.
/// Alternation in this engine is leftmost-first, so longer markers must be
/// listed before their prefixes (e.g. `***` before `**` before `*`).
static INLINE_TOKEN_RE: LazyLock<Regex> = LazyLock::new(|| {
    // Match formatting markers OR runs of non-marker text.
    // Order matters: longer markers first, so a prefix never shadows them.
    Regex::new(r"(```+|~~|\*\*\*|\*\*_|_\*\*|\*\*|\*|___|__|_|`+|[^~_*`]+)").unwrap()
});
16
/// Regex for matching links: `[text](url)`.
///
/// NOTE(review): does not handle nested brackets in the text or nested
/// parentheses in the URL — presumably acceptable for streaming input; confirm.
static LINK_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\(([^\)]+)\)").unwrap());

/// Regex for matching images: `![alt](url)`. The alt text may be empty
/// (group 1 uses `*`, unlike the link regex which requires non-empty text).
static IMAGE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\(([^\)]+)\)").unwrap());

/// Regex for matching footnotes: `[^1]` (reference) or `[^1]:` (definition).
/// The optional trailing colon is consumed but not captured, so references
/// and definitions produce the same capture.
static FOOTNOTE_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[\^(\d+)\]:?").unwrap());

/// Regex for matching inline code spans: `` `code` `` or ``` ``code`` ```.
/// The double-backtick alternative is tried first so such spans are not
/// split in half by the single-backtick form.
static CODE_SPAN_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"``[^`]+``|`[^`]+`").unwrap());
30
31/// Find byte ranges of inline code spans in a line.
32fn find_code_regions(line: &str) -> Vec<(usize, usize)> {
33    CODE_SPAN_RE
34        .find_iter(line)
35        .map(|m| (m.start(), m.end()))
36        .collect()
37}
38
/// Token types for inline markdown content.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    /// Plain text content
    Text(String),

    /// Triple asterisk: ***
    TripleAsterisk,

    /// Double asterisk: **
    DoubleAsterisk,

    /// Single asterisk: *
    Asterisk,

    /// Triple underscore: ___
    TripleUnderscore,

    /// Double underscore: __
    DoubleUnderscore,

    /// Single underscore: _
    Underscore,

    /// **_ combination (bold + italic start)
    DoubleAsteriskUnderscore,

    /// _** combination (italic + bold end)
    UnderscoreDoubleAsterisk,

    /// Tilde pair: ~~
    DoubleTilde,

    /// Backticks (variable count)
    Backticks(usize),

    /// A link: [text](url)
    Link { text: String, url: String },

    /// An image: ![alt](url)
    Image { alt: String, url: String },

    /// A footnote reference: `[^1]`
    Footnote(u32),
}

impl Token {
    /// Check if this token is a formatting marker.
    ///
    /// Every variant except the content-bearing ones (`Text`, `Link`,
    /// `Image`, `Footnote`) counts as a marker.
    pub fn is_marker(&self) -> bool {
        !matches!(
            self,
            Token::Text(_) | Token::Link { .. } | Token::Image { .. } | Token::Footnote(_)
        )
    }

    /// Get the marker string for formatting tokens.
    ///
    /// Returns `None` for content tokens and for `Backticks`, whose marker
    /// has a dynamic length and therefore cannot be a `&'static str`.
    pub fn marker_str(&self) -> Option<&'static str> {
        // Exhaustive match (no `_` catch-all): adding a new variant forces a
        // deliberate decision here instead of silently returning None.
        match self {
            Token::TripleAsterisk => Some("***"),
            Token::DoubleAsterisk => Some("**"),
            Token::Asterisk => Some("*"),
            Token::TripleUnderscore => Some("___"),
            Token::DoubleUnderscore => Some("__"),
            Token::Underscore => Some("_"),
            Token::DoubleAsteriskUnderscore => Some("**_"),
            Token::UnderscoreDoubleAsterisk => Some("_**"),
            Token::DoubleTilde => Some("~~"),
            // Backtick runs have variable length; no static marker exists.
            Token::Backticks(_) => None,
            // Content tokens carry no marker.
            Token::Text(_) | Token::Link { .. } | Token::Image { .. } | Token::Footnote(_) => None,
        }
    }
}
114
/// Tokenizer for inline markdown content.
///
/// NOTE(review): the derived `Default` yields both flags `false`, while
/// [`Tokenizer::new`] enables both — confirm the `Default` behavior is
/// intentional before relying on it.
#[derive(Debug, Default)]
pub struct Tokenizer {
    /// Whether to extract `[text](url)` links as `Token::Link`
    pub process_links: bool,
    /// Whether to extract `![alt](url)` images as `Token::Image`
    pub process_images: bool,
}
123
124impl Tokenizer {
125    /// Create a new tokenizer with default settings.
126    pub fn new() -> Self {
127        Self {
128            process_links: true,
129            process_images: true,
130        }
131    }
132
133    /// Create a tokenizer with specific settings.
134    pub fn with_settings(process_links: bool, process_images: bool) -> Self {
135        Self {
136            process_links,
137            process_images,
138        }
139    }
140
141    /// Tokenize a line of markdown content.
142    ///
143    /// This extracts links, images, footnotes, and inline formatting markers.
144    pub fn tokenize(&self, line: &str) -> Vec<Token> {
145        let mut tokens = Vec::new();
146        self.tokenize_with_extractions(line, &mut tokens);
147        tokens
148    }
149
150    /// Tokenize inline content for formatting markers.
151    pub fn tokenize_inline(&self, text: &str, tokens: &mut Vec<Token>) {
152        for cap in INLINE_TOKEN_RE.find_iter(text) {
153            let s = cap.as_str();
154            let token = match s {
155                "***" => Token::TripleAsterisk,
156                "**" => Token::DoubleAsterisk,
157                "*" => Token::Asterisk,
158                "___" => Token::TripleUnderscore,
159                "__" => Token::DoubleUnderscore,
160                "_" => Token::Underscore,
161                "**_" => Token::DoubleAsteriskUnderscore,
162                "_**" => Token::UnderscoreDoubleAsterisk,
163                "~~" => Token::DoubleTilde,
164                _ if s.chars().all(|c| c == '`') => Token::Backticks(s.len()),
165                _ => Token::Text(s.to_string()),
166            };
167            tokens.push(token);
168        }
169    }
170
171    /// Tokenize with links, images, and footnotes already extracted.
172    fn tokenize_with_extractions(&self, line: &str, tokens: &mut Vec<Token>) {
173        tokens.clear();
174
175        // We need to process the line while preserving extracted elements
176        let mut last_end = 0;
177        let mut extractions: Vec<(usize, usize, Token)> = Vec::new();
178
179        // Find all images
180        if self.process_images {
181            for cap in IMAGE_RE.captures_iter(line) {
182                let m = cap.get(0).unwrap();
183                let alt = cap.get(1).map(|m| m.as_str()).unwrap_or("");
184                let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
185                extractions.push((
186                    m.start(),
187                    m.end(),
188                    Token::Image {
189                        alt: alt.to_string(),
190                        url: url.to_string(),
191                    },
192                ));
193            }
194        }
195
196        // Find all links (that aren't part of images)
197        if self.process_links {
198            for cap in LINK_RE.captures_iter(line) {
199                let m = cap.get(0).unwrap();
200                // Check if this is part of an image (preceded by !)
201                if m.start() > 0 && line.as_bytes().get(m.start() - 1) == Some(&b'!') {
202                    continue;
203                }
204                let text = cap.get(1).map(|m| m.as_str()).unwrap_or("");
205                let url = cap.get(2).map(|m| m.as_str()).unwrap_or("");
206                extractions.push((
207                    m.start(),
208                    m.end(),
209                    Token::Link {
210                        text: text.to_string(),
211                        url: url.to_string(),
212                    },
213                ));
214            }
215        }
216
217        // Find all footnotes
218        for cap in FOOTNOTE_RE.captures_iter(line) {
219            let m = cap.get(0).unwrap();
220            if let Some(num_match) = cap.get(1) {
221                if let Ok(num) = num_match.as_str().parse::<u32>() {
222                    extractions.push((m.start(), m.end(), Token::Footnote(num)));
223                }
224            }
225        }
226
227        // Filter out extractions inside code spans (backtick-delimited regions)
228        let code_regions = find_code_regions(line);
229        extractions.retain(|(start, end, _)| {
230            !code_regions
231                .iter()
232                .any(|(cs, ce)| *start >= *cs && *end <= *ce)
233        });
234
235        // Sort extractions by start position
236        extractions.sort_by_key(|(start, _, _)| *start);
237
238        // Remove overlapping extractions (keep first)
239        let mut filtered: Vec<(usize, usize, Token)> = Vec::new();
240        for ext in extractions {
241            if filtered.is_empty() || ext.0 >= filtered.last().unwrap().1 {
242                filtered.push(ext);
243            }
244        }
245
246        // Now tokenize, inserting extracted tokens at the right places
247        for (start, end, token) in filtered {
248            // Tokenize text before this extraction
249            if start > last_end {
250                self.tokenize_inline(&line[last_end..start], tokens);
251            }
252            tokens.push(token);
253            last_end = end;
254        }
255
256        // Tokenize remaining text
257        if last_end < line.len() {
258            self.tokenize_inline(&line[last_end..], tokens);
259        }
260    }
261
262    // Note: These extraction methods are kept for potential future use
263    // when we need to process links/images/footnotes separately.
264
265    #[allow(dead_code)]
266    fn extract_images(&self, text: &str) -> String {
267        IMAGE_RE.replace_all(text, "").to_string()
268    }
269
270    #[allow(dead_code)]
271    fn extract_links(&self, text: &str) -> String {
272        LINK_RE.replace_all(text, "").to_string()
273    }
274
275    #[allow(dead_code)]
276    fn extract_footnotes(&self, text: &str) -> String {
277        FOOTNOTE_RE.replace_all(text, "").to_string()
278    }
279}
280
/// Check if a character is CJK (Chinese, Japanese, Korean).
///
/// CJK characters don't use spaces as word separators, so they need
/// special handling.
pub fn is_cjk(c: char) -> bool {
    matches!(c,
        '\u{4E00}'..='\u{9FFF}' |   // CJK Unified Ideographs
        '\u{3400}'..='\u{4DBF}' |   // CJK Unified Ideographs Extension A
        '\u{20000}'..='\u{2A6DF}' | // CJK Unified Ideographs Extension B
        '\u{2A700}'..='\u{2B73F}' | // CJK Unified Ideographs Extension C
        '\u{2B740}'..='\u{2B81F}' | // CJK Unified Ideographs Extension D
        '\u{F900}'..='\u{FAFF}' |   // CJK Compatibility Ideographs
        '\u{3000}'..='\u{303F}' |   // CJK Symbols and Punctuation
        '\u{3040}'..='\u{309F}' |   // Hiragana
        '\u{30A0}'..='\u{30FF}' |   // Katakana
        '\u{31F0}'..='\u{31FF}' |   // Katakana Phonetic Extensions
        '\u{AC00}'..='\u{D7AF}' |   // Hangul Syllables
        '\u{1100}'..='\u{11FF}' |   // Hangul Jamo
        '\u{FF00}'..='\u{FFEF}'     // Halfwidth and Fullwidth Forms
    )
}

/// Count CJK characters in a string.
pub fn cjk_count(s: &str) -> usize {
    s.chars().filter(|&c| is_cjk(c)).count()
}

/// Check if a token string is "not text" (is a potential marker boundary).
///
/// Returns `true` if the token contains any CJK character, OR if it contains
/// any character that is not alphanumeric, a backslash, or a double quote.
/// The empty string is considered text (returns `false`).
pub fn not_text(token: &str) -> bool {
    // Short-circuit on the first CJK character instead of counting them all.
    if token.chars().any(is_cjk) {
        return true;
    }

    !token
        .chars()
        .all(|c| c.is_alphanumeric() || c == '\\' || c == '"')
}
321
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `input` with only the inline-marker pass (no extraction).
    fn inline(input: &str) -> Vec<Token> {
        let mut out = Vec::new();
        Tokenizer::new().tokenize_inline(input, &mut out);
        out
    }

    /// Tokenize `input` with the full pipeline and default settings.
    fn full(input: &str) -> Vec<Token> {
        Tokenizer::new().tokenize(input)
    }

    /// Shorthand for building a `Token::Text`.
    fn text(s: &str) -> Token {
        Token::Text(s.to_string())
    }

    #[test]
    fn test_tokenize_plain_text() {
        assert_eq!(full("Hello world"), vec![text("Hello world")]);
    }

    #[test]
    fn test_tokenize_bold() {
        assert_eq!(
            inline("**bold**"),
            vec![Token::DoubleAsterisk, text("bold"), Token::DoubleAsterisk]
        );
    }

    #[test]
    fn test_tokenize_italic() {
        assert_eq!(
            inline("*italic*"),
            vec![Token::Asterisk, text("italic"), Token::Asterisk]
        );
    }

    #[test]
    fn test_tokenize_triple_asterisk() {
        assert_eq!(
            inline("***bold italic***"),
            vec![
                Token::TripleAsterisk,
                text("bold italic"),
                Token::TripleAsterisk,
            ]
        );
    }

    #[test]
    fn test_tokenize_strikethrough() {
        assert_eq!(
            inline("~~strike~~"),
            vec![Token::DoubleTilde, text("strike"), Token::DoubleTilde]
        );
    }

    #[test]
    fn test_tokenize_backticks() {
        assert_eq!(
            inline("`code`"),
            vec![Token::Backticks(1), text("code"), Token::Backticks(1)]
        );
    }

    #[test]
    fn test_tokenize_double_backticks() {
        // The tokenizer just splits on markers; pairing backtick runs is the
        // InlineParser's job.
        assert_eq!(
            inline("`` `code` ``"),
            vec![
                Token::Backticks(2),
                text(" "),
                Token::Backticks(1),
                text("code"),
                Token::Backticks(1),
                text(" "),
                Token::Backticks(2),
            ]
        );
    }

    #[test]
    fn test_tokenize_link() {
        let tokens = full("Check [this](http://example.com) out");
        assert!(tokens.iter().any(|t| matches!(t, Token::Link { .. })));
    }

    #[test]
    fn test_tokenize_image() {
        let tokens = full("See ![alt](http://img.png) here");
        assert!(tokens.iter().any(|t| matches!(t, Token::Image { .. })));
    }

    #[test]
    fn test_tokenize_footnote() {
        let tokens = full("Some text[^1] here");
        assert!(tokens.iter().any(|t| matches!(t, Token::Footnote(1))));
    }

    #[test]
    fn test_is_cjk() {
        for c in ['中', '日', '한', 'あ'] {
            assert!(is_cjk(c));
        }
        for c in ['A', '1'] {
            assert!(!is_cjk(c));
        }
    }

    #[test]
    fn test_cjk_count() {
        assert_eq!(cjk_count("Hello"), 0);
        assert_eq!(cjk_count("中文"), 2);
        assert_eq!(cjk_count("Hello世界"), 2);
    }

    #[test]
    fn test_not_text() {
        assert!(!not_text("hello"));
        assert!(!not_text("Hello123"));
        assert!(not_text("**"));
        assert!(not_text("*"));
        assert!(not_text("中文")); // CJK counts as a marker boundary
    }

    #[test]
    fn test_link_inside_code_not_extracted() {
        // A link wrapped in backticks must stay literal text, not a Link token.
        let tokens = full("`[text](url)`");
        assert!(!tokens.iter().any(|t| matches!(t, Token::Link { .. })));
    }
}