Skip to main content

streamdown_parser/
inline.rs

1//! Inline markdown parser.
2//!
3//! This module handles parsing of inline markdown formatting including
4//! bold, italic, underline, strikethrough, inline code, links, images,
5//! and footnotes.
6
7use crate::tokenizer::{Token, Tokenizer};
8use streamdown_ansi::codes::digit_to_superscript;
9
/// One piece of a parsed markdown line.
///
/// The parser turns a line into a flat sequence of these elements; each
/// element carries its own text and exactly one kind of styling.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InlineElement {
    /// Unstyled text.
    Text(String),
    /// Text rendered bold (`**text**`).
    Bold(String),
    /// Text rendered italic (`*text*` or `_text_`).
    Italic(String),
    /// Text rendered both bold and italic (`***text***`).
    BoldItalic(String),
    /// Text rendered underlined (`__text__`).
    Underline(String),
    /// Text rendered struck through (`~~text~~`).
    Strikeout(String),
    /// The contents of an inline code span, without the delimiting backticks.
    Code(String),
    /// A hyperlink: the visible `text` and the target `url`.
    Link { text: String, url: String },
    /// An image: its `alt` description and the source `url`.
    Image { alt: String, url: String },
    /// A footnote reference, already converted to superscript characters.
    Footnote(String),
}
34
/// Bookkeeping for the formatting spans that are currently open while a
/// line is being parsed.
#[derive(Debug, Clone, Default)]
struct FormatState {
    /// `true` while inside a bold span.
    bold: bool,
    /// `true` while inside an italic span.
    italic: bool,
    /// `true` while inside an underline span.
    underline: bool,
    /// `true` while inside a strikeout span.
    strikeout: bool,
    /// `Some(n)` while inside an inline code span that was opened by a run
    /// of `n` backticks; `None` otherwise.
    code_backticks: Option<usize>,
    /// Text accumulated for the inline code span that is currently open.
    code_buffer: String,
}

impl FormatState {
    /// Build a state with nothing active (identical to `Default`).
    fn new() -> Self {
        Self::default()
    }

    /// Report whether any style flag is set or a code span is open.
    // Not called by the parser itself; kept available as a helper.
    #[allow(dead_code)]
    fn any_active(&self) -> bool {
        let flags = [self.bold, self.italic, self.underline, self.strikeout];
        flags.contains(&true) || self.code_backticks.is_some()
    }

    /// Turn every flag off and discard any buffered code text.
    fn reset(&mut self) {
        // Exhaustive destructuring: adding a field without resetting it here
        // becomes a compile error.
        let Self {
            bold,
            italic,
            underline,
            strikeout,
            code_backticks,
            code_buffer,
        } = self;

        *bold = false;
        *italic = false;
        *underline = false;
        *strikeout = false;
        *code_backticks = None;
        code_buffer.clear();
    }
}
75
76/// Inline markdown parser.
77///
78/// Parses inline formatting and returns structured elements.
79#[derive(Debug)]
80pub struct InlineParser {
81    tokenizer: Tokenizer,
82    state: FormatState,
83    /// Whether to process links
84    pub process_links: bool,
85    /// Whether to process images  
86    pub process_images: bool,
87}
88
89impl Default for InlineParser {
90    fn default() -> Self {
91        Self::new()
92    }
93}
94
95impl InlineParser {
96    /// Create a new inline parser.
97    pub fn new() -> Self {
98        Self {
99            tokenizer: Tokenizer::new(),
100            state: FormatState::new(),
101            process_links: true,
102            process_images: true,
103        }
104    }
105
106    /// Create parser with specific settings.
107    pub fn with_settings(process_links: bool, process_images: bool) -> Self {
108        Self {
109            tokenizer: Tokenizer::with_settings(process_links, process_images),
110            state: FormatState::new(),
111            process_links,
112            process_images,
113        }
114    }
115
116    /// Parse a line of markdown and return inline elements.
117    ///
118    /// This is the main entry point for inline parsing.
119    pub fn parse(&mut self, line: &str) -> Vec<InlineElement> {
120        let tokens = self.tokenizer.tokenize(line);
121        self.parse_tokens(&tokens)
122    }
123
124    /// Parse a sequence of tokens into inline elements.
125    fn parse_tokens(&mut self, tokens: &[Token]) -> Vec<InlineElement> {
126        let mut elements = Vec::new();
127        let mut buffer = String::new();
128        let mut i = 0;
129
130        while i < tokens.len() {
131            let token = &tokens[i];
132
133            // If we're in code mode, handle specially
134            if let Some(expected_backticks) = self.state.code_backticks {
135                match token {
136                    Token::Backticks(n) if *n == expected_backticks => {
137                        // End of inline code
138                        let code = std::mem::take(&mut self.state.code_buffer);
139                        // Trim single leading/trailing space (Markdown spec)
140                        let code = code.strip_prefix(' ').unwrap_or(&code);
141                        let code = code.strip_suffix(' ').unwrap_or(code);
142                        elements.push(InlineElement::Code(code.to_string()));
143                        self.state.code_backticks = None;
144                    }
145                    _ => {
146                        // Add to code buffer
147                        match token {
148                            Token::Text(s) => self.state.code_buffer.push_str(s),
149                            Token::Backticks(n) => {
150                                self.state.code_buffer.push_str(&"`".repeat(*n));
151                            }
152                            _ => {
153                                if let Some(marker) = token.marker_str() {
154                                    self.state.code_buffer.push_str(marker);
155                                }
156                            }
157                        }
158                    }
159                }
160                i += 1;
161                continue;
162            }
163
164            match token {
165                Token::Text(s) => {
166                    buffer.push_str(s);
167                }
168
169                Token::Backticks(n) => {
170                    // Flush buffer
171                    if !buffer.is_empty() {
172                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
173                    }
174                    // Start inline code
175                    self.state.code_backticks = Some(*n);
176                }
177
178                Token::TripleAsterisk => {
179                    // Flush buffer first
180                    if !buffer.is_empty() {
181                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
182                    }
183
184                    if self.state.bold && self.state.italic {
185                        // End both
186                        self.state.bold = false;
187                        self.state.italic = false;
188                    } else if !self.state.bold && !self.state.italic {
189                        // Start both
190                        self.state.bold = true;
191                        self.state.italic = true;
192                    } else {
193                        // Mixed state - just emit as text
194                        buffer.push_str("***");
195                    }
196                }
197
198                Token::DoubleAsterisk => {
199                    if !buffer.is_empty() {
200                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
201                    }
202                    self.state.bold = !self.state.bold;
203                }
204
205                Token::Asterisk => {
206                    if !buffer.is_empty() {
207                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
208                    }
209                    self.state.italic = !self.state.italic;
210                }
211
212                Token::DoubleAsteriskUnderscore => {
213                    // **_ = start bold + start italic
214                    if !buffer.is_empty() {
215                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
216                    }
217                    if !self.state.bold {
218                        self.state.bold = true;
219                    }
220                    self.state.italic = !self.state.italic;
221                }
222
223                Token::UnderscoreDoubleAsterisk => {
224                    // _** = end italic + end bold
225                    if !buffer.is_empty() {
226                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
227                    }
228                    self.state.italic = false;
229                    self.state.bold = false;
230                }
231
232                Token::TripleUnderscore => {
233                    if !buffer.is_empty() {
234                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
235                    }
236
237                    if self.state.underline && self.state.italic {
238                        self.state.underline = false;
239                        self.state.italic = false;
240                    } else if !self.state.underline && !self.state.italic {
241                        self.state.underline = true;
242                        self.state.italic = true;
243                    } else {
244                        buffer.push_str("___");
245                    }
246                }
247
248                Token::DoubleUnderscore => {
249                    if !buffer.is_empty() {
250                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
251                    }
252                    self.state.underline = !self.state.underline;
253                }
254
255                Token::Underscore => {
256                    // Check context - underscore in middle of word shouldn't trigger italic.
257                    // We check the ADJACENT character, not the entire token, because tokens
258                    // may contain spaces (e.g., "use sem" before "_search tool").
259                    let prev_char_is_alnum = i > 0
260                        && matches!(&tokens[i - 1], Token::Text(s) if s.chars().last().map(|c| c.is_alphanumeric()).unwrap_or(false));
261                    let next_char_is_alnum = i + 1 < tokens.len()
262                        && matches!(&tokens[i + 1], Token::Text(s) if s.chars().next().map(|c| c.is_alphanumeric()).unwrap_or(false));
263
264                    if prev_char_is_alnum && next_char_is_alnum {
265                        // Underscore in middle of word - treat as text
266                        buffer.push('_');
267                    } else {
268                        if !buffer.is_empty() {
269                            self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
270                        }
271                        self.state.italic = !self.state.italic;
272                    }
273                }
274
275                Token::DoubleTilde => {
276                    if !buffer.is_empty() {
277                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
278                    }
279                    self.state.strikeout = !self.state.strikeout;
280                }
281
282                Token::Link { text, url } => {
283                    if !buffer.is_empty() {
284                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
285                    }
286                    elements.push(InlineElement::Link {
287                        text: text.clone(),
288                        url: url.clone(),
289                    });
290                }
291
292                Token::Image { alt, url } => {
293                    if !buffer.is_empty() {
294                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
295                    }
296                    elements.push(InlineElement::Image {
297                        alt: alt.clone(),
298                        url: url.clone(),
299                    });
300                }
301
302                Token::Footnote(num) => {
303                    if !buffer.is_empty() {
304                        self.emit_formatted(&mut elements, std::mem::take(&mut buffer));
305                    }
306                    // Convert number to superscript
307                    let superscript = number_to_superscript(*num);
308                    elements.push(InlineElement::Footnote(superscript));
309                }
310            }
311
312            i += 1;
313        }
314
315        // Flush remaining buffer
316        if !buffer.is_empty() {
317            self.emit_formatted(&mut elements, buffer);
318        }
319
320        // Flush any unclosed code block
321        if self.state.code_backticks.is_some() {
322            let code = std::mem::take(&mut self.state.code_buffer);
323            if !code.is_empty() {
324                elements.push(InlineElement::Code(code));
325            }
326            self.state.code_backticks = None;
327        }
328
329        // Reset state for next line
330        self.state.reset();
331
332        elements
333    }
334
335    /// Emit formatted text based on current state.
336    fn emit_formatted(&self, elements: &mut Vec<InlineElement>, text: String) {
337        if text.is_empty() {
338            return;
339        }
340
341        if self.state.bold && self.state.italic {
342            elements.push(InlineElement::BoldItalic(text));
343        } else if self.state.bold {
344            elements.push(InlineElement::Bold(text));
345        } else if self.state.italic {
346            elements.push(InlineElement::Italic(text));
347        } else if self.state.underline {
348            elements.push(InlineElement::Underline(text));
349        } else if self.state.strikeout {
350            elements.push(InlineElement::Strikeout(text));
351        } else {
352            elements.push(InlineElement::Text(text));
353        }
354    }
355
356    /// Reset the parser state.
357    pub fn reset(&mut self) {
358        self.state.reset();
359    }
360}
361
362/// Convert a number to superscript string.
363fn number_to_superscript(num: u32) -> String {
364    num.to_string()
365        .chars()
366        .map(|c| {
367            let digit = c.to_digit(10).unwrap_or(0) as u8;
368            digit_to_superscript(digit)
369        })
370        .collect()
371}
372
373/// Format a line with inline markdown.
374///
375/// This is a convenience function that parses a line and returns
376/// the formatted result as ANSI-styled text.
377pub fn format_line(line: &str, process_links: bool, process_images: bool) -> String {
378    use streamdown_ansi::codes::*;
379    use streamdown_ansi::style::*;
380
381    let mut parser = InlineParser::with_settings(process_links, process_images);
382    let elements = parser.parse(line);
383
384    let mut result = String::new();
385
386    for element in elements {
387        match element {
388            InlineElement::Text(s) => result.push_str(&s),
389            InlineElement::Bold(s) => {
390                result.push_str(BOLD.0);
391                result.push_str(&s);
392                result.push_str(BOLD.1);
393            }
394            InlineElement::Italic(s) => {
395                result.push_str(ITALIC.0);
396                result.push_str(&s);
397                result.push_str(ITALIC.1);
398            }
399            InlineElement::BoldItalic(s) => {
400                result.push_str(BOLD.0);
401                result.push_str(ITALIC.0);
402                result.push_str(&s);
403                result.push_str(ITALIC.1);
404                result.push_str(BOLD.1);
405            }
406            InlineElement::Underline(s) => {
407                result.push_str(UNDERLINE.0);
408                result.push_str(&s);
409                result.push_str(UNDERLINE.1);
410            }
411            InlineElement::Strikeout(s) => {
412                result.push_str(STRIKEOUT.0);
413                result.push_str(&s);
414                result.push_str(STRIKEOUT.1);
415            }
416            InlineElement::Code(s) => {
417                result.push_str(DIM_ON);
418                result.push_str(&s);
419                result.push_str(DIM_OFF);
420            }
421            InlineElement::Link { text, url } => {
422                result.push_str(LINK.0);
423                result.push_str(&url);
424                result.push('\x1b');
425                result.push_str(UNDERLINE.0);
426                result.push_str(&text);
427                result.push_str(UNDERLINE.1);
428                result.push_str(LINK.1);
429            }
430            InlineElement::Image { alt, url: _ } => {
431                result.push_str(DIM_ON);
432                result.push_str("[\u{1F5BC} ");
433                result.push_str(&alt);
434                result.push(']');
435                result.push_str(DIM_OFF);
436            }
437            InlineElement::Footnote(s) => {
438                result.push_str(&s);
439            }
440        }
441    }
442
443    result
444}
445
// Unit tests for the inline parser. Each test feeds one markdown line through
// a fresh `InlineParser` (or `format_line`) and checks the produced elements.
#[cfg(test)]
mod tests {
    use super::*;

    // A line without any markers yields a single `Text` element.
    #[test]
    fn test_parse_plain_text() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Hello world");
        assert_eq!(
            elements,
            vec![InlineElement::Text("Hello world".to_string())]
        );
    }

    // `**...**` splits the line into text / bold / text.
    #[test]
    fn test_parse_bold() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Hello **bold** world");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("Hello ".to_string()),
                InlineElement::Bold("bold".to_string()),
                InlineElement::Text(" world".to_string()),
            ]
        );
    }

    // `*...*` produces an italic element.
    #[test]
    fn test_parse_italic() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Hello *italic* world");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("Hello ".to_string()),
                InlineElement::Italic("italic".to_string()),
                InlineElement::Text(" world".to_string()),
            ]
        );
    }

    // `***...***` produces one combined bold-italic element.
    #[test]
    fn test_parse_bold_italic() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Hello ***bold italic*** world");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("Hello ".to_string()),
                InlineElement::BoldItalic("bold italic".to_string()),
                InlineElement::Text(" world".to_string()),
            ]
        );
    }

    // `~~...~~` produces a strikeout element.
    #[test]
    fn test_parse_strikethrough() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Hello ~~strike~~ world");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("Hello ".to_string()),
                InlineElement::Strikeout("strike".to_string()),
                InlineElement::Text(" world".to_string()),
            ]
        );
    }

    // Single backticks delimit an inline code span.
    #[test]
    fn test_parse_inline_code() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Use `code` here");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("Use ".to_string()),
                InlineElement::Code("code".to_string()),
                InlineElement::Text(" here".to_string()),
            ]
        );
    }

    // A double-backtick span may contain single backticks; the one-space
    // padding on both sides is removed from the emitted code.
    #[test]
    fn test_parse_double_backtick_code() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Use `` `backticks` `` here");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("Use ".to_string()),
                InlineElement::Code("`backticks`".to_string()),
                InlineElement::Text(" here".to_string()),
            ]
        );
    }

    // Only checks that a matching link element is present somewhere.
    #[test]
    fn test_parse_link() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Check [this](http://example.com) out");

        assert!(elements.iter().any(|e| matches!(
            e,
            InlineElement::Link { text, url }
            if text == "this" && url == "http://example.com"
        )));
    }

    // Only checks that a matching image element is present somewhere.
    #[test]
    fn test_parse_image() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("See ![alt text](http://img.png) here");

        assert!(elements.iter().any(|e| matches!(
            e,
            InlineElement::Image { alt, url }
            if alt == "alt text" && url == "http://img.png"
        )));
    }

    // Footnote numbers are rendered as superscript digits.
    #[test]
    fn test_parse_footnote() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Some text[^1] here");

        assert!(elements
            .iter()
            .any(|e| matches!(e, InlineElement::Footnote(s) if s == "¹")));
    }

    // Every digit of a multi-digit footnote number is converted.
    #[test]
    fn test_parse_footnote_multi_digit() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("Reference[^42]");

        assert!(elements
            .iter()
            .any(|e| matches!(e, InlineElement::Footnote(s) if s == "⁴²")));
    }

    #[test]
    fn test_underscore_in_word() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("some_variable_name");
        // Underscores in middle of word should not trigger formatting
        assert_eq!(
            elements,
            vec![InlineElement::Text("some_variable_name".to_string())]
        );
    }

    #[test]
    fn test_underscore_in_word_with_surrounding_text() {
        // This is the key test case - underscore in word with spaces around
        // Previously this would incorrectly parse "_search tool" as italic
        let mut parser = InlineParser::new();
        let elements = parser.parse("use sem_search tool");
        assert_eq!(
            elements,
            vec![InlineElement::Text("use sem_search tool".to_string())]
        );
    }

    // The identifier is the very first thing on the line.
    #[test]
    fn test_underscore_at_start_of_text() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("sem_search");
        assert_eq!(
            elements,
            vec![InlineElement::Text("sem_search".to_string())]
        );
    }

    // The identifier is followed by ordinary words.
    #[test]
    fn test_underscore_at_end_of_text() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("sem_search is useful");
        assert_eq!(
            elements,
            vec![InlineElement::Text("sem_search is useful".to_string())]
        );
    }

    // Several intraword underscores in one identifier all stay literal.
    #[test]
    fn test_multiple_underscores_in_text() {
        let mut parser = InlineParser::new();
        let elements = parser.parse("use my_var_name here");
        assert_eq!(
            elements,
            vec![InlineElement::Text("use my_var_name here".to_string())]
        );
    }

    #[test]
    fn test_underscore_italic_still_works() {
        // Real italic with underscores should still work
        let mut parser = InlineParser::new();
        let elements = parser.parse("this is _italic_ text");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("this is ".to_string()),
                InlineElement::Italic("italic".to_string()),
                InlineElement::Text(" text".to_string()),
            ]
        );
    }

    #[test]
    fn test_underscore_italic_at_boundaries() {
        // Italic at word boundaries (space before underscore)
        let mut parser = InlineParser::new();
        let elements = parser.parse("word _italic_");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("word ".to_string()),
                InlineElement::Italic("italic".to_string()),
            ]
        );
    }

    #[test]
    fn test_mixed_underscore_scenarios() {
        // Mix of variable names and real italic
        let mut parser = InlineParser::new();
        let elements = parser.parse("use my_func for _emphasis_");
        assert_eq!(
            elements,
            vec![
                InlineElement::Text("use my_func for ".to_string()),
                InlineElement::Italic("emphasis".to_string()),
            ]
        );
    }

    // Smoke test of the ANSI renderer: the bold text must be wrapped in the
    // bold on/off escape sequences.
    #[test]
    fn test_format_line() {
        let result = format_line("Hello **bold** world", true, true);
        assert!(result.contains("bold"));
        assert!(result.contains("\x1b[1m")); // Bold on
        assert!(result.contains("\x1b[22m")); // Bold off
    }

    // Covers zero, single digits and multi-digit numbers.
    #[test]
    fn test_number_to_superscript() {
        assert_eq!(number_to_superscript(0), "⁰");
        assert_eq!(number_to_superscript(1), "¹");
        assert_eq!(number_to_superscript(2), "²");
        assert_eq!(number_to_superscript(42), "⁴²");
        assert_eq!(number_to_superscript(123), "¹²³");
    }
}