Skip to main content

marco_core/parser/inlines/
text_parser.rs

1//! Text parser - handle plain text fallback
2//!
3//! Parses plain text segments when no inline elements match. Handles special
4//! cases like trailing spaces before newlines and consecutive backticks.
5
6use super::shared::{opt_span, GrammarSpan};
7use crate::parser::ast::{Node, NodeKind};
8use nom::bytes::complete::take;
9use nom::IResult;
10use nom::Input;
11use nom::Parser;
12
13/// Parse plain text up to the next special character
14///
15/// Consumes text until a special inline character is found (*_`[<!&\\n).
16///
17/// Note: This also stops at several extension delimiters so they can be parsed
18/// in the middle of a line:
19/// - `^` (superscript)
20/// - `~` (subscript or strikethrough)
21/// - `==` (mark)
22/// - `--` (dash strikethrough)
23/// - `˅` (arrow-style subscript)
24///   Handles special cases:
25/// - Trailing spaces before newlines (potential hard line break)
26/// - Consecutive backticks (consume all together)
27///
28/// # Arguments
29/// * `input` - The input text as a GrammarSpan
30///
31/// # Returns
32/// * `Ok((remaining, node))` - Successfully parsed text node
33/// * `Err(_)` - No text to parse (input starts with special character)
34pub fn parse_text(input: GrammarSpan) -> IResult<GrammarSpan, Node> {
35    let text_fragment = input.fragment();
36
37    // GFM autolink literals can appear in the middle of a text node.
38    // If we can see a valid autolink literal starting at some offset, we must
39    // stop before it so the dedicated parser can run.
40    let next_autolink_literal =
41        super::gfm_autolink_literal_parser::find_next_autolink_literal_start(text_fragment)
42            .unwrap_or(text_fragment.len());
43
44    // Emoji shortcodes (extended syntax) can appear in the middle of a text node.
45    // Only stop for *recognized* shortcodes; unknown ones remain literal.
46    let next_emoji_shortcode =
47        super::marco_emoji_shortcode_parser::find_next_emoji_shortcode_start(text_fragment)
48            .unwrap_or(text_fragment.len());
49
50    // Platform mentions (extended syntax) can appear in the middle of a text node.
51    let next_platform_mention =
52        super::marco_platform_mentions_parser::find_next_platform_mention_start(text_fragment)
53            .unwrap_or(text_fragment.len());
54
55    // Find the next special character / delimiter start.
56    //
57    // Important: we intentionally do NOT treat a single '-' as special because
58    // it's too common in normal prose. Instead we only stop at the start of
59    // a *double* dash sequence "--".
60    let next_special = text_fragment
61        .char_indices()
62        .find_map(|(idx, ch)| match ch {
63            '*' | '_' | '`' | '[' | '<' | '!' | '&' | '\n' | '\\' | '$' => Some(idx),
64            '^' | '~' | '˅' => Some(idx),
65            '=' => {
66                if text_fragment[idx..].starts_with("==") {
67                    Some(idx)
68                } else {
69                    None
70                }
71            }
72            '-' => {
73                if text_fragment[idx..].starts_with("--") {
74                    Some(idx)
75                } else {
76                    None
77                }
78            }
79            _ => None,
80        })
81        .unwrap_or(text_fragment.len());
82
83    // If an autolink literal begins at offset 0, do not treat it as plain text.
84    if next_autolink_literal == 0 {
85        return Err(nom::Err::Error(nom::error::Error::new(
86            input,
87            nom::error::ErrorKind::Verify,
88        )));
89    }
90
91    // If an emoji shortcode begins at offset 0, do not treat it as plain text.
92    if next_emoji_shortcode == 0 {
93        return Err(nom::Err::Error(nom::error::Error::new(
94            input,
95            nom::error::ErrorKind::Verify,
96        )));
97    }
98
99    // If a platform mention begins at offset 0, do not treat it as plain text.
100    if next_platform_mention == 0 {
101        return Err(nom::Err::Error(nom::error::Error::new(
102            input,
103            nom::error::ErrorKind::Verify,
104        )));
105    }
106
107    let next_special = next_special
108        .min(next_autolink_literal)
109        .min(next_emoji_shortcode)
110        .min(next_platform_mention);
111
112    if next_special == 0 {
113        // No text - input starts with special character
114        return Err(nom::Err::Error(nom::error::Error::new(
115            input,
116            nom::error::ErrorKind::Verify,
117        )));
118    }
119
120    // Check if the upcoming character is a newline and the text ends with spaces
121    // If so, don't consume trailing spaces (they might be part of a hard line break)
122    let mut text_len = next_special;
123    if next_special < text_fragment.len() && text_fragment[next_special..].starts_with('\n') {
124        // Check for trailing spaces
125        let mut trailing_spaces = 0;
126        for ch in text_fragment[..next_special].chars().rev() {
127            if ch == ' ' {
128                trailing_spaces += 1;
129            } else {
130                break;
131            }
132        }
133
134        // If we have 2+ trailing spaces, don't consume them
135        // (they might be part of a hard line break pattern)
136        if trailing_spaces >= 2 {
137            text_len = next_special - trailing_spaces;
138        }
139    }
140
141    if text_len == 0 {
142        // Only trailing spaces - don't consume them
143        return Err(nom::Err::Error(nom::error::Error::new(
144            input,
145            nom::error::ErrorKind::Verify,
146        )));
147    }
148
149    // Use nom::Input to properly advance by byte count (not character count!)
150    let text_content = input.take(text_len);
151    let rest = input.take_from(text_len);
152
153    let span = opt_span(text_content);
154
155    let node = Node {
156        kind: NodeKind::Text(text_content.fragment().to_string()),
157        span,
158        children: Vec::new(),
159    };
160
161    Ok((rest, node))
162}
163
164/// Parse a single special character as text (fallback for unmatched syntax)
165///
166/// When an inline element parser fails to match, this function consumes the
167/// special character as plain text. For backticks, consumes all consecutive
168/// backticks together.
169///
170/// # Arguments
171/// * `input` - The input text as a GrammarSpan
172///
173/// # Returns
174/// * `Ok((remaining, node))` - Successfully parsed text node with special character
175/// * `Err(_)` - Input is empty
176pub fn parse_special_as_text(input: GrammarSpan) -> IResult<GrammarSpan, Node> {
177    let text_fragment = input.fragment();
178
179    if text_fragment.is_empty() {
180        return Err(nom::Err::Error(nom::error::Error::new(
181            input,
182            nom::error::ErrorKind::Eof,
183        )));
184    }
185
186    // Special case: if it's a backtick, consume all consecutive backticks
187    // This prevents ```foo`` from being parsed as ` + ``foo``
188    let char_len = if text_fragment.starts_with('`') {
189        // Count all consecutive backticks
190        text_fragment.chars().take_while(|&c| c == '`').count()
191    } else {
192        text_fragment
193            .chars()
194            .next()
195            .map(|c| c.len_utf8())
196            .unwrap_or(1)
197    };
198
199    let (rest, text_content) = take(char_len).parse(input)?;
200
201    let span = opt_span(text_content);
202
203    let node = Node {
204        kind: NodeKind::Text(text_content.fragment().to_string()),
205        span,
206        children: Vec::new(),
207    };
208
209    Ok((rest, node))
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the string payload of a `Text` node.
    ///
    /// Panics for any other node kind, so a test cannot pass silently when the
    /// parser produces an unexpected node.
    fn expect_text(node: &Node) -> &str {
        match &node.kind {
            NodeKind::Text(text) => text.as_str(),
            _ => panic!("Expected Text node"),
        }
    }

    /// Plain text is consumed up to (not including) the first special character.
    #[test]
    fn smoke_test_parse_text_basic() {
        let input = GrammarSpan::new("Hello World*");
        let result = parse_text(input);

        assert!(result.is_ok(), "Failed to parse plain text");
        let (rest, node) = result.unwrap();

        assert_eq!(rest.fragment(), &"*");
        assert_eq!(expect_text(&node), "Hello World");
    }

    /// A backtick ends the text run; the space before it belongs to the text.
    #[test]
    fn smoke_test_parse_text_up_to_special() {
        let input = GrammarSpan::new("text with `code`");
        let result = parse_text(input);

        assert!(result.is_ok());
        let (rest, node) = result.unwrap();

        assert_eq!(rest.fragment(), &"`code`");
        assert_eq!(expect_text(&node), "text with ");
    }

    /// Two trailing spaces before a newline are left for the line-break parser.
    #[test]
    fn smoke_test_parse_text_trailing_spaces() {
        let input = GrammarSpan::new("text  \n");
        let result = parse_text(input);

        assert!(result.is_ok());
        let (rest, node) = result.unwrap();

        // Should not consume trailing spaces before newline
        assert_eq!(rest.fragment(), &"  \n");
        assert_eq!(expect_text(&node), "text");
    }

    /// Input that begins with a special character yields no text node.
    #[test]
    fn smoke_test_parse_text_starts_with_special() {
        let input = GrammarSpan::new("*emphasis*");
        let result = parse_text(input);

        assert!(
            result.is_err(),
            "Should not parse text starting with special char"
        );
    }

    /// The fallback parser consumes exactly the leading special character.
    #[test]
    fn smoke_test_parse_special_as_text_asterisk() {
        let input = GrammarSpan::new("* not emphasis");
        let result = parse_special_as_text(input);

        assert!(result.is_ok(), "Failed to parse special as text");
        let (rest, node) = result.unwrap();

        assert_eq!(rest.fragment(), &" not emphasis");
        assert_eq!(expect_text(&node), "*");
    }

    /// A run of backticks is consumed as one text node by the fallback parser.
    #[test]
    fn smoke_test_parse_special_as_text_backticks() {
        let input = GrammarSpan::new("```not code");
        let result = parse_special_as_text(input);

        assert!(result.is_ok());
        let (rest, node) = result.unwrap();

        assert_eq!(rest.fragment(), &"not code");
        assert_eq!(expect_text(&node), "```");
    }

    /// The produced node carries byte offsets covering the consumed text.
    #[test]
    fn smoke_test_parse_text_position() {
        let input = GrammarSpan::new("Hello*");
        let result = parse_text(input);

        assert!(result.is_ok());
        let (_, node) = result.unwrap();

        assert!(node.span.is_some(), "Text should have position info");

        let span = node.span.unwrap();
        assert_eq!(span.start.offset, 0);
        assert_eq!(span.end.offset, 5); // "Hello" is 5 bytes
    }
}