Skip to main content

lex_core/lex/
escape.rs

//! Centralized escape/unescape logic for Lex content
//!
//! Inline Escaping Rules:
//!   - Backslash before non-alphanumeric: escapes the character (backslash removed)
//!   - Backslash before alphanumeric: backslash preserved (for paths like C:\Users)
//!   - Double backslash (\\): produces a single backslash
//!   - Trailing backslash at end of input: preserved
//!
//! Quoted Parameter Value Escaping Rules:
//!   - `\"` inside a quoted value: literal quote (backslash removed)
//!   - `\\` inside a quoted value: literal backslash
//!   - Only `"` and `\` can be escaped; other backslashes are literal
//!
//! Verbatim blocks and labels have no character-level escaping.
15
/// Outcome of inspecting the character that follows a backslash in inline text.
///
/// Returned by `unescape_inline_char`. Each variant tells the caller how many
/// characters of input it accounts for and what should be emitted in their place.
///
/// The type is a small plain value, so it derives `Copy` and the comparison
/// traits; this lets callers and tests compare results with `==` / `assert_eq!`
/// and print them with `{:?}`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EscapeAction {
    /// The backslash escapes the next character: consume 2 chars (the backslash
    /// and the escaped character) and emit the carried char.
    Escape(char),
    /// The backslash is literal (it precedes an alphanumeric character or is the
    /// last character of the input): consume 1 char and emit `\`.
    Literal,
}
23
24/// Decide what to do with a backslash at the current position.
25///
26/// `next` is the character after the backslash, if any.
27/// Used by the inline parser to handle escapes character-by-character.
28pub fn unescape_inline_char(next: Option<char>) -> EscapeAction {
29    match next {
30        Some(ch) if !ch.is_alphanumeric() => EscapeAction::Escape(ch),
31        _ => EscapeAction::Literal,
32    }
33}
34
/// Resolve backslash escapes in inline text and return the unescaped string.
///
/// Rules applied while scanning left to right:
/// - `\` followed by a non-alphanumeric character: the backslash is dropped and
///   the character is kept (`\*` → `*`, `\\` → `\`).
/// - `\` followed by an alphanumeric character: both are kept unchanged
///   (`C:\Users` stays `C:\Users`).
/// - `\` as the last character of the input: kept unchanged.
/// - Any other character is copied through.
pub fn unescape_inline(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut stream = text.chars().peekable();

    while let Some(current) = stream.next() {
        if current != '\\' {
            out.push(current);
            continue;
        }

        match stream.peek().copied() {
            Some(following) if !following.is_alphanumeric() => {
                // Escape sequence: swallow the follower and emit it without the backslash.
                stream.next();
                out.push(following);
            }
            // Alphanumeric follower (left in the stream for the next iteration)
            // or end of input: the backslash itself is the output.
            _ => out.push('\\'),
        }
    }

    out
}
72
73/// Escape special inline characters so they won't be parsed as inline markup.
74///
75/// This is the inverse of `unescape_inline`: given plain text, produce escaped text
76/// that round-trips through unescape back to the original.
77///
78/// Escapes: `\`, `*`, `_`, `` ` ``, `#`, `[`, `]`
79pub fn escape_inline(text: &str) -> String {
80    let mut result = String::with_capacity(text.len());
81
82    for ch in text.chars() {
83        if is_inline_special(ch) {
84            result.push('\\');
85        }
86        result.push(ch);
87    }
88
89    result
90}
91
/// Report whether `ch` is one of the characters that inline escaping prefixes
/// with a backslash: `\`, `*`, `_`, `` ` ``, `#`, `[`, `]`.
fn is_inline_special(ch: char) -> bool {
    const INLINE_SPECIALS: [char; 7] = ['\\', '*', '_', '`', '#', '[', ']'];
    INLINE_SPECIALS.contains(&ch)
}
96
97// --- Structural marker detection ---
98
99/// Check whether the token immediately before a `Quote` is a `Text` ending
100/// with an odd number of backslashes, which means the quote is escaped.
101fn is_quote_escaped_by_prev_token(prev: Option<&crate::lex::token::Token>) -> bool {
102    use crate::lex::token::Token;
103    match prev {
104        Some(Token::Text(s)) => {
105            let trailing = s.bytes().rev().take_while(|&b| b == b'\\').count();
106            trailing % 2 == 1
107        }
108        _ => false,
109    }
110}
111
112/// Find positions of `LexMarker` tokens that are NOT inside a quoted context.
113///
114/// Tracks quote state by toggling on each `Quote` token. LexMarkers inside
115/// quoted regions are treated as content, not structural delimiters.
116/// Escaped quotes (`\"`) do not toggle quote state.
117///
118/// Works with bare `Token` slices (no byte ranges needed).
119pub fn find_structural_lex_markers(tokens: &[crate::lex::token::Token]) -> Vec<usize> {
120    use crate::lex::token::Token;
121    let mut markers = Vec::new();
122    let mut in_quotes = false;
123    for (i, token) in tokens.iter().enumerate() {
124        match token {
125            Token::Quote => {
126                if !is_quote_escaped_by_prev_token(if i > 0 { Some(&tokens[i - 1]) } else { None })
127                {
128                    in_quotes = !in_quotes;
129                }
130            }
131            Token::LexMarker if !in_quotes => markers.push(i),
132            _ => {}
133        }
134    }
135    markers
136}
137
138/// Find positions of structural `LexMarker` tokens in a paired token/span slice.
139///
140/// Same logic as `find_structural_lex_markers` but for `(Token, Range)` pairs.
141/// Escaped quotes (`\"`) do not toggle quote state.
142pub fn find_structural_lex_marker_pairs<R>(tokens: &[(crate::lex::token::Token, R)]) -> Vec<usize> {
143    use crate::lex::token::Token;
144    let mut markers = Vec::new();
145    let mut in_quotes = false;
146    for (i, (token, _)) in tokens.iter().enumerate() {
147        match token {
148            Token::Quote => {
149                let prev = if i > 0 { Some(&tokens[i - 1].0) } else { None };
150                if !is_quote_escaped_by_prev_token(prev) {
151                    in_quotes = !in_quotes;
152                }
153            }
154            Token::LexMarker if !in_quotes => markers.push(i),
155            _ => {}
156        }
157    }
158    markers
159}
160
161// --- Quoted parameter value escaping ---
162
/// Check whether a quote at `pos` in `source` is escaped by a preceding backslash.
///
/// Counts the unbroken run of `\` bytes that ends immediately before `pos`.
/// Chains of backslashes are handled by parity: `\\"` is NOT escaped (even
/// number of backslashes), `\\\"` IS escaped (odd number).
///
/// `pos` may be anywhere in `0..=source.len()`. A `pos` beyond the end of
/// `source` cannot refer to a quote, so it is reported as not escaped instead of
/// panicking on an out-of-bounds index.
pub fn is_quote_escaped(source: &[u8], pos: usize) -> bool {
    // `get` yields `None` when `pos > source.len()`, which maps to "not escaped".
    source.get(..pos).map_or(false, |prefix| {
        let backslash_run = prefix
            .iter()
            .rev()
            .take_while(|&&byte| byte == b'\\')
            .count();
        backslash_run % 2 == 1
    })
}
176
/// Turn a raw quoted parameter value into its semantic content.
///
/// `raw` is the stored value, normally including its outer quotes (e.g.
/// `"Hello World"`). When `raw` both starts and ends with `"` (and those are two
/// distinct characters) the outer pair is removed; otherwise `raw` is processed
/// as-is.
///
/// Inside the value, `\"` becomes `"` and `\\` becomes `\`. A backslash followed
/// by anything else, or at the very end, is copied through unchanged.
pub fn unescape_quoted(raw: &str) -> String {
    // Both strips must succeed for the quotes to be removed; a lone `"` fails the
    // second strip and falls back to the untouched input.
    let inner = raw
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'))
        .unwrap_or(raw);

    let mut out = String::with_capacity(inner.len());
    let mut pending = inner.chars().peekable();

    while let Some(current) = pending.next() {
        match (current, pending.peek().copied()) {
            ('\\', Some(escaped)) if escaped == '"' || escaped == '\\' => {
                // Recognized escape: drop the backslash, keep the escaped character.
                pending.next();
                out.push(escaped);
            }
            _ => out.push(current),
        }
    }

    out
}
211
/// Prepare text for embedding inside a quoted parameter value.
///
/// Each `\` becomes `\\` and each `"` becomes `\"`; every other character is
/// copied unchanged. The surrounding quotes are NOT added — the caller wraps the
/// result itself.
pub fn escape_quoted(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '"' | '\\' => {
                out.push('\\');
                out.push(ch);
            }
            plain => out.push(plain),
        }
    }
    out
}
225
// Unit tests for the escape helpers. String literals below are written with
// Rust escaping, so `"\\"` in the source is a single backslash at runtime.
#[cfg(test)]
mod tests {
    use super::*;

    // --- unescape_inline_char ---

    #[test]
    fn inline_char_escapes_non_alphanumeric() {
        assert!(matches!(
            unescape_inline_char(Some('*')),
            EscapeAction::Escape('*')
        ));
        assert!(matches!(
            unescape_inline_char(Some('\\')),
            EscapeAction::Escape('\\')
        ));
        assert!(matches!(
            unescape_inline_char(Some(' ')),
            EscapeAction::Escape(' ')
        ));
    }

    #[test]
    fn inline_char_keeps_backslash_before_alphanumeric_or_end() {
        assert!(matches!(
            unescape_inline_char(Some('U')),
            EscapeAction::Literal
        ));
        assert!(matches!(
            unescape_inline_char(Some('1')),
            EscapeAction::Literal
        ));
        assert!(matches!(
            unescape_inline_char(Some('ñ')),
            EscapeAction::Literal
        ));
        assert!(matches!(unescape_inline_char(None), EscapeAction::Literal));
    }

    // --- unescape_inline ---

    #[test]
    fn unescape_plain_text_unchanged() {
        assert_eq!(unescape_inline("hello world"), "hello world");
    }

    #[test]
    fn unescape_empty_string() {
        assert_eq!(unescape_inline(""), "");
    }

    #[test]
    fn unescape_asterisk() {
        assert_eq!(unescape_inline("\\*literal\\*"), "*literal*");
    }

    #[test]
    fn unescape_underscore() {
        assert_eq!(unescape_inline("\\_not emphasis\\_"), "_not emphasis_");
    }

    #[test]
    fn unescape_backtick() {
        assert_eq!(unescape_inline("\\`not code\\`"), "`not code`");
    }

    #[test]
    fn unescape_hash() {
        assert_eq!(unescape_inline("\\#not math\\#"), "#not math#");
    }

    #[test]
    fn unescape_brackets() {
        assert_eq!(unescape_inline("\\[not a ref\\]"), "[not a ref]");
    }

    #[test]
    fn unescape_backslash_before_alphanumeric_preserved() {
        assert_eq!(unescape_inline("C:\\Users\\name"), "C:\\Users\\name");
    }

    #[test]
    fn unescape_double_backslash() {
        assert_eq!(unescape_inline("C:\\\\Users\\\\name"), "C:\\Users\\name");
    }

    #[test]
    fn unescape_trailing_backslash() {
        assert_eq!(unescape_inline("text\\"), "text\\");
    }

    #[test]
    fn unescape_backslash_before_space() {
        assert_eq!(unescape_inline("hello\\ world"), "hello world");
    }

    #[test]
    fn unescape_backslash_before_punctuation() {
        assert_eq!(unescape_inline("\\!\\?\\,\\."), "!?,.");
    }

    #[test]
    fn unescape_multiple_consecutive_backslashes() {
        // \\\\ = 4 backslashes → 2 backslashes (each pair escapes to one)
        assert_eq!(unescape_inline("\\\\\\\\"), "\\\\");
    }

    #[test]
    fn unescape_triple_backslash_then_star() {
        // \\\\\\* = \\\* → \\ produces \, then \* produces *
        assert_eq!(unescape_inline("\\\\\\*"), "\\*");
    }

    #[test]
    fn unescape_mixed_escaped_and_plain() {
        assert_eq!(
            unescape_inline("plain \\*escaped\\* plain"),
            "plain *escaped* plain"
        );
    }

    #[test]
    fn unescape_backslash_before_digit_preserved() {
        assert_eq!(unescape_inline("item\\1"), "item\\1");
    }

    #[test]
    fn unescape_backslash_before_unicode_letter_preserved() {
        assert_eq!(unescape_inline("path\\ñ"), "path\\ñ");
    }

    #[test]
    fn unescape_backslash_before_non_ascii_symbol() {
        // Non-alphanumeric non-ASCII: backslash removed
        assert_eq!(unescape_inline("\\→"), "→");
    }

    // --- escape_inline ---

    #[test]
    fn escape_plain_text_unchanged() {
        assert_eq!(escape_inline("hello world"), "hello world");
    }

    #[test]
    fn escape_empty_string() {
        assert_eq!(escape_inline(""), "");
    }

    #[test]
    fn escape_special_chars() {
        assert_eq!(escape_inline("*bold*"), "\\*bold\\*");
        assert_eq!(escape_inline("_emph_"), "\\_emph\\_");
        assert_eq!(escape_inline("`code`"), "\\`code\\`");
        assert_eq!(escape_inline("#math#"), "\\#math\\#");
        assert_eq!(escape_inline("[ref]"), "\\[ref\\]");
    }

    #[test]
    fn escape_backslash() {
        assert_eq!(escape_inline("C:\\Users"), "C:\\\\Users");
    }

    // --- roundtrip ---

    #[test]
    fn roundtrip_plain_text() {
        let original = "hello world";
        assert_eq!(unescape_inline(&escape_inline(original)), original);
    }

    #[test]
    fn roundtrip_special_chars() {
        let original = "*bold* and _emph_ and `code` and #math# and [ref]";
        assert_eq!(unescape_inline(&escape_inline(original)), original);
    }

    #[test]
    fn roundtrip_backslashes() {
        let original = "C:\\Users\\name";
        assert_eq!(unescape_inline(&escape_inline(original)), original);
    }

    #[test]
    fn roundtrip_mixed() {
        let original = "path\\file *bold* and \\more";
        assert_eq!(unescape_inline(&escape_inline(original)), original);
    }

    // --- unescape_quoted ---

    #[test]
    fn unescape_quoted_simple() {
        assert_eq!(unescape_quoted("\"Hello World\""), "Hello World");
    }

    #[test]
    fn unescape_quoted_with_escaped_quote() {
        assert_eq!(unescape_quoted("\"say \\\"hello\\\"\""), "say \"hello\"");
    }

    #[test]
    fn unescape_quoted_with_escaped_backslash() {
        assert_eq!(unescape_quoted("\"path\\\\to\""), "path\\to");
    }

    #[test]
    fn unescape_quoted_escaped_backslash_before_quote() {
        // \\\\" = escaped backslash then real closing quote
        assert_eq!(unescape_quoted("\"end\\\\\""), "end\\");
    }

    #[test]
    fn unescape_quoted_other_backslash_literal() {
        // \n is not a recognized escape, backslash preserved
        assert_eq!(unescape_quoted("\"hello\\nworld\""), "hello\\nworld");
    }

    #[test]
    fn unescape_quoted_empty() {
        assert_eq!(unescape_quoted("\"\""), "");
    }

    #[test]
    fn unescape_quoted_no_quotes() {
        // Unquoted values pass through (backslash handling still applies)
        assert_eq!(unescape_quoted("simple"), "simple");
    }

    // --- escape_quoted ---

    #[test]
    fn escape_quoted_simple() {
        assert_eq!(escape_quoted("Hello World"), "Hello World");
    }

    #[test]
    fn escape_quoted_with_quote() {
        assert_eq!(escape_quoted("say \"hello\""), "say \\\"hello\\\"");
    }

    #[test]
    fn escape_quoted_with_backslash() {
        assert_eq!(escape_quoted("path\\to"), "path\\\\to");
    }

    #[test]
    fn escape_quoted_empty() {
        assert_eq!(escape_quoted(""), "");
    }

    // --- quoted roundtrip ---

    #[test]
    fn roundtrip_quoted_simple() {
        let original = "Hello World";
        let escaped = format!("\"{}\"", escape_quoted(original));
        assert_eq!(unescape_quoted(&escaped), original);
    }

    #[test]
    fn roundtrip_quoted_with_quotes() {
        let original = "say \"hello\" and \"bye\"";
        let escaped = format!("\"{}\"", escape_quoted(original));
        assert_eq!(unescape_quoted(&escaped), original);
    }

    #[test]
    fn roundtrip_quoted_with_backslashes() {
        let original = "C:\\Users\\name";
        let escaped = format!("\"{}\"", escape_quoted(original));
        assert_eq!(unescape_quoted(&escaped), original);
    }

    #[test]
    fn roundtrip_quoted_with_both() {
        let original = "path\\to \"file\"";
        let escaped = format!("\"{}\"", escape_quoted(original));
        assert_eq!(unescape_quoted(&escaped), original);
    }

    // --- is_quote_escaped ---

    #[test]
    fn is_quote_escaped_no_backslash() {
        assert!(!is_quote_escaped(b"hello\"", 5));
    }

    #[test]
    fn is_quote_escaped_single_backslash() {
        assert!(is_quote_escaped(b"hello\\\"", 6));
    }

    #[test]
    fn is_quote_escaped_double_backslash() {
        assert!(!is_quote_escaped(b"hello\\\\\"", 7));
    }

    #[test]
    fn is_quote_escaped_triple_backslash() {
        assert!(is_quote_escaped(b"hello\\\\\\\"", 8));
    }

    #[test]
    fn is_quote_escaped_at_start() {
        assert!(!is_quote_escaped(b"\"", 0));
    }

    // --- find_structural_lex_markers ---

    #[test]
    fn structural_markers_no_quotes() {
        use crate::lex::token::Token;
        let tokens = vec![
            Token::LexMarker,
            Token::Whitespace(1),
            Token::Text("note".into()),
            Token::Whitespace(1),
            Token::LexMarker,
        ];
        assert_eq!(find_structural_lex_markers(&tokens), vec![0, 4]);
    }

    #[test]
    fn structural_markers_with_quoted_marker() {
        use crate::lex::token::Token;
        // :: note foo=":: value" ::
        let tokens = vec![
            Token::LexMarker, // 0: structural
            Token::Whitespace(1),
            Token::Text("note".into()),
            Token::Whitespace(1),
            Token::Text("foo".into()),
            Token::Equals,
            Token::Quote,     // 6: opens quote
            Token::LexMarker, // 7: inside quotes — NOT structural
            Token::Whitespace(1),
            Token::Text("value".into()),
            Token::Quote, // 10: closes quote
            Token::Whitespace(1),
            Token::LexMarker, // 12: structural
        ];
        assert_eq!(find_structural_lex_markers(&tokens), vec![0, 12]);
    }

    #[test]
    fn structural_markers_data_line_with_quoted_marker() {
        use crate::lex::token::Token;
        // :: note foo=":: value"  (no closing ::)
        let tokens = vec![
            Token::LexMarker, // 0: structural
            Token::Whitespace(1),
            Token::Text("note".into()),
            Token::Equals,
            Token::Quote,
            Token::LexMarker, // inside quotes
            Token::Text("value".into()),
            Token::Quote,
        ];
        // Only one structural marker (opening)
        assert_eq!(find_structural_lex_markers(&tokens), vec![0]);
    }

    #[test]
    fn structural_markers_escaped_quote_does_not_toggle() {
        use crate::lex::token::Token;
        // :: note foo="value with \" inside" ::
        // The \" should NOT toggle quote state
        let tokens = vec![
            Token::LexMarker, // 0: structural
            Token::Whitespace(1),
            Token::Text("note".into()),
            Token::Whitespace(1),
            Token::Text("foo".into()),
            Token::Equals,
            Token::Quote,                        // 6: opens quote
            Token::Text("value with \\".into()), // 7: text ending in backslash
            Token::Quote,                        // 8: escaped quote (preceded by \)
            Token::Text(" inside".into()),       // 9
            Token::Quote,                        // 10: real closing quote
            Token::Whitespace(1),
            Token::LexMarker, // 12: structural
        ];
        assert_eq!(find_structural_lex_markers(&tokens), vec![0, 12]);
    }

    #[test]
    fn structural_markers_double_backslash_before_quote_not_escaped() {
        use crate::lex::token::Token;
        // :: note foo="val\\" ::
        // \\\\ (double backslash) before quote means the backslashes escape each other,
        // so the quote IS a real closing quote
        let tokens = vec![
            Token::LexMarker, // 0: structural
            Token::Whitespace(1),
            Token::Text("note".into()),
            Token::Whitespace(1),
            Token::Text("foo".into()),
            Token::Equals,
            Token::Quote,                  // 6: opens quote
            Token::Text("val\\\\".into()), // 7: text ending in \\
            Token::Quote,                  // 8: real closing quote (even backslashes)
            Token::Whitespace(1),
            Token::LexMarker, // 10: structural
        ];
        assert_eq!(find_structural_lex_markers(&tokens), vec![0, 10]);
    }

    // --- find_structural_lex_marker_pairs ---

    #[test]
    fn structural_marker_pairs_skip_quoted_markers() {
        use crate::lex::token::Token;
        // :: foo=":: value" ::
        // The span half of each pair is never inspected, so `()` stands in for it.
        let tokens = vec![
            (Token::LexMarker, ()), // 0: structural
            (Token::Text("foo".into()), ()),
            (Token::Equals, ()),
            (Token::Quote, ()),     // 3: opens quote
            (Token::LexMarker, ()), // 4: inside quotes — NOT structural
            (Token::Text("value".into()), ()),
            (Token::Quote, ()),     // 6: closes quote
            (Token::LexMarker, ()), // 7: structural
        ];
        assert_eq!(find_structural_lex_marker_pairs(&tokens), vec![0, 7]);
    }

    #[test]
    fn structural_marker_pairs_escaped_quote_does_not_toggle() {
        use crate::lex::token::Token;
        let tokens = vec![
            (Token::LexMarker, ()),          // 0: structural
            (Token::Quote, ()),              // 1: opens quote
            (Token::Text("a\\".into()), ()), // 2: text ending in backslash
            (Token::Quote, ()),              // 3: escaped quote, state unchanged
            (Token::LexMarker, ()),          // 4: still inside quotes
            (Token::Quote, ()),              // 5: real closing quote
            (Token::LexMarker, ()),          // 6: structural
        ];
        assert_eq!(find_structural_lex_marker_pairs(&tokens), vec![0, 6]);
    }

    #[test]
    fn is_quote_escaped_by_prev_token_tests() {
        use crate::lex::token::Token;
        // No prev token
        assert!(!is_quote_escaped_by_prev_token(None));
        // Non-text prev
        assert!(!is_quote_escaped_by_prev_token(Some(&Token::Whitespace(1))));
        // Text not ending in backslash
        assert!(!is_quote_escaped_by_prev_token(Some(&Token::Text(
            "hello".into()
        ))));
        // Text ending in single backslash (escaped)
        assert!(is_quote_escaped_by_prev_token(Some(&Token::Text(
            "hello\\".into()
        ))));
        // Text ending in double backslash (not escaped)
        assert!(!is_quote_escaped_by_prev_token(Some(&Token::Text(
            "hello\\\\".into()
        ))));
        // Text ending in triple backslash (escaped)
        assert!(is_quote_escaped_by_prev_token(Some(&Token::Text(
            "hello\\\\\\".into()
        ))));
    }
}