facet_macro_parse/
unescaping.rs

1//! Processes string escapes and unescapes them for docstrings.
2
3use crate::DocInner;
4
/// Errors that can occur while unescaping a string.
///
/// Every variant carries a copy of the input (`string`) and the position of the
/// backslash that introduced the offending escape (`character_index`). The
/// position is the offset yielded by `str::char_indices` for that backslash,
/// i.e. a byte offset into `string`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum UnescapeError {
    /// A backslash was followed by a character that does not start a
    /// supported escape sequence (e.g. `\a`).
    IllegalCharacterFollowingBackslash {
        /// Byte offset of the backslash in the original string
        character_index: usize,
        /// The character found right after the backslash
        found: char,
        /// The original string being unescaped
        string: String,
    },
    /// The input ended immediately after a backslash.
    UnexpectedEofFollowingBackslash {
        /// Byte offset of the backslash in the original string
        character_index: usize,
        /// The original string being unescaped
        string: String,
    },
    /// A `\xNN` escape was malformed (missing or non-hex digits).
    InvalidHexEscape {
        /// Byte offset of the backslash that starts the escape
        character_index: usize,
        /// The original string being unescaped
        string: String,
    },
    /// A `\u{...}` escape was malformed or did not name a valid `char`.
    InvalidUnicodeEscape {
        /// Byte offset of the backslash that starts the escape
        character_index: usize,
        /// The original string being unescaped
        string: String,
    },
}
39
40impl std::fmt::Display for UnescapeError {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        match self {
43            UnescapeError::IllegalCharacterFollowingBackslash {
44                character_index,
45                found,
46                string,
47            } => {
48                write!(
49                    f,
50                    "Illegal character following a backslash at index {character_index} in {string:?}: found '{found}'"
51                )
52            }
53            UnescapeError::UnexpectedEofFollowingBackslash {
54                character_index,
55                string,
56            } => {
57                write!(
58                    f,
59                    "Unexpected end of file following a backslash at index {character_index} in {string:?}"
60                )
61            }
62            UnescapeError::InvalidHexEscape {
63                character_index,
64                string,
65            } => {
66                write!(
67                    f,
68                    "Invalid hex escape at index {character_index} in {string:?}"
69                )
70            }
71            UnescapeError::InvalidUnicodeEscape {
72                character_index,
73                string,
74            } => {
75                write!(
76                    f,
77                    "Invalid unicode escape at index {character_index} in {string:?}"
78                )
79            }
80        }
81    }
82}
83
84/// Unescapes a doc attribute string.
85pub fn unescape(doc_attr: &DocInner) -> Result<String, UnescapeError> {
86    unescape_inner(doc_attr.value.as_str())
87}
88
89/// Parse exactly 2 hex digits from the iterator.
90fn parse_hex_escape(
91    chars: &mut std::iter::Peekable<impl Iterator<Item = (usize, char)>>,
92    escape_start: usize,
93    s: &str,
94) -> Result<char, UnescapeError> {
95    let mut value = 0u8;
96    for _ in 0..2 {
97        match chars.next() {
98            Some((_, c)) if c.is_ascii_hexdigit() => {
99                value = value * 16 + c.to_digit(16).unwrap() as u8;
100            }
101            _ => {
102                return Err(UnescapeError::InvalidHexEscape {
103                    character_index: escape_start,
104                    string: s.to_string(),
105                });
106            }
107        }
108    }
109    Ok(value as char)
110}
111
112/// Parse a unicode escape \u{NNNN} from the iterator.
113fn parse_unicode_escape(
114    chars: &mut std::iter::Peekable<impl Iterator<Item = (usize, char)>>,
115    escape_start: usize,
116    s: &str,
117) -> Result<char, UnescapeError> {
118    // Expect opening brace
119    match chars.next() {
120        Some((_, '{')) => {}
121        _ => {
122            return Err(UnescapeError::InvalidUnicodeEscape {
123                character_index: escape_start,
124                string: s.to_string(),
125            });
126        }
127    }
128
129    let mut value = 0u32;
130    let mut digit_count = 0;
131
132    loop {
133        match chars.next() {
134            Some((_, '}')) => break,
135            Some((_, c)) if c.is_ascii_hexdigit() => {
136                digit_count += 1;
137                if digit_count > 6 {
138                    return Err(UnescapeError::InvalidUnicodeEscape {
139                        character_index: escape_start,
140                        string: s.to_string(),
141                    });
142                }
143                value = value * 16 + c.to_digit(16).unwrap();
144            }
145            _ => {
146                return Err(UnescapeError::InvalidUnicodeEscape {
147                    character_index: escape_start,
148                    string: s.to_string(),
149                });
150            }
151        }
152    }
153
154    if digit_count == 0 {
155        return Err(UnescapeError::InvalidUnicodeEscape {
156            character_index: escape_start,
157            string: s.to_string(),
158        });
159    }
160
161    char::from_u32(value).ok_or_else(|| UnescapeError::InvalidUnicodeEscape {
162        character_index: escape_start,
163        string: s.to_string(),
164    })
165}
166
167/// Unescapes a string with Rust-style escape sequences.
168///
169/// Supported escapes:
170/// - `\\` -> backslash
171/// - `\"` -> double quote
172/// - `\'` -> single quote
173/// - `\n` -> newline
174/// - `\r` -> carriage return
175/// - `\t` -> tab
176/// - `\0` -> null
177/// - `\xNN` -> byte value (2 hex digits, ASCII only)
178/// - `\u{NNNNNN}` -> unicode scalar value (1-6 hex digits)
179pub fn unescape_inner(s: &str) -> Result<String, UnescapeError> {
180    let mut out = String::with_capacity(s.len());
181    let mut chars = s.char_indices().peekable();
182
183    while let Some((i, c)) = chars.next() {
184        if c == '\\' {
185            match chars.next() {
186                Some((_, '\\')) => out.push('\\'),
187                Some((_, '"')) => out.push('"'),
188                Some((_, '\'')) => out.push('\''),
189                Some((_, 'n')) => out.push('\n'),
190                Some((_, 'r')) => out.push('\r'),
191                Some((_, 't')) => out.push('\t'),
192                Some((_, '0')) => out.push('\0'),
193                Some((_, 'x')) => {
194                    out.push(parse_hex_escape(&mut chars, i, s)?);
195                }
196                Some((_, 'u')) => {
197                    out.push(parse_unicode_escape(&mut chars, i, s)?);
198                }
199                Some((_, found)) => {
200                    return Err(UnescapeError::IllegalCharacterFollowingBackslash {
201                        character_index: i,
202                        found,
203                        string: s.to_string(),
204                    });
205                }
206                None => {
207                    return Err(UnescapeError::UnexpectedEofFollowingBackslash {
208                        character_index: i,
209                        string: s.to_string(),
210                    });
211                }
212            }
213        } else {
214            out.push(c);
215        }
216    }
217    Ok(out)
218}
219
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every `(input, expected)` pair unescapes without error
    /// and produces exactly `expected`.
    fn check_all(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(unescape_inner(input).unwrap(), expected);
        }
    }

    #[test]
    fn test_unescape_basic() {
        check_all(&[
            ("hello", "hello"),
            (r#"hello \"world\""#, r#"hello "world""#),
            (r#"hello \'world\'"#, "hello 'world'"),
            (r"back\\slash", r"back\slash"),
        ]);
    }

    #[test]
    fn test_unescape_newline() {
        // Regression case from issue #921: escaped newlines inside a code fence.
        check_all(&[(
            r"```solidity\nstruct MyStruct { ... }\n```",
            "```solidity\nstruct MyStruct { ... }\n```",
        )]);
    }

    #[test]
    fn test_unescape_common_escapes() {
        check_all(&[
            (r"hello\nworld", "hello\nworld"),
            (r"hello\rworld", "hello\rworld"),
            (r"hello\tworld", "hello\tworld"),
            (r"null\0char", "null\0char"),
            (r"line1\nline2\nline3", "line1\nline2\nline3"),
            (r"tab\there", "tab\there"),
            (r"cr\rlf", "cr\rlf"),
            (r"crlf\r\n", "crlf\r\n"),
        ]);
    }

    #[test]
    fn test_unescape_hex() {
        check_all(&[
            (r"\x41", "A"),
            (r"\x61", "a"),
            (r"\x00", "\0"),
            (r"\x7f", "\x7f"),
            (r"hello\x20world", "hello world"),
        ]);
    }

    #[test]
    fn test_unescape_unicode() {
        check_all(&[
            (r"\u{41}", "A"),
            (r"\u{0041}", "A"),
            (r"\u{1F600}", "\u{1F600}"),
            (r"\u{10FFFF}", "\u{10FFFF}"),
            (r"hello\u{20}world", "hello world"),
        ]);
    }

    #[test]
    fn test_unescape_mixed() {
        check_all(&[(
            r#"line1\nline2\ttab\\backslash\"quote"#,
            "line1\nline2\ttab\\backslash\"quote",
        )]);
    }

    #[test]
    fn test_unescape_errors() {
        // A backslash followed by a character that starts no escape.
        let err = unescape_inner(r"invalid \a escape").unwrap_err();
        assert!(matches!(
            err,
            UnescapeError::IllegalCharacterFollowingBackslash {
                character_index: 8,
                found: 'a',
                ..
            }
        ));

        // A backslash as the very last character.
        let err = unescape_inner(r"trailing backslash \").unwrap_err();
        assert!(matches!(
            err,
            UnescapeError::UnexpectedEofFollowingBackslash {
                character_index: 19,
                ..
            }
        ));

        // Hex escapes: too few digits, then non-hex characters.
        for input in [r"\x4", r"\xGG"] {
            let err = unescape_inner(input).unwrap_err();
            assert!(matches!(err, UnescapeError::InvalidHexEscape { .. }));
        }

        // Unicode escapes: no braces, empty braces, too many digits, and a
        // value that is not a valid code point.
        for input in [r"\u0041", r"\u{}", r"\u{1234567}", r"\u{FFFFFF}"] {
            let err = unescape_inner(input).unwrap_err();
            assert!(matches!(err, UnescapeError::InvalidUnicodeEscape { .. }));
        }
    }
}