rustc_literal_escaper/
lib.rs

1//! Utilities for validating (raw) string, char, and byte literals and
2//! turning escape sequences into the values they represent.
3
4#![no_std]
5
6use core::ffi::CStr;
7use core::num::NonZero;
8use core::ops::Range;
9use core::str::Chars;
10
/// Diagnostics produced while unescaping string, char, and byte literals.
///
/// Most variants describe a malformed escape sequence; a few cover other
/// problems, and the two `*Warning` variants are non-fatal (see
/// [`EscapeError::is_fatal`]).
#[derive(Debug, PartialEq, Eq)]
pub enum EscapeError {
    /// A single char was expected, but the literal is empty.
    ZeroChars,
    /// A single char was expected, but the literal holds more than one.
    MoreThanOneChar,

    /// A '\' with nothing after it.
    LoneSlash,
    /// Unknown escape character (e.g. '\z').
    InvalidEscape,
    /// A raw '\r' was found.
    BareCarriageReturn,
    /// A raw '\r' was found inside a raw string.
    BareCarriageReturnInRawString,
    /// A character that must be escaped appeared unescaped (e.g. raw '\t').
    EscapeOnlyChar,

    /// Numeric character escape with too few digits (e.g. '\x1').
    TooShortHexEscape,
    /// Non-hexadecimal character in a numeric escape (e.g. '\xz').
    InvalidCharInHexEscape,
    /// Numeric escape whose character code is not ASCII (e.g. '\xFF').
    OutOfRangeHexEscape,

    /// '\u' that is not followed by '{'.
    NoBraceInUnicodeEscape,
    /// Non-hexadecimal character inside '\u{..}'.
    InvalidCharInUnicodeEscape,
    /// '\u{}'
    EmptyUnicodeEscape,
    /// '\u{..}' without the closing brace, e.g. '\u{12'.
    UnclosedUnicodeEscape,
    /// '\u{_12}'
    LeadingUnderscoreUnicodeEscape,
    /// More than 6 characters in '\u{..}', e.g. '\u{10FFFF_FF}'
    OverlongUnicodeEscape,
    /// In-range code that is not a valid character, e.g. '\u{DFFF}'.
    LoneSurrogateUnicodeEscape,
    /// Character code beyond the Unicode range, e.g. '\u{FFFFFF}'.
    OutOfRangeUnicodeEscape,

    /// A unicode escape was used in a byte literal.
    UnicodeEscapeInByte,
    /// Non-ASCII character in a byte literal, byte string literal, or raw byte string literal.
    NonAsciiCharInByte,

    /// `\0` in a C string literal.
    NulInCStr,

    /// Following a line that ends in '\', the next line starts with
    /// whitespace characters that are not skipped.
    UnskippedWhitespaceWarning,

    /// Following a line that ends in '\', more than one line is skipped.
    MultipleSkippedLinesWarning,
}

impl EscapeError {
    /// Distinguishes hard errors from warnings.
    ///
    /// Returns `false` for the two `*Warning` variants and `true` for every
    /// other variant.
    pub fn is_fatal(&self) -> bool {
        let is_warning = matches!(
            self,
            EscapeError::UnskippedWhitespaceWarning | EscapeError::MultipleSkippedLinesWarning
        );
        !is_warning
    }
}
81
82/// Check a raw string literal for validity
83///
84/// Takes the contents of a raw string literal (without quotes)
85/// and produces a sequence of characters or errors,
86/// which are returned by invoking `callback`.
87/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
88pub fn check_raw_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
89    str::check_raw(src, callback);
90}
91
92/// Check a raw byte string literal for validity
93///
94/// Takes the contents of a raw byte string literal (without quotes)
95/// and produces a sequence of bytes or errors,
96/// which are returned by invoking `callback`.
97/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
98pub fn check_raw_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u8, EscapeError>)) {
99    <[u8]>::check_raw(src, callback);
100}
101
102/// Check a raw C string literal for validity
103///
104/// Takes the contents of a raw C string literal (without quotes)
105/// and produces a sequence of characters or errors,
106/// which are returned by invoking `callback`.
107/// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
108pub fn check_raw_c_str(
109    src: &str,
110    callback: impl FnMut(Range<usize>, Result<NonZero<char>, EscapeError>),
111) {
112    CStr::check_raw(src, callback);
113}
114
115/// Trait for checking raw string literals for validity
116trait CheckRaw {
117    /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
118    type RawUnit;
119
120    /// Converts chars to the unit type of the literal type
121    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError>;
122
123    /// Takes the contents of a raw literal (without quotes)
124    /// and produces a sequence of `Result<Self::RawUnit, EscapeError>`
125    /// which are returned via `callback`.
126    ///
127    /// NOTE: Does no escaping, but produces errors for bare carriage return ('\r').
128    fn check_raw(
129        src: &str,
130        mut callback: impl FnMut(Range<usize>, Result<Self::RawUnit, EscapeError>),
131    ) {
132        let mut chars = src.chars();
133        while let Some(c) = chars.next() {
134            let start = src.len() - chars.as_str().len() - c.len_utf8();
135            let res = match c {
136                '\r' => Err(EscapeError::BareCarriageReturnInRawString),
137                _ => Self::char2raw_unit(c),
138            };
139            let end = src.len() - chars.as_str().len();
140            callback(start..end, res);
141        }
142
143        // Unfortunately, it is a bit unclear whether the following equivalent code is slower or faster: bug 141855
144        // src.char_indices().for_each(|(pos, c)| {
145        //     callback(
146        //         pos..pos + c.len_utf8(),
147        //         if c == '\r' {
148        //             Err(EscapeError::BareCarriageReturnInRawString)
149        //         } else {
150        //             Self::char2raw_unit(c)
151        //         },
152        //     );
153        // });
154    }
155}
156
157impl CheckRaw for str {
158    type RawUnit = char;
159
160    #[inline]
161    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
162        Ok(c)
163    }
164}
165
166impl CheckRaw for [u8] {
167    type RawUnit = u8;
168
169    #[inline]
170    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
171        char2byte(c)
172    }
173}
174
175/// Turn an ascii char into a byte
176#[inline]
177fn char2byte(c: char) -> Result<u8, EscapeError> {
178    // do NOT do: c.try_into().ok_or(EscapeError::NonAsciiCharInByte)
179    if c.is_ascii() {
180        Ok(c as u8)
181    } else {
182        Err(EscapeError::NonAsciiCharInByte)
183    }
184}
185
186impl CheckRaw for CStr {
187    type RawUnit = NonZero<char>;
188
189    #[inline]
190    fn char2raw_unit(c: char) -> Result<Self::RawUnit, EscapeError> {
191        NonZero::new(c).ok_or(EscapeError::NulInCStr)
192    }
193}
194
195/// Unescape a char literal
196///
197/// Takes the contents of a char literal (without quotes),
198/// and returns an unescaped char or an error.
199#[inline]
200pub fn unescape_char(src: &str) -> Result<char, EscapeError> {
201    str::unescape_single(&mut src.chars())
202}
203
204/// Unescape a byte literal
205///
206/// Takes the contents of a byte literal (without quotes),
207/// and returns an unescaped byte or an error.
208#[inline]
209pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
210    <[u8]>::unescape_single(&mut src.chars())
211}
212
213/// Unescape a string literal
214///
215/// Takes the contents of a string literal (without quotes)
216/// and produces a sequence of escaped characters or errors,
217/// which are returned by invoking `callback`.
218pub fn unescape_str(src: &str, callback: impl FnMut(Range<usize>, Result<char, EscapeError>)) {
219    str::unescape(src, callback)
220}
221
222/// Unescape a byte string literal
223///
224/// Takes the contents of a byte string literal (without quotes)
225/// and produces a sequence of escaped bytes or errors,
226/// which are returned by invoking `callback`.
227pub fn unescape_byte_str(src: &str, callback: impl FnMut(Range<usize>, Result<u8, EscapeError>)) {
228    <[u8]>::unescape(src, callback)
229}
230
231/// Unescape a C string literal
232///
233/// Takes the contents of a C string literal (without quotes)
234/// and produces a sequence of escaped MixedUnits or errors,
235/// which are returned by invoking `callback`.
236pub fn unescape_c_str(
237    src: &str,
238    callback: impl FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
239) {
240    CStr::unescape(src, callback)
241}
242
243/// Enum representing either a char or a byte
244///
245/// Used for mixed utf8 string literals, i.e. those that allow both unicode
246/// chars and high bytes.
247#[derive(Copy, Clone, Debug, PartialEq, Eq)]
248pub enum MixedUnit {
249    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
250    /// and Unicode chars (written directly or via `\u` escapes).
251    ///
252    /// For example, if '¥' appears in a string it is represented here as
253    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
254    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
255    Char(NonZero<char>),
256
257    /// Used for high bytes (`\x80`..`\xff`).
258    ///
259    /// For example, if `\xa5` appears in a string it is represented here as
260    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
261    /// byte string as the single byte `0xa5`.
262    HighByte(NonZero<u8>),
263}
264
265impl From<NonZero<char>> for MixedUnit {
266    #[inline]
267    fn from(c: NonZero<char>) -> Self {
268        MixedUnit::Char(c)
269    }
270}
271
272impl From<NonZero<u8>> for MixedUnit {
273    #[inline]
274    fn from(byte: NonZero<u8>) -> Self {
275        if byte.get().is_ascii() {
276            MixedUnit::Char(NonZero::new(byte.get() as char).unwrap())
277        } else {
278            MixedUnit::HighByte(byte)
279        }
280    }
281}
282
283impl TryFrom<char> for MixedUnit {
284    type Error = EscapeError;
285
286    #[inline]
287    fn try_from(c: char) -> Result<Self, EscapeError> {
288        NonZero::new(c)
289            .map(MixedUnit::Char)
290            .ok_or(EscapeError::NulInCStr)
291    }
292}
293
294impl TryFrom<u8> for MixedUnit {
295    type Error = EscapeError;
296
297    #[inline]
298    fn try_from(byte: u8) -> Result<Self, EscapeError> {
299        NonZero::new(byte)
300            .map(From::from)
301            .ok_or(EscapeError::NulInCStr)
302    }
303}
304
305/// Trait for unescaping escape sequences in strings
306trait Unescape {
307    /// Unit type of the implementing string type (`char` for string, `u8` for byte string)
308    type Unit;
309
310    /// Result of unescaping the zero char ('\0')
311    const ZERO_RESULT: Result<Self::Unit, EscapeError>;
312
313    /// Converts non-zero bytes to the unit type
314    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit;
315
316    /// Converts chars to the unit type
317    fn char2unit(c: char) -> Result<Self::Unit, EscapeError>;
318
319    /// Converts the byte of a hex escape to the unit type
320    fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError>;
321
322    /// Converts the result of a unicode escape to the unit type
323    fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError>;
324
325    /// Unescape a single unit (single quote syntax)
326    fn unescape_single(chars: &mut Chars<'_>) -> Result<Self::Unit, EscapeError> {
327        let res = match chars.next().ok_or(EscapeError::ZeroChars)? {
328            '\\' => Self::unescape_1(chars),
329            '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
330            '\r' => Err(EscapeError::BareCarriageReturn),
331            c => Self::char2unit(c),
332        }?;
333        if chars.next().is_some() {
334            return Err(EscapeError::MoreThanOneChar);
335        }
336        Ok(res)
337    }
338
339    /// Unescape the first unit of a string (double quoted syntax)
340    fn unescape_1(chars: &mut Chars<'_>) -> Result<Self::Unit, EscapeError> {
341        // Previous character was '\\', unescape what follows.
342        let c = chars.next().ok_or(EscapeError::LoneSlash)?;
343        if c == '0' {
344            Self::ZERO_RESULT
345        } else {
346            simple_escape(c)
347                .map(|b| Self::nonzero_byte2unit(b))
348                .or_else(|c| match c {
349                    'x' => Self::hex2unit(hex_escape(chars)?),
350                    'u' => Self::unicode2unit({
351                        let value = unicode_escape(chars)?;
352                        if value > char::MAX as u32 {
353                            Err(EscapeError::OutOfRangeUnicodeEscape)
354                        } else {
355                            char::from_u32(value).ok_or(EscapeError::LoneSurrogateUnicodeEscape)
356                        }
357                    }),
358                    _ => Err(EscapeError::InvalidEscape),
359                })
360        }
361    }
362
363    /// Unescape a string literal
364    ///
365    /// Takes the contents of a raw string literal (without quotes)
366    /// and produces a sequence of `Result<Self::Unit, EscapeError>`
367    /// which are returned via `callback`.
368    fn unescape(
369        src: &str,
370        mut callback: impl FnMut(Range<usize>, Result<Self::Unit, EscapeError>),
371    ) {
372        let mut chars = src.chars();
373        while let Some(c) = chars.next() {
374            let start = src.len() - chars.as_str().len() - c.len_utf8();
375            let res = match c {
376                '\\' => {
377                    if let Some(b'\n') = chars.as_str().as_bytes().first() {
378                        let _ = chars.next();
379                        // skip whitespace for backslash newline, see [Rust language reference]
380                        // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
381                        let callback_err = |range, err| callback(range, Err(err));
382                        skip_ascii_whitespace(&mut chars, start, callback_err);
383                        continue;
384                    } else {
385                        Self::unescape_1(&mut chars)
386                    }
387                }
388                '"' => Err(EscapeError::EscapeOnlyChar),
389                '\r' => Err(EscapeError::BareCarriageReturn),
390                c => Self::char2unit(c),
391            };
392            let end = src.len() - chars.as_str().len();
393            callback(start..end, res);
394        }
395    }
396}
397
/// Decode a single-character ASCII escape other than nul.
///
/// `c` is the character following the backslash. Returns the byte the escape
/// stands for, or gives `c` back as the error when it is not one of the
/// simple escapes.
#[inline] // single use in Unescape::unescape_1
fn simple_escape(c: char) -> Result<NonZero<u8>, char> {
    let byte = match c {
        '"' => b'"',
        'n' => b'\n',
        'r' => b'\r',
        't' => b'\t',
        '\\' => b'\\',
        '\'' => b'\'',
        _ => return Err(c),
    };
    // None of the bytes in the table above is zero, so `unwrap` cannot fail.
    Ok(NonZero::new(byte).unwrap())
}
415
416/// Interpret a hexadecimal escape
417///
418/// Parses the two hexadecimal characters of a hexadecimal escape without the leading r"\x".
419#[inline] // single use in Unescape::unescape_1
420fn hex_escape(chars: &mut impl Iterator<Item = char>) -> Result<u8, EscapeError> {
421    let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
422    let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
423
424    let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
425    let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
426
427    Ok((hi * 16 + lo) as u8)
428}
429
430/// Interpret a unicode escape
431///
432/// Parse the braces with hexadecimal characters (and underscores) part of a unicode escape.
433/// This r"{...}" normally comes after r"\u" and cannot start with an underscore.
434#[inline] // single use in Unescape::unescape_1
435fn unicode_escape(chars: &mut impl Iterator<Item = char>) -> Result<u32, EscapeError> {
436    if chars.next() != Some('{') {
437        return Err(EscapeError::NoBraceInUnicodeEscape);
438    }
439
440    // First character must be a hexadecimal digit.
441    let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
442        '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
443        '}' => return Err(EscapeError::EmptyUnicodeEscape),
444        c => c
445            .to_digit(16)
446            .ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
447    };
448
449    // First character is valid, now parse the rest of the number
450    // and closing brace.
451    let mut n_digits = 1;
452    loop {
453        match chars.next() {
454            None => return Err(EscapeError::UnclosedUnicodeEscape),
455            Some('_') => continue,
456            Some('}') => {
457                // Incorrect syntax has higher priority for error reporting
458                // than unallowed value for a literal.
459                return if n_digits > 6 {
460                    Err(EscapeError::OverlongUnicodeEscape)
461                } else {
462                    Ok(value)
463                };
464            }
465            Some(c) => {
466                let digit: u32 = c
467                    .to_digit(16)
468                    .ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
469                n_digits += 1;
470                if n_digits > 6 {
471                    // Stop updating value since we're sure that it's incorrect already.
472                    continue;
473                }
474                value = value * 16 + digit;
475            }
476        };
477    }
478}
479
480/// Interpret a string continuation escape (https://doc.rust-lang.org/reference/expressions/literal-expr.html#string-continuation-escapes)
481///
482/// Skip ASCII whitespace, except for the formfeed character
483/// (see [this issue](https://github.com/rust-lang/rust/issues/136600)).
484/// Warns on unescaped newline and following non-ASCII whitespace.
485#[inline] // single use in Unescape::unescape
486fn skip_ascii_whitespace(
487    chars: &mut Chars<'_>,
488    start: usize,
489    mut callback: impl FnMut(Range<usize>, EscapeError),
490) {
491    let rest = chars.as_str();
492    let first_non_space = rest
493        .bytes()
494        .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
495        .unwrap_or(rest.len());
496    let (space, rest) = rest.split_at(first_non_space);
497    // backslash newline adds 2 bytes
498    let end = start + 2 + first_non_space;
499    if space.contains('\n') {
500        callback(start..end, EscapeError::MultipleSkippedLinesWarning);
501    }
502    *chars = rest.chars();
503    if let Some(c) = chars.clone().next() {
504        if c.is_whitespace() {
505            // for error reporting, include the character that was not skipped in the span
506            callback(
507                start..end + c.len_utf8(),
508                EscapeError::UnskippedWhitespaceWarning,
509            );
510        }
511    }
512}
513
514impl Unescape for str {
515    type Unit = char;
516
517    const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok('\0');
518
519    #[inline]
520    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
521        b.get().into()
522    }
523
524    #[inline]
525    fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
526        Ok(c)
527    }
528
529    #[inline]
530    fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError> {
531        if b.is_ascii() {
532            Ok(b as char)
533        } else {
534            Err(EscapeError::OutOfRangeHexEscape)
535        }
536    }
537
538    #[inline]
539    fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
540        r
541    }
542}
543
544impl Unescape for [u8] {
545    type Unit = u8;
546
547    const ZERO_RESULT: Result<Self::Unit, EscapeError> = Ok(b'\0');
548
549    #[inline]
550    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
551        b.get()
552    }
553
554    #[inline]
555    fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
556        char2byte(c)
557    }
558
559    #[inline]
560    fn hex2unit(b: u8) -> Result<Self::Unit, EscapeError> {
561        Ok(b)
562    }
563
564    #[inline]
565    fn unicode2unit(_r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
566        Err(EscapeError::UnicodeEscapeInByte)
567    }
568}
569
570impl Unescape for CStr {
571    type Unit = MixedUnit;
572
573    const ZERO_RESULT: Result<Self::Unit, EscapeError> = Err(EscapeError::NulInCStr);
574
575    #[inline]
576    fn nonzero_byte2unit(b: NonZero<u8>) -> Self::Unit {
577        b.into()
578    }
579
580    #[inline]
581    fn char2unit(c: char) -> Result<Self::Unit, EscapeError> {
582        c.try_into()
583    }
584
585    #[inline]
586    fn hex2unit(byte: u8) -> Result<Self::Unit, EscapeError> {
587        byte.try_into()
588    }
589
590    #[inline]
591    fn unicode2unit(r: Result<char, EscapeError>) -> Result<Self::Unit, EscapeError> {
592        Self::char2unit(r?)
593    }
594}
595
/// The different kinds of literal this crate can check.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so a `Mode` can be used
/// as a map/set key and in `Eq`-bounded generic code.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Mode {
    /// `'a'`
    Char,

    /// `b'a'`
    Byte,

    /// `"hello"`
    Str,
    /// `r"hello"`
    RawStr,

    /// `b"hello"`
    ByteStr,
    /// `br"hello"`
    RawByteStr,

    /// `c"hello"`
    CStr,
    /// `cr"hello"`
    RawCStr,
}

impl Mode {
    /// Returns `true` for every string-like kind (plain, byte and C strings,
    /// raw or not) and `false` for `Char` and `Byte`.
    pub fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str
            | Mode::RawStr
            | Mode::ByteStr
            | Mode::RawByteStr
            | Mode::CStr
            | Mode::RawCStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// Returns the literal's prefix with the raw marker left out: `""` for
    /// chars and strings, `"b"` for the byte kinds, `"c"` for C strings.
    pub fn prefix_noraw(self) -> &'static str {
        match self {
            Mode::Char | Mode::Str | Mode::RawStr => "",
            Mode::Byte | Mode::ByteStr | Mode::RawByteStr => "b",
            Mode::CStr | Mode::RawCStr => "c",
        }
    }
}
642
643/// Check a literal only for errors
644///
645/// Takes the contents of a literal (without quotes)
646/// and produces a sequence of only errors,
647/// which are returned by invoking `error_callback`.
648///
649/// NB Does not produce any output other than errors
650pub fn check_for_errors(
651    src: &str,
652    mode: Mode,
653    mut error_callback: impl FnMut(Range<usize>, EscapeError),
654) {
655    match mode {
656        Mode::Char => {
657            let mut chars = src.chars();
658            if let Err(e) = str::unescape_single(&mut chars) {
659                error_callback(0..(src.len() - chars.as_str().len()), e);
660            }
661        }
662        Mode::Byte => {
663            let mut chars = src.chars();
664            if let Err(e) = <[u8]>::unescape_single(&mut chars) {
665                error_callback(0..(src.len() - chars.as_str().len()), e);
666            }
667        }
668        Mode::Str => unescape_str(src, |range, res| {
669            if let Err(e) = res {
670                error_callback(range, e);
671            }
672        }),
673        Mode::ByteStr => unescape_byte_str(src, |range, res| {
674            if let Err(e) = res {
675                error_callback(range, e);
676            }
677        }),
678        Mode::CStr => unescape_c_str(src, |range, res| {
679            if let Err(e) = res {
680                error_callback(range, e);
681            }
682        }),
683        Mode::RawStr => check_raw_str(src, |range, res| {
684            if let Err(e) = res {
685                error_callback(range, e);
686            }
687        }),
688        Mode::RawByteStr => check_raw_byte_str(src, |range, res| {
689            if let Err(e) = res {
690                error_callback(range, e);
691            }
692        }),
693        Mode::RawCStr => check_raw_c_str(src, |range, res| {
694            if let Err(e) = res {
695                error_callback(range, e);
696            }
697        }),
698    }
699}