descape/
lib.rs

1#![no_std]
2#![forbid(unsafe_code)]
3#![warn(clippy::pedantic, clippy::perf, missing_docs, clippy::panic, clippy::cargo)]
4#![allow(clippy::type_complexity)]
5#![cfg_attr(docsrs, feature(doc_cfg))]
6
7
8/*!
9
10# descape
11
12Provides utilities for easily parsing escape sequences in a string via [`UnescapeExt`], using [`alloc::borrow::Cow`] to only borrow when needed.
13
14This library supports many escape sequences:
15- `\\a` -> `\x07`
16- `\\b` -> `\x08`
17- `\\t` -> `\x09`
18- `\\n` -> `\x0A`
19- `\\v` -> `\x0B`
20- `\\f` -> `\x0C`
21- `\\r` -> `\x0D`
22- `\\e` -> `\x1B`
23- `\\'` -> `'`
24- `\\"` -> `"`
25- <code>&bsol;&bsol;&grave;</code> -> <code>&grave;</code>
26- `\\\\` -> `\\`
27- `\\xNN` -> `\xNN`
28- `\\o` -> `\o`, for all octal digits `o`
29- `\\oo` -> `\oo`, for all octal digits `o`
30- `\\ooo` -> `\ooo`, for all octal digits `o`
31- `\\uXXXX` -> `\u{XXXX}`
32- `\\u{HEX}` -> `\u{HEX}`
33
34Along with this, you can define your own custom escape handlers! See [`UnescapeExt::to_unescaped_with`] for more information on that.
35
This crate supports `no_std` (it only requires `alloc`).

Optionally, this crate has the `std` and `core_error` features,
to allow the error type of an invalid escape to implement the `Error` trait.

`std` uses `std::error::Error`, and `core_error` depends on `core::error::Error`, which is stable on Rust 1.81.0 or greater.
42
43*/
44
45
46#[cfg(any(feature = "std", docsrs))]
47extern crate std;
48#[cfg(any(feature = "std", docsrs))]
49use std::error::Error as ErrorTrait;
50#[cfg(all(feature = "core_error", not(feature = "std")))]
51use core::error::Error as ErrorTrait;
52
53extern crate alloc;
54
55use alloc::{
56    borrow::Cow,
57    string::{
58        String,
59        ToString
60    },
61    str::CharIndices
62};
63
// Private module holding the supertrait of `UnescapeExt`. Because the module
// itself is not `pub`, code outside this crate cannot name `Sealed`.
mod sealed {
    /// Marker supertrait required by `UnescapeExt`.
    pub trait Sealed {}
    // `str` is the only type that receives the marker.
    impl Sealed for str {}
}
68
/// An error representing an invalid escape sequence in a string.
///
/// The error is a plain value type: it is cheap to copy, comparable, hashable
/// and orderable by its [`index`](InvalidEscape::index).
#[derive(Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct InvalidEscape {
    /// The index at which the invalid escape sequence was reported.
    pub index: usize,
}
75
76impl InvalidEscape {
77    /// Constructs an invalid escape error from an index.
78    #[must_use]
79    pub const fn new(index: usize) -> Self {
80        Self { index }
81    }
82}
83
84impl core::fmt::Display for InvalidEscape {
85    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
86        write!(f, "invalid escape sequence at index {}", self.index)?;
87        Ok(())
88    }
89}
90
// Only compiled when one of the `Error` traits has been imported under the
// `ErrorTrait` alias (feature `std`, feature `core_error`, or a docs.rs build).
// The impl body is empty, so only the trait's provided methods are used.
#[cfg_attr(docsrs, doc(cfg(any(feature = "std", feature = "core_error"))))]
#[cfg(any(feature = "std", feature = "core_error", docsrs))]
impl ErrorTrait for InvalidEscape {}
94
/// A trait distinguishing an object as a handler for custom escape sequences.
///
/// For convenience, this trait is **automatically implemented** for all implementors of `FnMut` with the correct signature.
///
pub trait EscapeHandler {
    /// Definition of a custom escape handler.
    ///
    /// Custom escape handlers are called before parsing any escape sequences,
    /// once for every backslash that is followed by at least one more character,
    /// and are given 3 arguments:
    /// - `idx`: The byte index of the backslash that starts the sequence (e.g. `Hello\nthere` gets `5`)
    /// - `chr`: The character directly after the backslash (e.g. `\\n` gets `'n'`)
    /// - `iter`: A mutable reference to the underlying character iterator, positioned
    ///     just after `chr` - use this to get the rest of the string via `CharIndices::as_str`,
    ///     or get (and thereby consume) the next characters
    ///
    /// Handlers return a `Result<Option<char>, ()>`.
    /// Returning `Ok(Some(char))` replaces the sequence with the given character,
    /// returning `Ok(None)` removes the sequence entirely,
    /// and returning `Err` errors the unescaping at the index of the escape sequence.
    ///
    ///
    /// # Examples
    ///
    /// ## Permitting any escape, handing it back raw
    /// ```rust
    /// # use descape::UnescapeExt; use std::str::CharIndices;
    /// fn raw(idx: usize, chr: char, _: &mut CharIndices) -> Result<Option<char>, ()> {
    ///     Ok(Some(chr))
    /// }
    ///
    /// let escaped = r"\H\e\l\l\o \n \W\o\r\l\d";
    /// let unescaped = escaped.to_unescaped_with(raw).expect("this is fine");
    /// assert_eq!(unescaped, "Hello n World");
    /// ```
    ///
    /// ## Removing escape sequences entirely
    /// ```rust
    /// # use descape::UnescapeExt; use std::str::CharIndices;
    /// fn raw(idx: usize, chr: char, _: &mut CharIndices) -> Result<Option<char>, ()> {
    ///     Ok(None)
    /// }
    ///
    /// let escaped = r"What if I want a \nnewline?";
    /// let unescaped = escaped.to_unescaped_with(raw).expect("this should work");
    /// assert_eq!(unescaped, "What if I want a newline?");
    /// ```
    ///
    /// ## Not allowing escape sequences unsupported by Rust
    /// ```rust
    /// # use descape::{UnescapeExt, EscapeHandler}; use std::str::CharIndices;
    /// fn rust_only(idx: usize, chr: char, iter: &mut CharIndices) -> Result<Option<char>, ()> {
    ///     match chr {
    ///         'a' | 'b' | 'v' | 'f' | 'e' | '`' => Err(()),
    ///         _ => descape::DefaultHandler.escape(idx, chr, iter)
    ///     }
    /// }
    ///
    /// r"This is \nfine".to_unescaped_with(rust_only).expect(r"\n is valid");
    /// r"This is not \fine".to_unescaped_with(rust_only).expect_err(r"\f is invalid");
    /// ```
    ///
    /// # An informal note
    /// Ideally, this trait would return `Result<Option<char>, Option<Box<dyn Error>>>`, but `Error` has only been in `core`
    /// since Rust version `1.81.0`. Using it would bump the MSRV by a tremendous amount,
    /// and as such it has been left out.
    #[allow(clippy::result_unit_err, clippy::missing_errors_doc)]
    fn escape(&mut self, idx: usize, chr: char, iter: &mut CharIndices<'_>) -> Result<Option<char>, ()>;
}
163
164impl<F> EscapeHandler for F 
165    where F: for<'iter, 'source> FnMut(usize, char, &'iter mut CharIndices<'source>) -> Result<Option<char>, ()>
166{
167    fn escape(&mut self, idx: usize, chr: char, iter: &mut CharIndices<'_>) -> Result<Option<char>, ()> {
168        self(idx, chr, iter)
169    }
170}
171
/// An extension trait for [`&str`](str) to allow parsing escape sequences in strings, only copying when needed.
///
/// The trait has a private supertrait, so it cannot be implemented outside of this crate.
pub trait UnescapeExt: sealed::Sealed {

    /**
    Unescapes a string, returning an [`alloc::borrow::Cow`].
    Will only allocate if the string has any escape sequences.

    Uses [`crate::DefaultHandler`].

    # Errors
    Errors if there's an invalid escape sequence in the string.
    Passes back the byte index of the invalid escape sequence.

    # Examples
    ## Parsing an escaped string
    ```rust
    # use std::borrow::Cow; use descape::UnescapeExt;
    let escaped = "Hello,\\nworld!".to_unescaped();
    assert_eq!(
        escaped.unwrap(),
        Cow::Owned::<'_, str>("Hello,\nworld!".to_string())
    );
    ```

    ## Not allocating for a string without escapes
    ```rust
    # use std::borrow::Cow; use descape::UnescapeExt;
    let no_escapes = "No escapes here!".to_unescaped();
    assert_eq!(
        no_escapes.unwrap(),
        Cow::Borrowed("No escapes here!")
    );
    ```

    ## Erroring for invalid escapes
    ```
    //                            v  invalid at index 7
    # use std::borrow::Cow; use descape::UnescapeExt;
    let invalid_escape = r"Uh oh! \xJJ".to_unescaped();
    assert_eq!(
        invalid_escape.unwrap_err().index,
        7
    );
    ```
     */
    fn to_unescaped(&self) -> Result<Cow<'_, str>, InvalidEscape>;
    /**
    Unescapes a string using a custom escape handler. See the documentation of [`crate::EscapeHandler`] for more details.

    Like [`UnescapeExt::to_unescaped`], this borrows the input when it contains no backslash.

    # Errors

    Errors if there's an invalid escape sequence in the string,
    which includes every sequence the handler rejects by returning `Err`.
    Passes back the byte index of the invalid escape sequence.

    # Examples
    ```rust
    # use descape::UnescapeExt; use std::str::CharIndices;
    fn keep_raw(_: usize, chr: char, _: &mut CharIndices) -> Result<Option<char>, ()> {
        Ok(Some(chr))
    }

    let unescaped = r"\H\i".to_unescaped_with(keep_raw).expect("every escape is accepted");
    assert_eq!(unescaped, "Hi");
    ```

    */
    fn to_unescaped_with(
        &self,
        callback: impl EscapeHandler
    ) -> Result<Cow<'_, str>, InvalidEscape>;
}
232
233
234impl UnescapeExt for str {
235    #[inline]
236    fn to_unescaped(&self) -> Result<Cow<str>, InvalidEscape> {
237        self.to_unescaped_with(DefaultHandler)
238    }
239
240    // Put this outside to prevent monomorphization bloat
241    fn to_unescaped_with(
242        &self, 
243        mut callback: impl EscapeHandler
244    ) -> Result<Cow<str>, InvalidEscape> {
245        to_unescaped_with_mono(self, &mut callback)
246    }
247}
248
249fn to_unescaped_with_mono<'this, 'cb>(
250    this: &'this str,
251    callback: &'cb mut dyn EscapeHandler
252) -> Result<Cow<'this, str>, InvalidEscape> {
253    // Iterates over each character as a UTF-8 string slice
254    let mut iter = this.char_indices();
255    let mut seen: &'this str = "";
256    let mut owned = None::<String>;
257
258    while let Some((index, chr)) = iter.next() {
259        if chr != '\\' {
260            if let Some(owned) = &mut owned {
261                owned.push(chr);
262            } else {
263                seen = &this[..index + chr.len_utf8()];
264            }
265            continue;
266        }
267        let owned = owned.get_or_insert_with(|| {
268            let mut string = seen.to_string();
269            string.reserve_exact(this.len() - seen.len());
270            string
271        });
272        if let Some((_, chr)) = iter.next() {
273            if let Some(res) = callback.escape(index, chr, &mut iter)
274                .map_err(|()| InvalidEscape { index })?
275            {
276                owned.push(res);
277                continue;
278            }
279        } else {
280            // No matches found
281            return Err(InvalidEscape::new(owned.len()));
282        }
283    }
284
285    match owned {
286        Some(string) => Ok(Cow::Owned(string)),
287        None => Ok(Cow::Borrowed(this)),
288    }
289}
290
/// The default escape sequence handler.
///
/// This is a stateless unit struct; it implements the common value traits
/// (`Debug`, `Clone`, `Copy`, `Default`, `PartialEq`, `Eq`, `Hash`) so it can be
/// stored, printed and compared like any other plain value.
///
/// The following escapes are valid:
/// - `\\a` -> `\x07`
/// - `\\b` -> `\x08`
/// - `\\t` -> `\x09`
/// - `\\n` -> `\x0A`
/// - `\\v` -> `\x0B`
/// - `\\f` -> `\x0C`
/// - `\\r` -> `\x0D`
/// - `\\e` -> `\x1B`
/// - `\\'` -> `'`
/// - `\\"` -> `"`
/// - <code>&bsol;&bsol;&grave;</code> -> <code>&grave;</code>
/// - `\\\\` -> `\\`
/// - `\\xNN` -> `\xNN`
/// - `\\o` -> `\o`, for all octal digits `o`
/// - `\\oo` -> `\oo`, for all octal digits `o`
/// - `\\ooo` -> `\ooo`, for all octal digits `o`
/// - `\\uXXXX` -> `\u{XXXX}`
/// - `\\u{HEX}` -> `\u{HEX}`
///
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct DefaultHandler;
314
315impl EscapeHandler for DefaultHandler {
316    fn escape(&mut self, _: usize, chr: char, iter: &mut CharIndices) -> Result<Option<char>, ()> {
317        Ok( match chr {
318            'a' => Some('\x07'),
319            'b' => Some('\x08'),
320            't' => Some('\x09'),
321            'n' => Some('\x0A'),
322            'v' => Some('\x0B'),
323            'f' => Some('\x0C'),
324            'r' => Some('\x0D'),
325            'e' => Some('\x1B'),
326            '`' => Some('`'),
327            '\'' => Some('\''),
328            '"' => Some('"'),
329            '\\' => Some('\\'),
330            'u' => {
331                let (chr, skip) = unescape_unicode(iter).ok_or(())?;
332                // Skip the needed amount of characters
333                for _ in 0..skip { iter.next(); }
334                Some(chr)
335            },
336            'x' => {
337                // Skip two characters
338                let res = unescape_hex(iter).ok_or(())?;
339                iter.next();
340                iter.next();
341                Some(res)
342            },
343            c if c.is_digit(8) => {
344                let (chr, skip) = unescape_oct(c, iter).ok_or(())?;
345                for _ in 0..skip { iter.next(); }
346                Some(chr)
347            },
348            _ => return Err(()),
349        } )
350    }
351}
352
/// Decodes the payload of a `\u` escape, i.e. either `{HEX}` or exactly four hex digits.
///
/// `iter` must be positioned directly after the `u`. Exactly one character is
/// consumed from it (provided any input is left); the second element of the
/// returned tuple is the number of *further* characters the caller has to skip
/// to move past the whole sequence.
///
/// Returns `None` when the input is exhausted, the closing `}` is missing,
/// the payload contains anything but ASCII hex digits, or the value is not a
/// valid Unicode scalar value (e.g. a surrogate).
fn unescape_unicode(
    iter: &mut CharIndices<'_>
) -> Option<(char, usize)> {
    let rest = iter.as_str();
    let (_, first) = iter.next()?;
    let (digits, skip) = if first == '{' {
        // \u{HEX}: `close` is relative to the text after the `{`,
        // so the digits live in `rest[1..=close]`.
        let close = rest[1 ..].find('}')?;
        (&rest[1 ..= close], close + 1)
    } else {
        // \uNNNN: `get` fails if there are fewer than four bytes or if the
        // cut would split a multi-byte character (which is invalid anyway).
        (rest.get( .. 4)?, 3)
    };
    // `from_str_radix` tolerates a leading `+`, which is not part of any escape
    // syntax, so insist on hex digits only. This also guarantees the payload is
    // ASCII, making the byte-based `skip` a correct character count.
    if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return None;
    }
    let codepoint = u32::from_str_radix(digits, 16).ok()?;
    char::from_u32(codepoint).map(|chr| (chr, skip))
}
374
// FIXME: This could be factored out along with part of unescape_unicode into its own function.
/// Decodes the two hex digits of a `\xNN` escape.
///
/// `iter` must be positioned directly after the `x`. The iterator is only
/// inspected, never advanced; the caller is responsible for skipping the two
/// digits afterwards.
///
/// Returns `None` unless the next two bytes are both ASCII hex digits.
fn unescape_hex(
    iter: &mut CharIndices<'_>
) -> Option<char> {
    // Must be \xNN
    let digits = iter.as_str().get( .. 2)?;
    // `from_str_radix` would also accept a leading `+` (e.g. `+1`), so check
    // the digits ourselves first.
    if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return None;
    }
    let codepoint = u32::from_str_radix(digits, 16).ok()?;
    char::from_u32(codepoint)
}
386
/// Decodes an octal escape of one to three digits (`\o`, `\oo` or `\ooo`).
///
/// `chr` is the first digit, which the caller has already taken from `iter`.
/// Up to two further octal digits are read from the upcoming input without
/// advancing `iter`; their count is returned next to the decoded character
/// so the caller can skip them.
#[allow(clippy::cast_possible_truncation)] // `extra` is at most 2
fn unescape_oct(
    chr: char,
    iter: &mut CharIndices
) -> Option<(char, usize)> {
    let rest = iter.as_str();
    // How many of the next (at most two) characters are octal digits.
    // They are ASCII, so this is a byte length as well.
    let extra = rest
        .chars()
        .take(2)
        .take_while(|candidate| candidate.is_digit(8))
        .count();
    // Value of the digits _after_ the first one.
    let tail = match &rest[ .. extra] {
        "" => 0,
        digits => u32::from_str_radix(digits, 8).ok()?,
    };
    // The leading digit is the most significant one.
    let codepoint = tail + (chr as u32 - '0' as u32) * 8u32.pow(extra as u32);
    char::from_u32(codepoint).map(|decoded| (decoded, extra))
}
408