descape/lib.rs
1#![no_std]
2#![forbid(unsafe_code)]
3#![warn(clippy::pedantic, clippy::perf, missing_docs, clippy::panic, clippy::cargo)]
4#![allow(clippy::type_complexity)]
5#![cfg_attr(docsrs, feature(doc_cfg))]
6
7
8/*!
9
10# descape
11
12Provides utilities for easily parsing escape sequences in a string via [`UnescapeExt`], using [`alloc::borrow::Cow`] to only borrow when needed.
13
14This library supports many escape sequences:
15- `\\a` -> `\x07`
16- `\\b` -> `\x08`
17- `\\t` -> `\x09`
18- `\\n` -> `\x0A`
19- `\\v` -> `\x0B`
20- `\\f` -> `\x0C`
21- `\\r` -> `\x0D`
22- `\\e` -> `\x1B`
23- `\\'` -> `'`
24- `\\"` -> `"`
25- <code>\\`</code> -> <code>`</code>
26- `\\\\` -> `\\`
27- `\\xNN` -> `\xNN`
28- `\\o` -> `\o`, for all octal digits `o`
29- `\\oo` -> `\oo`, for all octal digits `o`
30- `\\ooo` -> `\ooo`, for all octal digits `o`
31- `\\uXXXX` -> `\u{XXXX}`
32- `\\u{HEX}` -> `\u{HEX}`
33
34Along with this, you can define your own custom escape handlers! See [`UnescapeExt::to_unescaped_with`] for more information on that.
35
This crate supports `no_std` (it only requires `alloc`).
37
38Optionally, this crate has the `std` and `core_error` features,
39to allow the error type of an invalid escape to implement the `Error` trait.
40
41`std` uses `std::error::Error`, and `core_error` depends on `core::error::Error`, which is stable on Rust 1.82.0 or greater.
42
43*/
44
45
46#[cfg(any(feature = "std", docsrs))]
47extern crate std;
48#[cfg(any(feature = "std", docsrs))]
49use std::error::Error as ErrorTrait;
50#[cfg(all(feature = "core_error", not(feature = "std")))]
51use core::error::Error as ErrorTrait;
52
53extern crate alloc;
54
55use alloc::{
56 borrow::Cow,
57 string::{
58 String,
59 ToString
60 },
61 str::CharIndices
62};
63
// Private module backing the sealed-trait pattern: `UnescapeExt` has `Sealed`
// as a supertrait, and because `Sealed` cannot be named outside this crate,
// downstream crates cannot implement `UnescapeExt` for their own types.
mod sealed {
    /// Marker supertrait; implemented only for the types this crate extends.
    pub trait Sealed {}

    impl Sealed for str {}
}
68
/// The error produced when a string contains an escape sequence that cannot be unescaped.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct InvalidEscape {
    /// The index of the invalid escape sequence.
    pub index: usize,
}

impl InvalidEscape {
    /// Builds an [`InvalidEscape`] pointing at `index`.
    #[must_use]
    pub const fn new(index: usize) -> Self {
        InvalidEscape { index }
    }
}

impl core::fmt::Display for InvalidEscape {
    /// Renders the error as `invalid escape sequence at index N`.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // `write!` already yields a `fmt::Result`, so it is returned as-is.
        write!(f, "invalid escape sequence at index {}", self.index)
    }
}
90
// `ErrorTrait` is the alias under which this file imports either
// `std::error::Error` or `core::error::Error`, so this impl is compiled only
// when one of those imports is enabled (or when building docs).
// The trait's methods all have defaults, hence the empty body.
#[cfg(any(feature = "std", feature = "core_error", docsrs))]
#[cfg_attr(docsrs, doc(cfg(any(feature = "std", feature = "core_error"))))]
impl ErrorTrait for InvalidEscape {}
94
/// Marks a type as a handler for custom escape sequences.
///
/// You rarely need to implement this by hand: every `FnMut` with the signature of
/// [`EscapeHandler::escape`] is **automatically** an `EscapeHandler`.
pub trait EscapeHandler {
    /// Resolves one escape sequence.
    ///
    /// The unescaper calls the handler for each escape sequence it meets, before doing
    /// any parsing of its own, and hands it 3 arguments:
    /// - `idx`: The byte index of the backslash that starts the sequence
    ///     (e.g. `Hello\nthere` gets `5`)
    /// - `chr`: The character that follows the backslash (e.g. `\\n` gets `'n'`)
    /// - `iter`: A mutable reference to the underlying character iterator, positioned
    ///     just after `chr` - look ahead with `CharIndices::as_str`,
    ///     or consume further characters with `next`
    ///
    /// Handlers return a `Result<Option<char>, ()>`:
    /// - `Ok(Some(char))` replaces the sequence with the given character,
    /// - `Ok(None)` removes the sequence entirely,
    /// - `Err(())` aborts the unescaping with an error at the index of the escape sequence.
    ///
    /// # Examples
    ///
    /// ## Permitting any escape, handing it back raw
    /// ```rust
    /// # use descape::UnescapeExt; use std::str::CharIndices;
    /// fn raw(idx: usize, chr: char, _: &mut CharIndices) -> Result<Option<char>, ()> {
    ///     Ok(Some(chr))
    /// }
    ///
    /// let escaped = r"\H\e\l\l\o \n \W\o\r\l\d";
    /// let unescaped = escaped.to_unescaped_with(raw).expect("this is fine");
    /// assert_eq!(unescaped, "Hello n World");
    /// ```
    ///
    /// ## Removing escape sequences entirely
    /// ```rust
    /// # use descape::UnescapeExt; use std::str::CharIndices;
    /// fn raw(idx: usize, chr: char, _: &mut CharIndices) -> Result<Option<char>, ()> {
    ///     Ok(None)
    /// }
    ///
    /// let escaped = r"What if I want a \nnewline?";
    /// let unescaped = escaped.to_unescaped_with(raw).expect("this should work");
    /// assert_eq!(unescaped, "What if I want a newline?");
    /// ```
    ///
    /// ## Not allowing escape sequences unsupported by Rust
    /// ```rust
    /// # use descape::{UnescapeExt, EscapeHandler}; use std::str::CharIndices;
    /// fn rust_only(idx: usize, chr: char, iter: &mut CharIndices) -> Result<Option<char>, ()> {
    ///     match chr {
    ///         'a' | 'b' | 'v' | 'f' | 'e' | '`' => Err(()),
    ///         _ => descape::DefaultHandler.escape(idx, chr, iter)
    ///     }
    /// }
    ///
    /// r"This is \nfine".to_unescaped_with(rust_only).expect(r"\n is valid");
    /// r"This is not \fine".to_unescaped_with(rust_only).expect_err(r"\f is invalid");
    /// ```
    ///
    /// # An informal note
    /// Ideally, the error type would be `Option<Box<dyn Error>>` rather than `()`,
    /// but `Error` has only been in `core` since Rust version `1.82.0`.
    /// Using it would bump the MSRV by a tremendous amount, and as such it has been left out.
    #[allow(clippy::result_unit_err, clippy::missing_errors_doc)]
    fn escape(&mut self, idx: usize, chr: char, iter: &mut CharIndices<'_>) -> Result<Option<char>, ()>;
}
163
164impl<F> EscapeHandler for F
165 where F: for<'iter, 'source> FnMut(usize, char, &'iter mut CharIndices<'source>) -> Result<Option<char>, ()>
166{
167 fn escape(&mut self, idx: usize, chr: char, iter: &mut CharIndices<'_>) -> Result<Option<char>, ()> {
168 self(idx, chr, iter)
169 }
170}
171
172/// An extension trait for [`&str`](str) to allow parsing escape sequences in strings, only copying when needed.
173pub trait UnescapeExt: sealed::Sealed {
174
175 /**
176 Unescapes a string, returning an [`alloc::borrow::Cow`].
177 Will only allocate if the string has any escape sequences.
178
179 Uses [`crate::DefaultHandler`].
180
181 # Errors
182 Errors if there's an invalid escape sequence in the string.
183 Passes back the byte index of the invalid character.
184
185 # Examples
186 ## Parsing an escaped string
187 ```rust
188 # use std::borrow::Cow; use descape::UnescapeExt;
189 let escaped = "Hello,\\nworld!".to_unescaped();
190 assert_eq!(
191 escaped.unwrap(),
192 Cow::Owned::<'_, str>("Hello,\nworld!".to_string())
193 );
194 ```
195
196 ## Not allocating for a string without escapes
197 ```rust
198 # use std::borrow::Cow; use descape::UnescapeExt;
199 let no_escapes = "No escapes here!".to_unescaped();
200 assert_eq!(
201 no_escapes.unwrap(),
202 Cow::Borrowed("No escapes here!")
203 );
204 ```
205
206 ## Erroring for invalid escapes
207 ```
208 // v invalid at index 7
209 # use std::borrow::Cow; use descape::UnescapeExt;
210 let invalid_escape = r"Uh oh! \xJJ".to_unescaped();
211 assert_eq!(
212 invalid_escape.unwrap_err().index,
213 7
214 );
215 ```
216 */
217 fn to_unescaped(&self) -> Result<Cow<'_, str>, InvalidEscape>;
218 /**
219 Unescapes a string using a custom escape handler. See the documentation of [`crate::EscapeHandler`] for more details.
220
221 # Errors
222
223 Errors if there's an invalid escape sequence in the string.
224 Passes back the byte index of the invalid character.
225
226 */
227 fn to_unescaped_with(
228 &self,
229 callback: impl EscapeHandler
230 ) -> Result<Cow<'_, str>, InvalidEscape>;
231}
232
233
234impl UnescapeExt for str {
235 #[inline]
236 fn to_unescaped(&self) -> Result<Cow<str>, InvalidEscape> {
237 self.to_unescaped_with(DefaultHandler)
238 }
239
240 // Put this outside to prevent monomorphization bloat
241 fn to_unescaped_with(
242 &self,
243 mut callback: impl EscapeHandler
244 ) -> Result<Cow<str>, InvalidEscape> {
245 to_unescaped_with_mono(self, &mut callback)
246 }
247}
248
249fn to_unescaped_with_mono<'this, 'cb>(
250 this: &'this str,
251 callback: &'cb mut dyn EscapeHandler
252) -> Result<Cow<'this, str>, InvalidEscape> {
253 // Iterates over each character as a UTF-8 string slice
254 let mut iter = this.char_indices();
255 let mut seen: &'this str = "";
256 let mut owned = None::<String>;
257
258 while let Some((index, chr)) = iter.next() {
259 if chr != '\\' {
260 if let Some(owned) = &mut owned {
261 owned.push(chr);
262 } else {
263 seen = &this[..index + chr.len_utf8()];
264 }
265 continue;
266 }
267 let owned = owned.get_or_insert_with(|| {
268 let mut string = seen.to_string();
269 string.reserve_exact(this.len() - seen.len());
270 string
271 });
272 if let Some((_, chr)) = iter.next() {
273 if let Some(res) = callback.escape(index, chr, &mut iter)
274 .map_err(|()| InvalidEscape { index })?
275 {
276 owned.push(res);
277 continue;
278 }
279 } else {
280 // No matches found
281 return Err(InvalidEscape::new(owned.len()));
282 }
283 }
284
285 match owned {
286 Some(string) => Ok(Cow::Owned(string)),
287 None => Ok(Cow::Borrowed(this)),
288 }
289}
290
/// The default escape sequence handler.
///
/// The following escapes are valid:
/// - `\\a` -> `\x07`
/// - `\\b` -> `\x08`
/// - `\\t` -> `\x09`
/// - `\\n` -> `\x0A`
/// - `\\v` -> `\x0B`
/// - `\\f` -> `\x0C`
/// - `\\r` -> `\x0D`
/// - `\\e` -> `\x1B`
/// - `\\'` -> `'`
/// - `\\"` -> `"`
/// - <code>\\`</code> -> <code>`</code>
/// - `\\\\` -> `\\`
/// - `\\xNN` -> `\xNN`
/// - `\\o` -> `\o`, for all octal digits `o`
/// - `\\oo` -> `\oo`, for all octal digits `o`
/// - `\\ooo` -> `\ooo`, for all octal digits `o`
/// - `\\uXXXX` -> `\u{XXXX}`
/// - `\\u{HEX}` -> `\u{HEX}`
///
/// The handler is a stateless unit struct, so it implements the common
/// `Debug`, `Clone`, `Copy` and `Default` traits.
#[derive(Debug, Clone, Copy, Default)]
pub struct DefaultHandler;
314
315impl EscapeHandler for DefaultHandler {
316 fn escape(&mut self, _: usize, chr: char, iter: &mut CharIndices) -> Result<Option<char>, ()> {
317 Ok( match chr {
318 'a' => Some('\x07'),
319 'b' => Some('\x08'),
320 't' => Some('\x09'),
321 'n' => Some('\x0A'),
322 'v' => Some('\x0B'),
323 'f' => Some('\x0C'),
324 'r' => Some('\x0D'),
325 'e' => Some('\x1B'),
326 '`' => Some('`'),
327 '\'' => Some('\''),
328 '"' => Some('"'),
329 '\\' => Some('\\'),
330 'u' => {
331 let (chr, skip) = unescape_unicode(iter).ok_or(())?;
332 // Skip the needed amount of characters
333 for _ in 0..skip { iter.next(); }
334 Some(chr)
335 },
336 'x' => {
337 // Skip two characters
338 let res = unescape_hex(iter).ok_or(())?;
339 iter.next();
340 iter.next();
341 Some(res)
342 },
343 c if c.is_digit(8) => {
344 let (chr, skip) = unescape_oct(c, iter).ok_or(())?;
345 for _ in 0..skip { iter.next(); }
346 Some(chr)
347 },
348 _ => return Err(()),
349 } )
350 }
351}
352
/// Decodes the payload of a `\u` escape: either `{HEX}` or exactly four hex digits.
///
/// `iter` is expected to point just past the `u`. One character is consumed from it
/// (if there is one); on success the decoded character is returned together with the
/// number of *further* characters the caller has to skip to get past the escape.
///
/// Returns `None` when the input is empty, the braces are unterminated, any digit is
/// not an ASCII hex digit, or the value is not a valid Unicode scalar value.
fn unescape_unicode(
    iter: &mut CharIndices
) -> Option<(char, usize)> {
    let rest = iter.as_str();
    let (_, first) = iter.next()?;

    let (digits, skip) = if first == '{' {
        // \u{HEX}: `end` is the offset of `}` relative to the text after `{`,
        // so the digits are `rest[1..=end]` and `end + 1` characters remain to be skipped.
        let end = rest[1..].find('}')?;
        (&rest[1..=end], end + 1)
    } else {
        // \uNNNN: a byte slice is fine because any non-ASCII content is invalid anyway.
        // The first digit has already been consumed above, leaving three to skip.
        (rest.get(..4)?, 3)
    };

    // `from_str_radix` would also accept a leading `+`, which is not a hex digit.
    if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return None;
    }
    let codepoint = u32::from_str_radix(digits, 16).ok()?;
    char::from_u32(codepoint).map(|chr| (chr, skip))
}
374
// FIXME: This could be factored out along with part of unescape_unicode into its own function.
/// Decodes the two hex digits of a `\xNN` escape.
///
/// `iter` is expected to point just past the `x`. The iterator is only peeked, never
/// advanced, so the caller still has to skip the two digits on success.
///
/// Returns `None` unless the next two bytes are both ASCII hex digits.
fn unescape_hex(
    iter: &mut CharIndices
) -> Option<char> {
    // Must be \xNN. `get` also rejects a cut through a multi-byte character.
    let digits = iter.as_str().get(..2)?;
    // `from_str_radix` would also accept a leading `+`, which is not a hex digit.
    if !digits.bytes().all(|byte| byte.is_ascii_hexdigit()) {
        return None;
    }
    let codepoint = u32::from_str_radix(digits, 16).ok()?;
    char::from_u32(codepoint)
}
386
/// Decodes an octal escape of the form `\o`, `\oo` or `\ooo`.
///
/// `chr` is the first octal digit (already consumed by the caller) and `iter` points
/// just past it. The iterator is only peeked. The returned count is how many of the
/// following characters (0, 1 or 2) were octal digits belonging to the escape, which
/// the caller still has to skip.
fn unescape_oct(
    chr: char,
    iter: &mut CharIndices
) -> Option<(char, usize)> {
    let rest = iter.as_str();

    // At most two more octal digits may follow the first one.
    let extra = rest
        .chars()
        .take(2)
        .take_while(|candidate| candidate.is_digit(8))
        .count();

    // Octal digits are ASCII, so the character count doubles as a byte length.
    // Start from the leading digit and shift in each following digit, base 8.
    let leading = chr as u32 - '0' as u32;
    let codepoint = rest[..extra]
        .chars()
        .fold(leading, |value, digit| value * 8 + (digit as u32 - '0' as u32));

    char::from_u32(codepoint).map(|decoded| (decoded, extra))
}
408