Skip to main content

escaping/
lib.rs

//! Escaping provides general round-trippable string escaping. Build an `Escape`
//! with either `new` or `const_new`.
3use anyhow::{bail, Result};
4use compact_str::CompactString;
5use std::borrow::Cow;
6
7#[cfg(test)]
8mod test;
9
10#[derive(Debug, Clone)]
11pub struct Escape {
12    escape_char: char,
13    escape: Box<[char]>,
14    tr: Box<[(char, CompactString)]>,
15    generic: Option<fn(char) -> bool>,
16}
17
/// Stateful predicate used while scanning a string char by char: return true
/// if `c` is an unescaped occurrence of `sep`.
///
/// `esc` carries the "previous char was an unescaped escape char" flag from
/// one call to the next and must start out `false`. The flag is consumed by
/// every character, including an escaped separator, so an escape only ever
/// applies to the single character that follows it.
fn is_sep(esc: &mut bool, escape_char: char, c: char, sep: char) -> bool {
    // Read and clear the flag in one step; whatever `c` is, it uses up the
    // pending escape.
    let was_escaped = std::mem::take(esc);
    if c == sep {
        !was_escaped
    } else {
        // An escape char only starts a new escape if it is not itself escaped.
        *esc = c == escape_char && !was_escaped;
        false
    }
}
26
27impl Escape {
28    /// return the escape char
29    pub fn get_escape_char(&self) -> char {
30        self.escape_char
31    }
32
33    /// return the set of escaped chars
34    pub fn get_escaped(&self) -> &[char] {
35        &self.escape
36    }
37
38    /// return the translations
39    pub fn get_tr(&self) -> &[(char, CompactString)] {
40        &self.tr
41    }
42
43    /// Create a new Escape, return an error if the folowing invariants are violated
44    /// - the escape array must contain the escape_char.
45    /// - the escape array must contain every first char in tr
46    /// - the escape char, and the target tr char must be ascii
47    /// - translation key may not be the escape char
48    /// - translation targets must be ascii,
49    /// - translation targets must be unique
50    /// - translation targets may not be empty
51    /// - translation targets may not start with u
52    /// - translation targets may not contain the escape char
53    ///
54    /// `escape` is the list of characters that will be escaped when you call `escape`
55    ///
56    /// `tr` is the set of characters that are translated when escaped. For
57    /// example the newline character might translate to \n. The original
58    /// character is first followed by the escaped translation. e.g. [('\n',
59    /// 'n')] for newline to \n translation.
60    ///
61    /// `generic`, if specified, will be called for each char, if it returns true,
62    /// then the character will be translated to it's unicode escape sequence
63    pub fn new(
64        escape_char: char,
65        escape: &[char],
66        tr: &[(char, &str)],
67        generic: Option<fn(char) -> bool>,
68    ) -> Result<Self> {
69        if !escape_char.is_ascii() {
70            bail!("the escape char must be ascii")
71        }
72        if !escape.contains(&escape_char) {
73            bail!("the escape slice must contain the escape character")
74        }
75        for (i, (c, s)) in tr.iter().enumerate() {
76            if *c == escape_char {
77                bail!("you cannot translate the escape char")
78            }
79            if s.len() == 0 {
80                bail!("translation targets may not be empty")
81            }
82            if !s.is_ascii() {
83                bail!("translation targets must be ascii")
84            }
85            if s.starts_with("u") {
86                bail!("translation targets must not start with u")
87            }
88            if s.contains(escape_char) {
89                bail!("translation targets may not contain the escape char")
90            }
91            if !escape.contains(&c) {
92                bail!("the escape array must contain every translation key")
93            }
94            for (j, (c1, s1)) in tr.iter().enumerate() {
95                if i != j {
96                    if c == c1 {
97                        bail!("duplicate translation key {c}")
98                    }
99                    if s == s1 {
100                        bail!("duplicate translation target {s}")
101                    }
102                }
103            }
104        }
105        Ok(Self {
106            escape_char,
107            escape: Box::from(escape),
108            tr: Box::from_iter(tr.iter().map(|(c, s)| (*c, CompactString::new(s)))),
109            generic,
110        })
111    }
112
113    /// Escape the string and place the results into the buffer
114    pub fn escape_to<T>(&self, s: &T, buf: &mut String)
115    where
116        T: AsRef<str> + ?Sized,
117    {
118        for c in s.as_ref().chars() {
119            if self.escape.contains(&c) {
120                buf.push(self.escape_char);
121                match self
122                    .tr
123                    .iter()
124                    .find_map(|(s, e)| if c == *s { Some(e) } else { None })
125                {
126                    Some(e) => buf.push_str(e),
127                    None => buf.push(c),
128                }
129            } else if let Some(generic) = &self.generic
130                && (generic)(c)
131            {
132                use std::fmt::Write;
133                buf.push(self.escape_char);
134                write!(buf, "u{{{:x}}}", c as u32).unwrap();
135            } else {
136                buf.push(c);
137            }
138        }
139    }
140
141    /// Escape the string, or return it unmodifed if it did not need
142    /// to be escaped
143    pub fn escape<'a, T>(&self, s: &'a T) -> Cow<'a, str>
144    where
145        T: AsRef<str> + ?Sized,
146    {
147        let s = s.as_ref();
148        let mut to_escape = 0;
149        for c in s.chars() {
150            if self.escape.contains(&c)
151                || self.generic.as_ref().map(|f| (f)(c)).unwrap_or(false)
152            {
153                to_escape += 1
154            }
155        }
156        if to_escape == 0 {
157            Cow::Borrowed(s.as_ref())
158        } else {
159            let mut out = String::with_capacity(s.len() + to_escape);
160            self.escape_to(s, &mut out);
161            Cow::Owned(out)
162        }
163    }
164
165    /// Unescape the string and place the result in the buffer.
166    pub fn unescape_to<T>(&self, s: &T, buf: &mut String)
167    where
168        T: AsRef<str> + ?Sized,
169    {
170        fn parse_unicode_escape_seq(s: &str) -> Option<(usize, char)> {
171            if !s.starts_with("u{") {
172                return None;
173            }
174            let i = s.find('}')?;
175            let n = u32::from_str_radix(&s[2..i], 16).ok()?;
176            let c = char::from_u32(n)?;
177            Some((i + 1, c))
178        }
179        let mut escaped = false;
180        let mut skip_to = 0;
181        let s = s.as_ref();
182        buf.extend(s.char_indices().filter_map(|(i, c)| {
183            if i < skip_to {
184                None
185            } else if c == self.escape_char && !escaped {
186                escaped = true;
187                None
188            } else if escaped {
189                escaped = false;
190                for (v, k) in &self.tr {
191                    if s[i..].starts_with(k.as_str()) {
192                        skip_to = i + k.len();
193                        return Some(*v);
194                    }
195                }
196                if let Some((j, c)) = parse_unicode_escape_seq(&s[i..]) {
197                    skip_to = i + j;
198                    return Some(c);
199                }
200                Some(c)
201            } else {
202                Some(c)
203            }
204        }))
205    }
206
207    /// Unescape the string, or return it unmodified if it did not need to be
208    /// unescaped
209    pub fn unescape<'a, T>(&self, s: &'a T) -> Cow<'a, str>
210    where
211        T: AsRef<str> + ?Sized,
212    {
213        let s = s.as_ref();
214        if !s.contains(self.escape_char) {
215            Cow::Borrowed(s.as_ref())
216        } else {
217            let mut res = String::with_capacity(s.len());
218            self.unescape_to(s, &mut res);
219            Cow::Owned(res)
220        }
221    }
222
223    /// return true if the char at the `i` is escaped. Return true if `i` is
224    /// not a valid char boundary
225    pub fn is_escaped<T>(&self, s: &T, i: usize) -> bool
226    where
227        T: AsRef<str> + ?Sized,
228    {
229        is_escaped(s, self.escape_char, i)
230    }
231
232    /// split the string into at most `n` parts separated by non escaped
233    /// instances of `sep` and return an iterator over the parts
234    pub fn splitn<'a, T>(
235        &self,
236        s: &'a T,
237        n: usize,
238        sep: char,
239    ) -> impl Iterator<Item = &'a str> + use<'a, T>
240    where
241        T: AsRef<str> + ?Sized,
242    {
243        splitn(s, self.escape_char, n, sep)
244    }
245
246    /// split the string into parts separated by non escaped instances of `sep`
247    /// and return an iterator over the parts
248    pub fn split<'a, T>(
249        &self,
250        s: &'a T,
251        sep: char,
252    ) -> impl Iterator<Item = &'a str> + use<'a, T>
253    where
254        T: AsRef<str> + ?Sized,
255    {
256        split(s, self.escape_char, sep)
257    }
258}
259
/// Return true if the char at byte offset `i` of `s` is escaped with
/// `escape_char`, i.e. it is directly preceded by an odd number of
/// consecutive escape chars. Also returns true if `i` is not a valid char
/// boundary of `s` (which includes `i > s.len()`).
///
/// `escape_char` may be any character, including non-ASCII ones.
pub fn is_escaped<T>(s: &T, escape_char: char, i: usize) -> bool
where
    T: AsRef<str> + ?Sized,
{
    let s = s.as_ref();
    if !s.is_char_boundary(i) {
        return true;
    }
    // Compare whole chars rather than bytes so a multi-byte escape char is
    // recognised and never confused with an unrelated ASCII byte.
    let run = s[..i].chars().rev().take_while(|c| *c == escape_char).count();
    // an even run is made of escaped escape chars and escapes nothing
    run % 2 == 1
}
280
281/// split the string into at most `n` parts separated by non escaped
282/// instances of `sep` and return an iterator over the parts
283pub fn splitn<'a, T>(
284    s: &'a T,
285    escape_char: char,
286    n: usize,
287    sep: char,
288) -> impl Iterator<Item = &'a str> + use<'a, T>
289where
290    T: AsRef<str> + ?Sized,
291{
292    s.as_ref().splitn(n, {
293        let mut esc = false;
294        move |c| is_sep(&mut esc, escape_char, c, sep)
295    })
296}
297
298/// split the string into parts separated by non escaped instances of `sep`
299/// and return an iterator over the parts
300pub fn split<'a, T>(
301    s: &'a T,
302    escape_char: char,
303    sep: char,
304) -> impl Iterator<Item = &'a str> + use<'a, T>
305where
306    T: AsRef<str> + ?Sized,
307{
308    s.as_ref().split({
309        let mut esc = false;
310        move |c| is_sep(&mut esc, escape_char, c, sep)
311    })
312}