Skip to main content

eml_codec/text/
quoted.rs

1#[cfg(feature = "arbitrary")]
2use arbitrary::Arbitrary;
3use bounded_static::ToStatic;
4use nom::{
5    branch::alt,
6    bytes::complete::{tag, take, take_while1},
7    combinator::{map, opt, verify},
8    multi::many0,
9    sequence::{delimited, pair, preceded},
10    IResult,
11};
12use std::borrow::Cow;
13use std::fmt;
14#[cfg(feature = "arbitrary")]
15use std::ops::ControlFlow;
16#[cfg(feature = "tracing")]
17use tracing::warn;
18
19use crate::i18n::ContainsUtf8;
20use crate::print::{Formatter, Print, ToStringFromPrint};
21use crate::text::ascii;
22use crate::text::utf8::{is_nonascii_or, take_utf8_while1};
23use crate::text::whitespace::{cfws, fws, is_obs_no_ws_ctl};
24use crate::text::words::is_vchar;
25#[cfg(feature = "tracing-recover")]
26use crate::utils::bytes_to_trace_string;
27#[cfg(feature = "arbitrary")]
28use crate::{arbitrary_utils::arbitrary_string_where, fuzz_eq::FuzzEq};
29use eml_codec_derives::instrument_input;
30
31// A quoted string contains bytes that satisfy `is_vchar` or are in `ascii::WS`.
32#[derive(Clone, ContainsUtf8, PartialEq, Default, ToStatic, ToStringFromPrint)]
33pub struct QuotedString<'a>(pub Vec<Cow<'a, str>>);
34
35impl<'a> fmt::Debug for QuotedString<'a> {
36    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
37        fmt.debug_tuple("QuotedString")
38            .field(&self.0.iter().collect::<Vec<_>>())
39            .finish()
40    }
41}
42
43impl<'a> QuotedString<'a> {
44    pub fn push_str(&mut self, e: &'a str) {
45        self.0.push(Cow::Borrowed(e))
46    }
47
48    pub fn push(&mut self, e: Cow<'a, str>) {
49        self.0.push(e)
50    }
51
52    pub fn chars<'b>(&'b self) -> QuotedStringChars<'a, 'b> {
53        QuotedStringChars {
54            q: self,
55            inner: QuotedStringCharsInner::NextFragment(0),
56        }
57    }
58}
59impl<'a> Print for QuotedString<'a> {
60    fn print(&self, fmt: &mut impl Formatter) {
61        print_quoted(fmt, self.chars())
62    }
63}
64
#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for QuotedString<'a> {
    /// Generates a quoted string made of at most 10 fragments, each
    /// restricted to characters that are `is_vchar` or in `ascii::WS_CHAR`.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        let mut fragments = Vec::new();
        u.arbitrary_loop(None, Some(10), |u| {
            let fragment =
                arbitrary_string_where(u, |c| is_vchar(c) || ascii::WS_CHAR.contains(&c))?;
            fragments.push(Cow::Owned(fragment));
            Ok(ControlFlow::Continue(()))
        })?;
        Ok(QuotedString(fragments))
    }
}
77
#[cfg(feature = "arbitrary")]
impl<'a> FuzzEq for QuotedString<'a> {
    /// Two quoted strings are equivalent when they yield the same sequence
    /// of characters, regardless of how that sequence is split into
    /// fragments.
    fn fuzz_eq(&self, other: &Self) -> bool {
        let mine = self.chars();
        let theirs = other.chars();
        mine.eq(theirs)
    }
}
84
85#[derive(Clone)]
86pub struct QuotedStringChars<'a, 'b> {
87    q: &'b QuotedString<'a>,
88    inner: QuotedStringCharsInner<'b>,
89}
/// Cursor state of a `QuotedStringChars` iterator.
#[derive(Clone)]
enum QuotedStringCharsInner<'a> {
    /// The fragment at the given index has not been opened yet.
    NextFragment(usize),
    /// Iterating inside the fragment at the given index.
    FragmentChars(usize, std::str::Chars<'a>),
}
95
96impl<'a, 'b> Iterator for QuotedStringChars<'a, 'b> {
97    type Item = char;
98    fn next(&mut self) -> Option<Self::Item> {
99        match &mut self.inner {
100            QuotedStringCharsInner::NextFragment(idx) => match self.q.0.get(*idx) {
101                Some(frag) => {
102                    self.inner = QuotedStringCharsInner::FragmentChars(*idx, frag.chars());
103                    self.next()
104                }
105                None => None,
106            },
107            QuotedStringCharsInner::FragmentChars(idx, it) => match it.next() {
108                Some(c) => Some(c),
109                None => {
110                    self.inner = QuotedStringCharsInner::NextFragment(*idx + 1);
111                    self.next()
112                }
113            },
114        }
115    }
116}
117
118/// Quoted pair
119///
120/// ```abnf
121///    quoted-pair     =   ("\" (VCHAR / WSP)) / obs-qp
122///    obs-qp          =   "\" (%d0 / obs-NO-WS-CTL / LF / CR)
123/// ```
124/// We parse quoted pairs even more liberally, allowing any ASCII byte after
125/// the backslash.
126///
127/// However, we only return `Some(_)` for quoted pairs that are valid
128/// according to the strict syntax; other quoted pairs cannot be printed
129/// back and we chose to ignore them.
130pub fn quoted_pair(input: &[u8]) -> IResult<&[u8], Option<&str>> {
131    preceded(
132        tag(&[ascii::BACKSLASH]),
133        map(
134            verify(take(1usize), |b: &[u8]| b[0].is_ascii()),
135            |s: &[u8]| {
136                let b = s[0];
137                if is_strict_quoted_pair(b.into()) {
138                    // SAFETY: from the combinators above (take and verify), we
139                    // know that `b` contains a single ASCII character.
140                    Some(unsafe { str::from_utf8_unchecked(s) })
141                } else {
142                    if !(b == ascii::NULL
143                        || is_obs_no_ws_ctl(b)
144                        || b == ascii::LF
145                        || b == ascii::CR)
146                    {
147                        #[cfg(feature = "tracing-recover")]
148                        warn!(byte = %bytes_to_trace_string(&[b]),
149                                  "invalid quoted pair")
150                    }
151                    None
152                }
153            },
154        ),
155    )(input)
156}
157
158fn is_strict_quoted_pair(c: char) -> bool {
159    is_vchar(c) || ascii::WS_CHAR.contains(&c)
160}
161
162/// Allowed characters in quote
163///
164/// ```abnf
165///   qtext           =   %d33 /             ; Printable US-ASCII
166///                       %d35-91 /          ;  characters not including
167///                       %d93-126 /         ;  "\" or the quote character
168///                       obs-qtext
169/// ```
170/// following RFC6532, also allows non-ascii UTF-8
171fn is_strict_qtext(c: char) -> bool {
172    is_nonascii_or(|c| {
173        c == ascii::EXCLAMATION
174            || (ascii::NUM..=ascii::LEFT_BRACKET).contains(&c)
175            || (ascii::RIGHT_BRACKET..=ascii::TILDE).contains(&c)
176    })(c)
177}
178
179fn is_obs_qtext(c: u8) -> bool {
180    is_obs_no_ws_ctl(c)
181}
182
183/// Quoted pair content
184///
185/// ```abnf
186///   qcontent        =   qtext / quoted-pair
187/// ```
188///
189/// Like for `quoted_pair`, this supports the obsolete syntax but
190/// returns `None` in this case.
191#[instrument_input("tracing")]
192fn qcontent(input: &[u8]) -> IResult<&[u8], Option<Cow<'_, str>>> {
193    alt((
194        map(take_utf8_while1(is_strict_qtext), Some),
195        map(take_while1(is_obs_qtext), |_| None),
196        map(quoted_pair, |qp| qp.map(Cow::Borrowed)),
197    ))(input)
198}
199
200/// Quoted string
201///
202/// ```abnf
203/// quoted-string   =   [CFWS]
204///                     DQUOTE *([FWS] qcontent) [FWS] DQUOTE
205///                     [CFWS]
206/// ```
207#[instrument_input("tracing")]
208pub fn quoted_string(input: &[u8]) -> IResult<&[u8], QuotedString<'_>> {
209    delimited(opt(cfws), quoted_string_plain, opt(cfws))(input)
210}
211pub fn quoted_string_plain(input: &[u8]) -> IResult<&[u8], QuotedString<'_>> {
212    let (input, _) = tag("\"")(input)?;
213    let (input, content) = many0(pair(opt(fws), qcontent))(input)?;
214    let (input, maybe_wsp) = opt(fws)(input)?;
215    let (input, _) = tag("\"")(input)?;
216
217    // Rebuild string
218    let mut qstring =
219        content
220            .into_iter()
221            .fold(QuotedString::default(), |mut acc, (maybe_wsp, c)| {
222                for wsp in maybe_wsp.into_iter().flat_map(|v| v.into_iter()) {
223                    acc.push_str(wsp);
224                }
225                if let Some(c) = c {
226                    acc.push(c);
227                }
228                acc
229            });
230
231    for wsp in maybe_wsp.into_iter().flat_map(|v| v.into_iter()) {
232        qstring.push_str(wsp);
233    }
234
235    Ok((input, qstring))
236}
237
238pub fn print_quoted<I>(fmt: &mut impl Formatter, data: I)
239where
240    I: IntoIterator<Item = char>,
241{
242    let mut buf = [0u8; 4];
243    fmt.write_bytes(b"\"");
244    for c in data.into_iter() {
245        let b = c.encode_utf8(&mut buf).as_bytes();
246        if is_strict_qtext(c) {
247            fmt.write_bytes(b);
248        } else if ascii::WS_CHAR.contains(&c) {
249            // NOTE: we can either output the whitespace as folding
250            // whitespace or to escape it; we choose to output it as folding
251            // whitespace which helps performing line folding.
252            fmt.write_fws_bytes(b);
253        } else if is_vchar(c) {
254            fmt.write_bytes(b"\\");
255            fmt.write_bytes(b);
256        } else {
257            // RFC5322 does not allow escaping bytes other than VCHAR in
258            // quoted strings. We drop them.
259            // NOTE: this case shouldn't happen in practice, because
260            // non-displayable quoted pairs are already dropped during
261            // parsing...
262            // TODO: return the invalid input bytes that were skipped.
263        }
264    }
265    fmt.write_bytes(b"\"")
266}
267
#[cfg(test)]
mod tests {
    use super::*;
    use crate::print::tests::print_to_vec_with;

    /// Builds a `QuotedString` out of borrowed fragments.
    fn qs<'a>(fragments: &[&'a str]) -> QuotedString<'a> {
        QuotedString(fragments.iter().copied().map(Cow::Borrowed).collect())
    }

    #[test]
    fn test_quoted_string_parser() {
        // Escaped double quote, with surrounding whitespace.
        let (_, escaped) = quoted_string(b" \"hello\\\"world\" ").unwrap();
        assert_eq!(escaped, qs(&["hello", "\"", "world"]));

        // Folding whitespace inside the quotes.
        assert_eq!(
            quoted_string(b"\"hello\r\n world\""),
            Ok((&b""[..], qs(&["hello", " ", "world"]))),
        );

        // Only whitespace between the quotes.
        assert_eq!(quoted_string(b"\"\t\""), Ok((&b""[..], qs(&["\t"]))));
    }

    #[test]
    fn test_quoted_string_printer() {
        let printed = print_to_vec_with(|f| {
            let quoted = qs(&["hello", "\"", " world"]);
            print_quoted(f, quoted.chars());
        });
        assert_eq!(printed, b"\"hello\\\" world\"");
    }

    #[test]
    fn test_quoted_string_object() {
        let rendered = qs(&["hello", " ", "world"]).to_string();
        assert_eq!(rendered, "\"hello world\"".to_string());
    }
}