Skip to main content

eml_codec/text/
words.rs

1use crate::i18n::ContainsUtf8;
2use crate::print::{Formatter, Print, ToStringFromPrint};
3use crate::text::ascii;
4use crate::text::utf8::{is_nonascii_or, take_utf8_while1};
5use crate::text::whitespace::cfws;
6#[cfg(feature = "arbitrary")]
7use crate::{
8    arbitrary_utils::{arbitrary_string_nonempty_where, arbitrary_vec_nonempty_where},
9    fuzz_eq::FuzzEq,
10};
11#[cfg(feature = "arbitrary")]
12use arbitrary::Arbitrary;
13use bounded_static::ToStatic;
14use eml_codec_derives::instrument_input;
15use nom::{
16    bytes::complete::{tag, take_while1},
17    character::is_alphanumeric,
18    combinator::{map, opt, recognize},
19    multi::many0,
20    sequence::{delimited, pair},
21    IResult,
22};
23use std::borrow::Cow;
24use std::fmt;
25
26/// Printable characters
27///
28/// following RFC6532, this includes non-ascii UTF8 text
29pub fn is_vchar(c: char) -> bool {
30    is_nonascii_or(|c| (ascii::EXCLAMATION..=ascii::TILDE).contains(&c))(c)
31}
32
33/// A MIME atom.
34// Contains a non-zero amount of bytes that satisfy `is_mime_atom_text`.
35#[derive(Clone, ContainsUtf8, PartialEq, Default, ToStatic, ToStringFromPrint)]
36#[contains_utf8(false)]
37pub struct MIMEAtom<'a>(pub Cow<'a, [u8]>);
38
39impl<'a> fmt::Debug for MIMEAtom<'a> {
40    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
41        fmt.debug_tuple("MIMEAtom")
42            .field(&String::from_utf8_lossy(&self.0))
43            .finish()
44    }
45}
46impl<'a> Print for MIMEAtom<'a> {
47    fn print(&self, fmt: &mut impl Formatter) {
48        fmt.write_bytes(&self.0)
49    }
50}
#[cfg(feature = "arbitrary")]
impl<'a, 'b> Arbitrary<'a> for MIMEAtom<'b> {
    /// Builds an owned atom from fuzzer input.
    ///
    /// The bytes come from `arbitrary_vec_nonempty_where`, restricted to
    /// bytes accepted by `is_mime_atom_text`; `b'X'` is handed to that helper
    /// (presumably as the fallback byte — confirm against the helper).
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        let atom_bytes = arbitrary_vec_nonempty_where(u, |c| is_mime_atom_text(*c), b'X')?;
        Ok(Self(Cow::Owned(atom_bytes)))
    }
}
#[cfg(feature = "arbitrary")]
impl<'a> FuzzEq for MIMEAtom<'a> {
    /// Fuzz-equality for MIME atoms is plain `PartialEq` equality.
    fn fuzz_eq(&self, other: &Self) -> bool {
        PartialEq::eq(self, other)
    }
}
64impl<'a> MIMEAtom<'a> {
65    pub fn chars<'b>(&'b self) -> MIMEAtomChars<'a, 'b> {
66        MIMEAtomChars { a: self, idx: 0 }
67    }
68}
69#[derive(Clone)]
70pub struct MIMEAtomChars<'a, 'b> {
71    a: &'b MIMEAtom<'a>,
72    idx: usize,
73}
74impl<'a, 'b> Iterator for MIMEAtomChars<'a, 'b> {
75    type Item = char;
76    fn next(&mut self) -> Option<Self::Item> {
77        if self.idx < self.a.0.len() {
78            let c: u8 = self.a.0[self.idx];
79            self.idx += 1;
80            Some(c.into())
81        } else {
82            None
83        }
84    }
85}
86
87/// MIME Token allowed characters
88///
89/// forbidden: ()<>@,;:\"/[]?=
90pub fn is_mime_atom_text(c: u8) -> bool {
91    is_alphanumeric(c)
92        || c == ascii::EXCLAMATION
93        || c == ascii::NUM
94        || c == ascii::DOLLAR
95        || c == ascii::PERCENT
96        || c == ascii::AMPERSAND
97        || c == ascii::SQUOTE
98        || c == ascii::ASTERISK
99        || c == ascii::PLUS
100        || c == ascii::MINUS
101        || c == ascii::PERIOD
102        || c == ascii::CARET
103        || c == ascii::UNDERSCORE
104        || c == ascii::GRAVE
105        || c == ascii::LEFT_CURLY
106        || c == ascii::PIPE
107        || c == ascii::RIGHT_CURLY
108        || c == ascii::TILDE
109}
110
111/// MIME Token
112///
113/// `[CFWS] 1*token_text [CFWS]`
114#[instrument_input("tracing")]
115pub fn mime_atom(input: &[u8]) -> IResult<&[u8], MIMEAtom<'_>> {
116    delimited(opt(cfws), mime_atom_plain, opt(cfws))(input)
117}
118
119/// `1*token_text`
120pub fn mime_atom_plain(input: &[u8]) -> IResult<&[u8], MIMEAtom<'_>> {
121    map(take_while1(is_mime_atom_text), |b: &[u8]| {
122        MIMEAtom(b.into())
123    })(input)
124}
125
126/// An IMF atom.
127// Contains a non-zero amount of bytes that satisfy `is_atext`.
128#[derive(Clone, ContainsUtf8, Debug, PartialEq, ToStatic, ToStringFromPrint)]
129pub struct Atom<'a>(pub Cow<'a, str>);
130
131impl<'a> Print for Atom<'a> {
132    fn print(&self, fmt: &mut impl Formatter) {
133        fmt.write_bytes(self.0.as_bytes())
134    }
135}
#[cfg(feature = "arbitrary")]
impl<'a, 'b> Arbitrary<'a> for Atom<'b> {
    /// Builds an owned atom from fuzzer input.
    ///
    /// The text comes from `arbitrary_string_nonempty_where`, restricted to
    /// characters accepted by `is_atext`; `'X'` is handed to that helper
    /// (presumably as the fallback character — confirm against the helper).
    ///
    /// The result is a `Cow::Owned`, so its lifetime `'b` is independent of
    /// the lifetime `'a` of the `Unstructured` input, as for `MIMEAtom`.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        let text = arbitrary_string_nonempty_where(u, is_atext, 'X')?;
        Ok(Atom(Cow::Owned(text)))
    }
}
#[cfg(feature = "arbitrary")]
impl<'a> FuzzEq for Atom<'a> {
    /// Fuzz-equality for atoms is plain `PartialEq` equality.
    fn fuzz_eq(&self, other: &Self) -> bool {
        PartialEq::eq(self, other)
    }
}
149
150/// Atom allowed characters
151///
152/// authorized: !#$%&'*+-/=?^_`{|}~
153///
154/// following RFC6532, atext also allows non-ascii UTF8 characters
155pub fn is_atext(c: char) -> bool {
156    is_nonascii_or(|c| {
157        is_alphanumeric(c)
158            || c == ascii::EXCLAMATION
159            || c == ascii::NUM
160            || c == ascii::DOLLAR
161            || c == ascii::PERCENT
162            || c == ascii::AMPERSAND
163            || c == ascii::SQUOTE
164            || c == ascii::ASTERISK
165            || c == ascii::PLUS
166            || c == ascii::MINUS
167            || c == ascii::SLASH
168            || c == ascii::EQ
169            || c == ascii::QUESTION
170            || c == ascii::CARET
171            || c == ascii::UNDERSCORE
172            || c == ascii::GRAVE
173            || c == ascii::LEFT_CURLY
174            || c == ascii::PIPE
175            || c == ascii::RIGHT_CURLY
176            || c == ascii::TILDE
177    })(c)
178}
179
180/// Atom
181///
182/// `[CFWS] 1*atext [CFWS]`
183#[instrument_input("tracing")]
184pub fn atom(input: &[u8]) -> IResult<&[u8], Atom<'_>> {
185    map(
186        delimited(opt(cfws), take_utf8_while1(is_atext), opt(cfws)),
187        Atom,
188    )(input)
189}
190
191/// An IMF dot-atom.
192// Only contains bytes that satisfy is_atext or are '.'.
193#[derive(Clone, ContainsUtf8, Debug, PartialEq, ToStatic, ToStringFromPrint)]
194pub struct DotAtom<'a>(pub Cow<'a, str>);
195
196impl<'a> Print for DotAtom<'a> {
197    fn print(&self, fmt: &mut impl Formatter) {
198        fmt.write_bytes(self.0.as_bytes())
199    }
200}
#[cfg(feature = "arbitrary")]
impl<'a, 'b> Arbitrary<'a> for DotAtom<'b> {
    /// Builds an owned dot-atom from fuzzer input.
    ///
    /// The value is one run of `atext` followed by zero to three
    /// `"." atext-run` groups, mirroring `1*atext *("." 1*atext)`. Each run
    /// comes from `arbitrary_string_nonempty_where`; `'X'` is handed to that
    /// helper (presumably as the fallback character — confirm against the
    /// helper).
    ///
    /// The result is a `Cow::Owned`, so its lifetime `'b` is independent of
    /// the lifetime `'a` of the `Unstructured` input, as for `MIMEAtom`.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        let mut s = arbitrary_string_nonempty_where(u, is_atext, 'X')?;
        for _ in 0..u.int_in_range(0..=3)? {
            s.push('.');
            s.push_str(&arbitrary_string_nonempty_where(u, is_atext, 'X')?);
        }
        Ok(DotAtom(Cow::Owned(s)))
    }
}
#[cfg(feature = "arbitrary")]
impl<'a> FuzzEq for DotAtom<'a> {
    /// Fuzz-equality for dot-atoms is plain `PartialEq` equality.
    fn fuzz_eq(&self, other: &Self) -> bool {
        PartialEq::eq(self, other)
    }
}
218
219/// dot-atom-text
220///
221/// `1*atext *("." 1*atext)`
222pub fn dot_atom_text(input: &[u8]) -> IResult<&[u8], DotAtom<'_>> {
223    map(
224        recognize(pair(
225            take_utf8_while1(is_atext),
226            many0(pair(tag("."), take_utf8_while1(is_atext))),
227        )),
228        |b: &[u8]| {
229            // SAFETY: `b` is composed of bytes recognized by
230            // `take_utf8_while1()` and dots ("."). Both are guaranteed to be
231            // valid UTF-8.
232            let s = unsafe { str::from_utf8_unchecked(b) };
233            DotAtom(s.into())
234        },
235    )(input)
236}
237
238/// dot-atom
239///
240/// `[CFWS] dot-atom-text [CFWS]`
241#[instrument_input("tracing")]
242pub fn dot_atom(input: &[u8]) -> IResult<&[u8], DotAtom<'_>> {
243    delimited(opt(cfws), dot_atom_text, opt(cfws))(input)
244}
245
#[cfg(test)]
mod tests {
    use super::*;

    /// `is_atext` accepts symbols, digits, letters and non-ASCII characters,
    /// and rejects a plain space.
    #[test]
    fn test_atext() {
        for accepted in ['=', '5', 'Q'] {
            assert!(is_atext(accepted));
        }
        assert!(!is_atext(' '));
        // non-ascii UTF8 is allowed (RFC6532)
        assert!(is_atext('É'));
    }

    /// `atom` drops the comments and whitespace around the atom and stops
    /// right before the next word.
    #[test]
    fn test_atom() {
        let parsed = atom(b"(skip)  imf_codec (hidden) aerogramme");
        assert_eq!(
            parsed,
            Ok((&b"aerogramme"[..], Atom("imf_codec".into())))
        );
    }

    /// `dot_atom_text` stops at the first byte that is neither atext nor a
    /// dot, leaving the following whitespace unconsumed.
    #[test]
    fn test_dot_atom_text() {
        let parsed = dot_atom_text(b"quentin.dufour.io abcdef");
        assert_eq!(
            parsed,
            Ok((&b" abcdef"[..], DotAtom("quentin.dufour.io".into())))
        );
    }

    /// `dot_atom` additionally consumes the CFWS on both sides.
    #[test]
    fn test_dot_atom() {
        let parsed = dot_atom(b"   (skip) quentin.dufour.io abcdef");
        assert_eq!(
            parsed,
            Ok((&b"abcdef"[..], DotAtom("quentin.dufour.io".into())))
        );
    }
}