mail_chars/
lib.rs

//! Provides char classification for mail related grammar parts/charsets, i.e.
//! whether a given char belongs to the characters valid in atext, ctext, dtext, token etc.
3//!
//! The `Charset` enum is used to select which set of characters to test
//! against. To check if a `char` is in that set either use
//! `Charset::contains(&self, char)` or use `ch.is(charset)`, which is provided
//! through the `CharMatchExt` extension trait.
8//!
9//! # Why `Ws` is merged into `CText`, `DText` and `QText`
10//!
11//! Any grammar part in which `qtext`/`ctext`/`dtext` is used is in a form
12//! which 1. repeats 2. prepends `FWS` in the repeating part.
13//!
14//! Which means any parser would have to parse for chars which are
15//! `qtext`/`ctext`/`dtext` OR `ws` (and special handling if it hits another
16//! character like `"\r"` indicating the start of a soft line break etc.).
17//!
18//! For example wrt. `dtext` the grammar is `... *([FWS] dtext) [FWS] ...`
19//! which you can validate by parsing chars which are either `dtext` or `ws`
20//! and if you hit a `"\r"` (which btw. is _not_ in `ws`) you make sure it's
21//! followed by `"\n "` or `"\n\t"` and then you continue with parsing.
22//!
23//! # Alternative interface
24//!
25//! All enum variants are re-exported under a module with the name of the rfc where
26//! they are specified. E.g. `Charset::CText` is also available as `rfc5322::CText`.
27//!
28//! # Example
29//!
30//! ```rust
31//! extern crate mail_chars;
32//! use mail_chars::{Charset, rfc5322, rfc2045, CharMatchExt};
33//!
34//! fn main() {
35//!     assert!(Charset::AText.contains('d'));
36//!     assert!('d'.is(Charset::AText));
37//!     assert!('d'.is(rfc5322::AText));
38//!
39//!     // `rfc*::*` are just reexports grouped by RFC.
40//!     assert_eq!(Charset::Token, rfc2045::Token);
41//!
//!     // If we want to test for more than one char set we can use lookup.
//!     let res = Charset::lookup('.');
//!     // This has the benefit that there is an `is_ascii` method.
45//!     assert!(res.is_ascii());
46//!     assert!(res.is(rfc2045::Token));
47//!     assert!(res.is(rfc5322::CTextWs));
48//!     assert!(!res.is(rfc5322::AText));
49//! }
50//! ```
51
52mod lookup;
53
/// An enum for the charsets represented through an internal lookup table.
///
/// The enum is `repr(u8)` and every variant takes its discriminant from a
/// constant of the `lookup` module. Membership tests cast the variant to `u8`
/// and AND it with the lookup table entry of the char in question.
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug)]
#[repr(u8)]
pub enum Charset {
    /// `qtext` + `ws`, basically anything which can appear in a quoted string
    /// which is not a quoted-pair.
    ///
    /// Note: this is equivalent to rfc7230 `qdtext`, excluding the obsolete
    ///       part of the grammar.
    ///
    /// Note: the obsolete part of the rfc5322 grammar is excluded, too.
    ///
    /// **rfc: 5322**
    QTextWs = lookup::QC,

    /// `ctext` + `ws`
    ///
    /// Note: the obsolete part of the grammar is excluded.
    ///
    /// **rfc: 5322**
    CTextWs = lookup::CT,

    /// `dtext` + `ws`
    ///
    /// **rfc: 5322**
    DTextWs = lookup::DT,

    /// `atext`
    ///
    /// **rfc: 5322**
    AText = lookup::AT,

    /// Restricted-name-char subset of rfc2045 token with which IETF-tokens and
    /// IANA-tokens have to comply.
    ///
    /// **rfc: 6838** (related rfc2045)
    RestrictedToken = lookup::RT,

    /// `token`
    ///
    /// Note: there are multiple mail related definitions of token, this one
    /// is the rfc2045 based one.
    ///
    /// **rfc: 2045**
    ///
    Token = lookup::TO,

    /// obs-NO-WS-CTL
    ///
    /// **rfc: 5322**
    ///
    /// Combine with `CTextWs` or `QTextWs` to support the obsolete part of
    /// the grammar.
    ///
    /// # Example
    ///
    /// ```
    /// use mail_chars::{Charset, CharMatchExt, rfc5322};
    ///
    /// fn is_ctext_with_obs(ch: char) -> bool {
    ///     let res = Charset::lookup(ch);
    ///     res.is(rfc5322::CTextWs) || res.is(rfc5322::ObsNoWsCtl)
    /// }
    ///
    /// assert!("\x01 comment \x02".chars().all(is_ctext_with_obs));
    /// ```
    ObsNoWsCtl = lookup::NC,

    /// `token`
    ///
    /// **rfc: 7230**
    ///
    /// Token as defined in rfc7230 (HTTP/1.1). It is not directly a mail
    /// grammar, but relevant for shared utilities like e.g. anything Media
    /// Type (i.e. MIME-Type/Content-Type) related.
    Rfc7230Token = lookup::HT
}
129
130impl Charset {
131    /// Returns true if the `char` is part of this set of chars.
132    #[inline]
133    pub fn contains(&self, ch: char) -> bool {
134        self.contains_lookup(ch, false)
135    }
136
137    /// Returns true if the `char` is part of the set of chars or not an ascii
138    /// char.
139    ///
140    /// This is mainly meant to be used in combination with rfc6532 which
141    /// extends all `*text` grammar parts/character sets to contain any
142    /// non-us-ascii character.
143    #[inline]
144    pub fn contains_or_non_ascii(&self, ch: char) -> bool {
145        self.contains_lookup(ch, true)
146    }
147
148    fn contains_lookup(&self, ch: char, out_of_table_value: bool) -> bool {
149        let index = ch as u32;
150        if index < 0x80 {
151            lookup::US_ASCII_LOOKUP[index as usize] & (*self as u8) != 0
152        } else {
153            out_of_table_value
154        }
155    }
156
157    /// Uses the internal lookup table to classify a `char`.
158    pub fn lookup(ch: char) -> LookupResult {
159        let index = ch as u32;
160        if index < 0x80 {
161            LookupResult(Some(lookup::US_ASCII_LOOKUP[index as usize]))
162        } else {
163            LookupResult(None)
164        }
165    }
166}
167
// Private module holding the `Seal` marker trait which `CharMatchExt`
// requires as a supertrait.
mod sealed{ pub trait Seal {} }
// NOTE(review): `Seal` is re-exported publicly here, so it can be named (and
// presumably implemented) from outside of this module, which may defeat the
// purpose of sealing `CharMatchExt` — confirm this is intended.
pub use self::sealed::Seal;
/// Extension trait for testing a value against a `Charset`.
///
/// In this file it is implemented for `char` and for `LookupResult`. Both
/// methods take `self` by value, which is why `Copy` is required.
pub trait CharMatchExt: Seal+Copy {
    /// Returns true if the char is a char belonging to the given charset.
    fn is(self, charset: Charset) -> bool;

    /// Returns true if the char is a char belonging to the given charset or a
    /// non-us-ascii char.
    ///
    /// ("inkl" is short for "inclusive", i.e. non-ascii chars are included.)
    fn is_inkl_non_ascii(self, charset: Charset) -> bool;
}
178
179impl Seal for char {}
180impl CharMatchExt for char {
181    #[inline]
182    fn is(self, charset: Charset) -> bool {
183        charset.contains(self)
184    }
185    #[inline]
186    fn is_inkl_non_ascii(self, charset: Charset) -> bool {
187        charset.contains_or_non_ascii(self)
188    }
189}
190
/// Represents the result of a lookup of a char.
///
/// `CharMatchExt` is implemented on it so that you can treat it the same
/// as a char (wrt. this trait).
///
/// Internally this wraps an `Option<u8>`: `Charset::lookup` stores
/// `Some(table entry)` for chars below `0x80` and `None` for all other chars.
/// Looking a char up once and then testing the result against multiple
/// charsets avoids repeating the table access.
///
/// # Example
///
/// ```
/// use mail_chars::{Charset, CharMatchExt};
///
/// let res = Charset::lookup('↓');
/// assert!(!res.is_ascii());
/// assert!(!res.is(Charset::QTextWs));
/// assert!(res.is_inkl_non_ascii(Charset::QTextWs));
///
/// let res = Charset::lookup('<');
/// assert!(res.is_ascii());
/// assert!(res.is(Charset::QTextWs));
/// assert!(res.is(Charset::CTextWs));
/// assert!(!res.is(Charset::AText));
/// ```
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
pub struct LookupResult(Option<u8>);
214
215impl LookupResult {
216
217    pub fn is_ascii(&self) -> bool {
218        self.0.is_some()
219    }
220
221    fn lookup_contains(&self, charset: Charset, default: bool) -> bool {
222        self.0.map(|res| {
223            res & (charset as u8) != 0
224        }).unwrap_or(default)
225    }
226}
227
228impl Seal for LookupResult {}
229impl CharMatchExt for LookupResult {
230    #[inline]
231    fn is(self, charset: Charset) -> bool {
232        self.lookup_contains(charset, false)
233    }
234    #[inline]
235    fn is_inkl_non_ascii(self, charset: Charset) -> bool {
236        self.lookup_contains(charset, true)
237    }
238}
239
/// Re-export of all charsets (`Charset::...` variants) from rfc5322.
///
/// This allows writing e.g. `rfc5322::AText` instead of `Charset::AText`;
/// both name the same enum variant.
pub mod rfc5322 {
    pub use super::Charset::{QTextWs, CTextWs, AText, DTextWs, ObsNoWsCtl};
}
244
/// Re-export of all charsets (`Charset::...` variants) from rfc2045.
///
/// `rfc2045::Token` names the same enum variant as `Charset::Token`.
pub mod rfc2045 {
    pub use super::Charset::Token;
}
249
/// Re-export of all charsets (`Charset::...` variants) from rfc6838.
///
/// `rfc6838::RestrictedToken` names the same enum variant as
/// `Charset::RestrictedToken`.
pub mod rfc6838 {
    pub use super::Charset::RestrictedToken;
}
254
/// Re-export of all charsets (`Charset::...` variants) from rfc7230.
///
/// Note that `QTextWs` (rfc5322) is exported as `QDText` (rfc7230) as they
/// are equivalent (if the obsolete part of both grammars is excluded), and
/// that `Rfc7230Token` is exported under the shorter name `Token`.
pub mod rfc7230 {
    pub use super::Charset::{
        QTextWs as QDText,
        Rfc7230Token as Token
    };

}
266
267
/// Returns true if `ch` is a space (`' '`) or a horizontal tab (`'\t'`).
///
/// No other char counts as whitespace here, in particular neither `'\r'`
/// nor `'\n'`.
#[inline]
pub fn is_ws(ch: char) -> bool {
    match ch {
        ' ' | '\t' => true,
        _ => false,
    }
}
272
/// Returns true if `ch` is a visible (printable, non-whitespace) us-ascii
/// char, i.e. if it lies in the inclusive range from `'!'` to `'~'`.
#[inline]
pub fn is_vchar(ch: char) -> bool {
    // '!' directly follows ' ', so `FIRST <= ch` is the same as `' ' < ch`.
    const FIRST: char = '!';
    const LAST: char = '~';
    FIRST <= ch && ch <= LAST
}
277
#[cfg(test)]
mod test {
    use super::{Charset, CharMatchExt, is_vchar};

    /// For a us-ascii char both trait methods give the same answer.
    #[test]
    fn lookup_result_ascii() {
        let looked_up = Charset::lookup('<');
        assert!(looked_up.is_ascii());

        for &member_of in &[Charset::QTextWs, Charset::CTextWs] {
            assert!(looked_up.is(member_of));
            assert!(looked_up.is_inkl_non_ascii(member_of));
        }

        assert!(!looked_up.is(Charset::AText));
        assert!(!looked_up.is_inkl_non_ascii(Charset::AText));
    }

    /// A non us-ascii char only matches through `is_inkl_non_ascii`.
    #[test]
    fn lookup_result_utf8() {
        let looked_up = Charset::lookup('↓');
        assert!(!looked_up.is_ascii());
        assert!(!looked_up.is(Charset::QTextWs));
        assert!(looked_up.is_inkl_non_ascii(Charset::QTextWs));
    }

    #[test]
    fn is_part_of_charset() {
        // This is only a smoke test of the `char` implementation. Which char
        // belongs to which charset is verified by the tests of the lookup
        // module.
        let ascii = '<';
        assert!(ascii.is(Charset::QTextWs));
        assert!(ascii.is_inkl_non_ascii(Charset::QTextWs));
        assert!(!ascii.is(Charset::AText));
        assert!(!ascii.is_inkl_non_ascii(Charset::AText));

        let beyond_table = '\u{80}';
        assert!(!beyond_table.is(Charset::CTextWs));
        assert!(beyond_table.is_inkl_non_ascii(Charset::CTextWs));
    }

    #[test]
    fn is_vchar_boundaries() {
        let (below, lowest) = (' ', '!');
        let (highest, above) = ('~', '\u{7f}');

        // Make sure the chars picked above really are direct neighbours.
        assert_eq!(lowest as u32 - 1, below as u32);
        assert_eq!(highest as u32 + 1, above as u32);

        assert!(is_vchar(lowest));
        assert!(!is_vchar(below));
        assert!(is_vchar(highest));
        assert!(!is_vchar(above));
    }
}