mail_chars/lib.rs
//! Provides char classification for mail related grammar parts/charsets, i.e.
//! whether a given char belongs to the characters valid in atext, ctext, dtext, token etc.
3//!
//! The `Charset` enum is used to determine which set of characters is used. To
5//! check if a `char` is in that set either use `Charset::contains(&self,
6//! char)` or use `ch.is(charset)` which is provided through the `CharMatchExt`
7//! extension trait.
8//!
9//! # Why `Ws` is merged into `CText`, `DText` and `QText`
10//!
11//! Any grammar part in which `qtext`/`ctext`/`dtext` is used is in a form
12//! which 1. repeats 2. prepends `FWS` in the repeating part.
13//!
14//! Which means any parser would have to parse for chars which are
15//! `qtext`/`ctext`/`dtext` OR `ws` (and special handling if it hits another
16//! character like `"\r"` indicating the start of a soft line break etc.).
17//!
18//! For example wrt. `dtext` the grammar is `... *([FWS] dtext) [FWS] ...`
19//! which you can validate by parsing chars which are either `dtext` or `ws`
20//! and if you hit a `"\r"` (which btw. is _not_ in `ws`) you make sure it's
21//! followed by `"\n "` or `"\n\t"` and then you continue with parsing.
22//!
23//! # Alternative interface
24//!
25//! All enum variants are re-exported under a module with the name of the rfc where
26//! they are specified. E.g. `Charset::CText` is also available as `rfc5322::CText`.
27//!
28//! # Example
29//!
30//! ```rust
31//! extern crate mail_chars;
32//! use mail_chars::{Charset, rfc5322, rfc2045, CharMatchExt};
33//!
34//! fn main() {
35//! assert!(Charset::AText.contains('d'));
36//! assert!('d'.is(Charset::AText));
37//! assert!('d'.is(rfc5322::AText));
38//!
39//! // `rfc*::*` are just reexports grouped by RFC.
40//! assert_eq!(Charset::Token, rfc2045::Token);
41//!
//! // If we want to test for more than one char set we can use lookup.
43//! let res = Charset::lookup('.');
//! // This has the benefit that there is an `is_ascii` method.
45//! assert!(res.is_ascii());
46//! assert!(res.is(rfc2045::Token));
47//! assert!(res.is(rfc5322::CTextWs));
48//! assert!(!res.is(rfc5322::AText));
49//! }
50//! ```
51
52mod lookup;
53
54/// An enum for the charsets represented through an internal lookup table.
55#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Hash, Debug)]
56#[repr(u8)]
57pub enum Charset {
58 /// `qtext` + `ws`, basically anything which can appear in a quoted string
59 /// which is not a quoted-pair.
60 ///
61 /// Note: this is equivalent to rfc7230 `qdtext`, excluding the obsolete
62 /// part of the grammar.
63 /// Note: the obsolete part of the grammar is excluded
64 ///
65 /// **rfc: 5322**
66 QTextWs = lookup::QC,
67
68 /// `ctext` + `ws`
69 ///
70 /// Note: the obsolete part of the grammar is excluded.
71 ///
72 /// **rfc: 5322**
73 CTextWs = lookup::CT,
74
75 /// `dtext` + `ws`
76 ///
77 /// **rfc: 5322**
78 DTextWs = lookup::DT,
79
80 /// `atext`
81 ///
82 /// **rfc: 5322**
83 AText = lookup::AT,
84
85 /// Restricted-name-char subset of rfc2045 token with which IETF-tokens and
86 /// IANA-tokens have to comply.
87 ///
88 /// **rfc: 6838** (related rfc2045)
89 RestrictedToken = lookup::RT,
90
91 /// `token`
92 ///
93 /// Note> there are multiple mail related definitions of token, this one is the rfc2045 based
94 /// one.
95 ///
96 /// **rfc: 2045**
97 ///
98 Token = lookup::TO,
99
100 /// obs-NO-WS-CTL
101 ///
102 /// **rfc: 5322**
103 ///
104 /// combine with CText or QText to support the obsolete part of the grammar
105 ///
106 /// # Example
107 ///
108 /// ```
109 /// use mail_chars::{Charset, CharMatchExt, rfc5322};
110 ///
111 /// fn is_ctext_with_obs(ch: char) -> bool {
112 /// let res = Charset::lookup(ch);
113 /// res.is(rfc5322::CTextWs) || res.is(rfc5322::ObsNoWsCtl)
114 /// }
115 ///
116 /// assert!("\x01 comment \x02".chars().all(is_ctext_with_obs));
117 /// ```
118 ObsNoWsCtl = lookup::NC,
119
120 /// `token`
121 ///
122 /// **rfc: 7230**
123 ///
124 /// Token as defined in rfc7230 (HTTP/1.1) not directly a mail grammar, but
125 /// relevant for shared utilities like e.g. anything Media Type (i.e.
126 /// MIME-Type/Content-Type) related.
127 Rfc7230Token = lookup::HT
128}
129
130impl Charset {
131 /// Returns true if the `char` is part of this set of chars.
132 #[inline]
133 pub fn contains(&self, ch: char) -> bool {
134 self.contains_lookup(ch, false)
135 }
136
137 /// Returns true if the `char` is part of the set of chars or not an ascii
138 /// char.
139 ///
140 /// This is mainly meant to be used in combination with rfc6532 which
141 /// extends all `*text` grammar parts/character sets to contain any
142 /// non-us-ascii character.
143 #[inline]
144 pub fn contains_or_non_ascii(&self, ch: char) -> bool {
145 self.contains_lookup(ch, true)
146 }
147
148 fn contains_lookup(&self, ch: char, out_of_table_value: bool) -> bool {
149 let index = ch as u32;
150 if index < 0x80 {
151 lookup::US_ASCII_LOOKUP[index as usize] & (*self as u8) != 0
152 } else {
153 out_of_table_value
154 }
155 }
156
157 /// Uses the internal lookup table to classify a `char`.
158 pub fn lookup(ch: char) -> LookupResult {
159 let index = ch as u32;
160 if index < 0x80 {
161 LookupResult(Some(lookup::US_ASCII_LOOKUP[index as usize]))
162 } else {
163 LookupResult(None)
164 }
165 }
166}
167
168mod sealed{ pub trait Seal {} }
169pub use self::sealed::Seal;
170pub trait CharMatchExt: Seal+Copy {
171 /// Returns true if the char is a char belonging to the given charset.
172 fn is(self, charset: Charset) -> bool;
173
174 /// Returns true if the char is a char belonging to the given charset or a
175 /// non-us-ascii char.
176 fn is_inkl_non_ascii(self, charset: Charset) -> bool;
177}
178
179impl Seal for char {}
180impl CharMatchExt for char {
181 #[inline]
182 fn is(self, charset: Charset) -> bool {
183 charset.contains(self)
184 }
185 #[inline]
186 fn is_inkl_non_ascii(self, charset: Charset) -> bool {
187 charset.contains_or_non_ascii(self)
188 }
189}
190
/// The outcome of looking up a char in the internal table.
///
/// It holds the table entry of the char if there is one and `None`
/// otherwise. As `CharMatchExt` is implemented for it, it can be used the
/// same way as a char (wrt. this trait).
///
/// # Example
///
/// ```
/// use mail_chars::{Charset, CharMatchExt};
///
/// let res = Charset::lookup('↓');
/// assert!(!res.is_ascii());
/// assert!(!res.is(Charset::QTextWs));
/// assert!(res.is_inkl_non_ascii(Charset::QTextWs));
///
/// let res = Charset::lookup('<');
/// assert!(res.is_ascii());
/// assert!(res.is(Charset::QTextWs));
/// assert!(res.is(Charset::CTextWs));
/// assert!(!res.is(Charset::AText));
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct LookupResult(Option<u8>);
214
215impl LookupResult {
216
217 pub fn is_ascii(&self) -> bool {
218 self.0.is_some()
219 }
220
221 fn lookup_contains(&self, charset: Charset, default: bool) -> bool {
222 self.0.map(|res| {
223 res & (charset as u8) != 0
224 }).unwrap_or(default)
225 }
226}
227
228impl Seal for LookupResult {}
229impl CharMatchExt for LookupResult {
230 #[inline]
231 fn is(self, charset: Charset) -> bool {
232 self.lookup_contains(charset, false)
233 }
234 #[inline]
235 fn is_inkl_non_ascii(self, charset: Charset) -> bool {
236 self.lookup_contains(charset, true)
237 }
238}
239
240/// Re-export of all charsets (Charset::... variants) from rfc5322.
241pub mod rfc5322 {
242 pub use super::Charset::{QTextWs, CTextWs, AText, DTextWs, ObsNoWsCtl};
243}
244
245/// Re-export of all charsets (Charset::... variants) from rfc2045.
246pub mod rfc2045 {
247 pub use super::Charset::Token;
248}
249
250/// Re-export of all charsets (Charset::... variants) from rfc6838.
251pub mod rfc6838 {
252 pub use super::Charset::RestrictedToken;
253}
254
255/// Re-export of all charsets (Charset::... variants) from rfc7320.
256///
257/// Note that `QTextWs` (rfc5322) is exported as `Qdtext` (rfc7320) as they are
258/// the equivalent (if the obsolete part of both grammars is excluded).
259pub mod rfc7230 {
260 pub use super::Charset::{
261 QTextWs as QDText,
262 Rfc7230Token as Token
263 };
264
265}
266
267
/// Returns true if `ch` is a whitespace char, i.e. a space or a horizontal tab.
///
/// Line break chars like `'\r'` and `'\n'` are not whitespace in this sense.
#[inline]
pub fn is_ws(ch: char) -> bool {
    match ch {
        ' ' | '\t' => true,
        _ => false,
    }
}
272
/// Returns true if `ch` is a visible us-ascii char, i.e. lies in the range
/// from `'!'` (0x21) up to and including `'~'` (0x7e).
///
/// The space char, control chars and all non-ascii chars yield false.
#[inline]
pub fn is_vchar(ch: char) -> bool {
    let code_point = ch as u32;
    code_point >= 0x21 && code_point <= 0x7e
}
277
#[cfg(test)]
mod test {
    use super::{is_vchar, CharMatchExt, Charset};

    /// A looked up ascii char answers both query flavours from the table.
    #[test]
    fn lookup_result_ascii() {
        let looked_up = Charset::lookup('<');
        assert!(looked_up.is_ascii());

        for charset in [Charset::QTextWs, Charset::CTextWs].iter() {
            assert!(looked_up.is(*charset));
            assert!(looked_up.is_inkl_non_ascii(*charset));
        }

        assert!(!looked_up.is(Charset::AText));
        assert!(!looked_up.is_inkl_non_ascii(Charset::AText));
    }

    /// A looked up non-ascii char only matches with the `inkl_non_ascii` flavour.
    #[test]
    fn lookup_result_utf8() {
        let looked_up = Charset::lookup('↓');
        assert!(!looked_up.is_ascii());
        assert!(!looked_up.is(Charset::QTextWs));
        assert!(looked_up.is_inkl_non_ascii(Charset::QTextWs));
    }

    /// Only a "general" check that the `char` impl works. Which chars belong
    /// to which set of chars is checked by the tests of the lookup module.
    #[test]
    fn is_part_of_charset() {
        let in_table = '<';
        assert!(in_table.is(Charset::QTextWs));
        assert!(in_table.is_inkl_non_ascii(Charset::QTextWs));
        assert!(!in_table.is(Charset::AText));
        assert!(!in_table.is_inkl_non_ascii(Charset::AText));

        // The first char which is not covered by the table any more.
        let out_of_table = '\u{80}';
        assert!(!out_of_table.is(Charset::CTextWs));
        assert!(out_of_table.is_inkl_non_ascii(Charset::CTextWs));
    }

    /// Checks both ends of the vchar range and their direct neighbours.
    #[test]
    fn is_vchar_boundaries() {
        let (below, lowest) = (' ', '!');
        let (highest, above) = ('~', '\u{7f}');

        // Make sure the neighbours really are adjacent to the range ends.
        assert_eq!(lowest as u32 - 1, below as u32);
        assert_eq!(highest as u32 + 1, above as u32);

        assert!(is_vchar(lowest));
        assert!(!is_vchar(below));
        assert!(is_vchar(highest));
        assert!(!is_vchar(above));
    }
}
331}