email_address_list/
parser.rs

1use lazy_static::*;
2use pest::iterators::{Pair, Pairs};
3use pest::Parser as PestParser;
4use pest_derive::Parser;
5use regex::Regex;
6
7use crate::error::Error::*;
8use crate::error::*;
9
10use std::convert::AsRef;
11
12use crate::address_list::*;
13
14lazy_static! {
15    static ref CSV: Regex = Regex::new(
16        r#"[^",]*"[^"\\]*\\.[^"\\]+"[^,"]+@[^,"]+|[^",]*".*?"[^,"]*@[^,"]*|[^,"]+@[^,"]+"#,
17    )
18    .unwrap();
19    static ref SSV: Regex = Regex::new(r#"[^;"]?".*?"[^;"]*|[^;"]*"#).unwrap();
20}
21
/// Parser type derived by `pest_derive` from the grammar file named in the `grammar` attribute.
///
/// NOTE(review): the derive presumably also supplies the `Rule` enum that the functions in this
/// file match on - confirm against the grammar.
#[derive(Parser)]
#[grammar = "../grammars/permissive.pest"]
struct Parser;
25
26fn parse_contact_pair(pair: Pair<'_, Rule>) -> Option<Result<Contact>> {
27    let mut c: EmailContact = Default::default();
28    for inner in pair.into_inner() {
29        match inner.as_rule() {
30            Rule::malformed | Rule::malformed_comment_name => c = c.set_name(inner.as_str()),
31            Rule::name => match inner.into_inner().next() {
32                Some(s) => c = c.set_name(s.as_str()),
33                None => return Some(Err(invalid_empty("name"))),
34            },
35            Rule::email | Rule::mailbox => c = c.set_email(inner.as_str()),
36            Rule::email_angle | Rule::mailbox_angle => match inner.into_inner().next() {
37                Some(s) => c = c.set_email(s.as_str()),
38                None => {
39                    return Some(Err(invalid_empty("email_angle or mailbox_angle")));
40                }
41            },
42            Rule::comment => c = c.set_comment(inner.as_str()),
43            Rule::garbage => {
44                let garbage = inner.as_str();
45                if garbage.is_empty() {
46                    return None;
47                }
48                return Some(Ok(GarbageContact::new(garbage).into()));
49            }
50            Rule::garbage_nongreedy => {
51                let garbage = inner.as_str().trim();
52                // garbage_nongreedy is special in the sense that we know that a mailbox
53                // precedes it - the only occurance of this I've seen was when domain names were
54                // separated by whitespace
55                let new_email = format!("{}{}", c.email().unwrap(), garbage);
56                c = c.set_email(new_email);
57            }
58            _ => return Some(Err(invalid_nesting("contact"))),
59        }
60    }
61    Some(Ok(c.into()))
62}
63
64fn parse_pairs(pairs: Pairs<'_, Rule>) -> Result<AddressList> {
65    let mut contacts = Contacts::new();
66    for pair in pairs {
67        match pair.as_rule() {
68            Rule::group => {
69                let mut group: Group = Default::default();
70                for inner in pair.into_inner() {
71                    match inner.as_rule() {
72                        Rule::name => {
73                            group.name = inner.into_inner().as_str().to_string();
74                        }
75                        Rule::contact_list => {
76                            group.contacts = inner
77                                .into_inner()
78                                .filter_map(parse_contact_pair)
79                                .collect::<Result<Contacts>>()?
80                        }
81                        _ => return Err(invalid_nesting("group")),
82                    }
83                }
84                return Ok(AddressList::from(group));
85            }
86            Rule::address_list => return parse_pairs(pair.into_inner()),
87            Rule::contact_list => {
88                contacts = pair
89                    .into_inner()
90                    .filter_map(parse_contact_pair)
91                    .collect::<Result<Contacts>>()?
92            }
93            _ => {
94                return Err(UnexpectedError(format!(
95                    "{:?} can't be parsed with this function",
96                    pair.as_rule(),
97                )));
98            }
99        }
100    }
101    Ok(AddressList::from(contacts))
102}
103
104fn check_empty<T>(address_list: &T) -> Result<&str>
105where
106    T: AsRef<str>,
107    T: ?Sized,
108{
109    let input = address_list.as_ref().trim();
110    match input {
111        "" => Err(Error::Empty),
112        _ => Ok(input),
113    }
114}
115
116/// Get an [`AddressList`] from a string
117///
118/// Tries its best to come up with the most reasonable parsed address list for a
119/// given (potentially spec-violating) input.
120///
121/// If there's nothing to parse (i.e. an empty string), this function "fails"
122/// with [`Error::Empty`], which is essentially equivalent to a `None`, but
123/// avoids nesting types.
124///
125/// # Examples
126///
127/// Named malformed group:
128///
129/// ```rust
130/// # use email_address_list::*;
131/// let input = r#"Kikundi:  ,  "Jina"  (Maoni) <jina@example.org>, baruapepe@example.org;"#;
132///
133/// let result = parse_address_list(input).unwrap();
134///
135/// let manual: AddressList = Group::new("Kikundi").set_contacts(vec![
136///     Contact::new("jina@example.org").set_name("Jina").set_comment("Maoni"),
137///     Contact::new("baruapepe@example.org")
138/// ]).into();
139///
140/// assert!(result.deep_eq(&manual));
141/// ```
142///
143/// Multiple contacts, some of which may be malformed:
144///
145/// ```rust
146/// # use email_address_list::*;
147/// let input = r#"Przykład <przykład@example.org>, Példa, Rosszformázott <példa@example.org>"#;
148///
149/// let manual: AddressList = vec![
150///     Contact::new("przykład@example.org").set_name("Przykład"),
151///     Contact::new("példa@example.org").set_name("Példa, Rosszformázott"),
152/// ].into();
153///
154/// println!("{:?}", manual);
155///
156/// let result = parse_address_list(input).unwrap();
157///
158/// assert!(result.deep_eq(&manual));
159/// ```
160///
161/// Supplying an empty string:
162///
163/// ```rust
164/// # use email_address_list::*;
165/// match parse_address_list("") {
166///     Err(error::Error::Empty) => assert!(true),
167///     Ok(_) | Err(_) => assert!(false),
168/// };
169/// ```
170///
171/// [`AddressList`]: enum.AddressList.html
172/// [`Error::Empty`]: error/enum.Error.html
173pub fn parse_address_list<T>(address_list: &T) -> Result<AddressList>
174where
175    T: AsRef<str>,
176    T: ?Sized,
177{
178    let input = check_empty(address_list)?;
179    let mut output = parse_pairs(Parser::parse(Rule::address_list, input)?)?;
180
181    /// Make estimation of correct parsing easier
182    ///
183    /// Remove all common characters, the lengths of this output should be roughly equal for what
184    /// we put in and what we plan to put out.
185    fn normalise(input: &str) -> String {
186        [",", "\"", "'", "<", ">"]
187            .iter()
188            .fold(input.to_string(), |o, p| o.replace(p, ""))
189            .replace(char::is_whitespace, "")
190    }
191
192    /// Comma separated values optimised for the way they are used in address lists
193    fn csv(input: &str) -> Vec<String> {
194        CSV.captures_iter(input)
195            .filter_map(|c| {
196                if let Some(c) = c.get(0) {
197                    return Some(c.as_str().into());
198                }
199                None
200            })
201            .collect()
202    }
203
204    /// Break apart undelimited addresses if they are present and put them in the appropriate place
205    /// of the list
206    // TODO add a way to fish out both addresses from something like:
207    // one@example.org Firstname Surname <two@example.org>
208    fn expand_undelimited(mut input: Vec<String>) -> Vec<String> {
209        let mut output = <Vec<String>>::new();
210        for (r, i) in (0..input.len()).enumerate() {
211            let j = &input[i - r];
212            if j.contains('>') {
213                for s in j.split('>') {
214                    if s.contains('<') {
215                        output.push(format!("{}>", s));
216                    } else if s.is_empty() {
217                        // don't do anything with empty bits
218                    } else {
219                        output.push(s.into());
220                    }
221                }
222                input.remove(i - r);
223            } else {
224                output.push(input.remove(i - r));
225            }
226        }
227        output
228    }
229
230    fn add_absent_contacts(input: &[String], output: &mut AddressList) -> Result<()> {
231        for contact in input.iter().map(parse_contact) {
232            let contact = contact?;
233            if let Contact::Email(_) = contact {
234                if !output.contains(&contact) {
235                    output.add(contact);
236                }
237            }
238        }
239        Ok(())
240    }
241
242    let input_n = normalise(input);
243    let output_n = normalise(&format!("{}", output));
244
245    if input_n.len() > output_n.len() {
246        let input_c = csv(input);
247        // Due to the way some headers are malformed, the grammar cannot account for all ways in
248        // which data out there is separated, This check is for an educated guess about
249        // whether we have a ';' separated address list, and returns it if necessary
250        if let AddressList::Contacts(_) = output {
251            if input_n.contains(';') {
252                let sc_input = SSV.captures_iter(input).fold(String::from(""), |mut f, c| {
253                    if let Some(cpt) = c.get(0) {
254                        f.push_str(cpt.as_str());
255                        f.push(',');
256                    }
257                    f
258                });
259                let mut sc_output = parse_pairs(Parser::parse(
260                    Rule::address_list,
261                    sc_input.trim_end_matches(','),
262                )?)?;
263                // If the semi-colon delimited output is bigger than the regular one we're likely
264                // a completely semi-colon separated list, however, we're still trying to find
265                // as many contacts as possible by looking for undelimited ones
266                if sc_output.len() > output.len() && sc_output.len() > input_c.len() {
267                    let sc_output_n = normalise(&format!("{}", sc_output));
268                    if input_n.len() > sc_output_n.len() {
269                        let sc_input_c_a = expand_undelimited(csv(&sc_input));
270                        add_absent_contacts(&sc_input_c_a, &mut sc_output)?;
271                    }
272                    return Ok(sc_output);
273                }
274            }
275        }
276
277        // Last resort, deal with split commas as individual contacts and build an AddressList from
278        // that
279        let input_c_a = expand_undelimited(input_c);
280        if input_c_a.len() > output.len() {
281            add_absent_contacts(&input_c_a, &mut output)?;
282        }
283    }
284    Ok(output)
285}
286
287/// Parse only a single [`Contact`], ignore the rest
288///
289/// Just like [`parse_address_list`], this function "fails" with
290/// [`Error::Empty`] when the supplied string is empty.
291///
292/// # Examples
293///
294/// Single contact:
295///
296/// ```rust
297/// # use email_address_list::*;
298/// let single = parse_contact("<retpoŝto+kontakto@example.org>").unwrap();
299///
300/// assert!(single.deep_eq(&Contact::new("retpoŝto+kontakto@example.org")));
301/// ```
302///
303/// Multiple contacts:
304///
305/// ```rust
306/// # use email_address_list::*;
307/// let multiple = parse_contact("courriel@example.org, exemple@example.org").unwrap();
308///
309/// assert!(multiple.deep_eq(&Contact::new("courriel@example.org")));
310/// ```
311///
312/// Not a contact:
313///
314/// ```rust
315/// # use email_address_list::*;
316/// match parse_contact("Mist").unwrap() {
317///     Contact::Garbage(_) => assert!(true),
318///     Contact::Email(_) => assert!(false),
319/// }
320/// ```
321///
322/// Empty input:
323///
324/// ```rust
325/// # use email_address_list::*;
326/// match parse_contact(",") {
327///     Err(error::Error::Empty) => assert!(true),
328///     Ok(_) | Err(_) => assert!(false),
329/// }
330/// ```
331///
332/// [`Contact`]: enum.Contact.html
333/// [`parse_address_list`]: fn.parse_address_list.html
334/// [`Error::Empty`]: error/enum.Error.html
335pub fn parse_contact<T>(contact: &T) -> Result<Contact>
336where
337    T: AsRef<str>,
338    T: ?Sized,
339{
340    let contact = check_empty(contact)?;
341    let mut pairs = Parser::parse(Rule::contact, contact)?;
342    if let Some(contact) = pairs.next() {
343        if let Some(c) = parse_contact_pair(contact) {
344            return c;
345        }
346    }
347    Err(Error::Empty)
348}