email_address_list/parser.rs
1use lazy_static::*;
2use pest::iterators::{Pair, Pairs};
3use pest::Parser as PestParser;
4use pest_derive::Parser;
5use regex::Regex;
6
7use crate::error::Error::*;
8use crate::error::*;
9
10use std::convert::AsRef;
11
12use crate::address_list::*;
13
lazy_static! {
    // Fallback splitter for comma separated entries. Every alternative requires an `@`, so only
    // fragments that look like they contain an address are matched:
    //   1. optional leading text, a quoted string containing a backslash escape, then text
    //      containing `@`;
    //   2. optional leading text, a (lazily matched) quoted string, then text containing `@`;
    //   3. a run without commas or quotes that contains `@`.
    static ref CSV: Regex = Regex::new(
        r#"[^",]*"[^"\\]*\\.[^"\\]+"[^,"]+@[^,"]+|[^",]*".*?"[^,"]*@[^,"]*|[^,"]+@[^,"]+"#,
    )
    .unwrap();
    // Fallback splitter for semicolon separated entries: either a quoted string (optionally
    // preceded by one character and followed by a run without `;` or `"`), or a possibly empty
    // run of characters that are neither `;` nor `"`.
    static ref SSV: Regex = Regex::new(r#"[^;"]?".*?"[^;"]*|[^;"]*"#).unwrap();
}
21
// Parser generated by `pest_derive` from the permissive grammar file; the derive also provides
// the `Rule` enum that the functions in this file match on.
#[derive(Parser)]
#[grammar = "../grammars/permissive.pest"]
struct Parser;
25
26fn parse_contact_pair(pair: Pair<'_, Rule>) -> Option<Result<Contact>> {
27 let mut c: EmailContact = Default::default();
28 for inner in pair.into_inner() {
29 match inner.as_rule() {
30 Rule::malformed | Rule::malformed_comment_name => c = c.set_name(inner.as_str()),
31 Rule::name => match inner.into_inner().next() {
32 Some(s) => c = c.set_name(s.as_str()),
33 None => return Some(Err(invalid_empty("name"))),
34 },
35 Rule::email | Rule::mailbox => c = c.set_email(inner.as_str()),
36 Rule::email_angle | Rule::mailbox_angle => match inner.into_inner().next() {
37 Some(s) => c = c.set_email(s.as_str()),
38 None => {
39 return Some(Err(invalid_empty("email_angle or mailbox_angle")));
40 }
41 },
42 Rule::comment => c = c.set_comment(inner.as_str()),
43 Rule::garbage => {
44 let garbage = inner.as_str();
45 if garbage.is_empty() {
46 return None;
47 }
48 return Some(Ok(GarbageContact::new(garbage).into()));
49 }
50 Rule::garbage_nongreedy => {
51 let garbage = inner.as_str().trim();
52 // garbage_nongreedy is special in the sense that we know that a mailbox
53 // precedes it - the only occurance of this I've seen was when domain names were
54 // separated by whitespace
55 let new_email = format!("{}{}", c.email().unwrap(), garbage);
56 c = c.set_email(new_email);
57 }
58 _ => return Some(Err(invalid_nesting("contact"))),
59 }
60 }
61 Some(Ok(c.into()))
62}
63
64fn parse_pairs(pairs: Pairs<'_, Rule>) -> Result<AddressList> {
65 let mut contacts = Contacts::new();
66 for pair in pairs {
67 match pair.as_rule() {
68 Rule::group => {
69 let mut group: Group = Default::default();
70 for inner in pair.into_inner() {
71 match inner.as_rule() {
72 Rule::name => {
73 group.name = inner.into_inner().as_str().to_string();
74 }
75 Rule::contact_list => {
76 group.contacts = inner
77 .into_inner()
78 .filter_map(parse_contact_pair)
79 .collect::<Result<Contacts>>()?
80 }
81 _ => return Err(invalid_nesting("group")),
82 }
83 }
84 return Ok(AddressList::from(group));
85 }
86 Rule::address_list => return parse_pairs(pair.into_inner()),
87 Rule::contact_list => {
88 contacts = pair
89 .into_inner()
90 .filter_map(parse_contact_pair)
91 .collect::<Result<Contacts>>()?
92 }
93 _ => {
94 return Err(UnexpectedError(format!(
95 "{:?} can't be parsed with this function",
96 pair.as_rule(),
97 )));
98 }
99 }
100 }
101 Ok(AddressList::from(contacts))
102}
103
104fn check_empty<T>(address_list: &T) -> Result<&str>
105where
106 T: AsRef<str>,
107 T: ?Sized,
108{
109 let input = address_list.as_ref().trim();
110 match input {
111 "" => Err(Error::Empty),
112 _ => Ok(input),
113 }
114}
115
116/// Get an [`AddressList`] from a string
117///
118/// Tries its best to come up with the most reasonable parsed address list for a
119/// given (potentially spec-violating) input.
120///
121/// If there's nothing to parse (i.e. an empty string), this function "fails"
122/// with [`Error::Empty`], which is essentially equivalent to a `None`, but
123/// avoids nesting types.
124///
125/// # Examples
126///
127/// Named malformed group:
128///
129/// ```rust
130/// # use email_address_list::*;
131/// let input = r#"Kikundi: , "Jina" (Maoni) <jina@example.org>, baruapepe@example.org;"#;
132///
133/// let result = parse_address_list(input).unwrap();
134///
135/// let manual: AddressList = Group::new("Kikundi").set_contacts(vec![
136/// Contact::new("jina@example.org").set_name("Jina").set_comment("Maoni"),
137/// Contact::new("baruapepe@example.org")
138/// ]).into();
139///
140/// assert!(result.deep_eq(&manual));
141/// ```
142///
143/// Multiple contacts, some of which may be malformed:
144///
145/// ```rust
146/// # use email_address_list::*;
147/// let input = r#"Przykład <przykład@example.org>, Példa, Rosszformázott <példa@example.org>"#;
148///
149/// let manual: AddressList = vec![
150/// Contact::new("przykład@example.org").set_name("Przykład"),
151/// Contact::new("példa@example.org").set_name("Példa, Rosszformázott"),
152/// ].into();
153///
154/// println!("{:?}", manual);
155///
156/// let result = parse_address_list(input).unwrap();
157///
158/// assert!(result.deep_eq(&manual));
159/// ```
160///
161/// Supplying an empty string:
162///
163/// ```rust
164/// # use email_address_list::*;
165/// match parse_address_list("") {
166/// Err(error::Error::Empty) => assert!(true),
167/// Ok(_) | Err(_) => assert!(false),
168/// };
169/// ```
170///
171/// [`AddressList`]: enum.AddressList.html
172/// [`Error::Empty`]: error/enum.Error.html
173pub fn parse_address_list<T>(address_list: &T) -> Result<AddressList>
174where
175 T: AsRef<str>,
176 T: ?Sized,
177{
178 let input = check_empty(address_list)?;
179 let mut output = parse_pairs(Parser::parse(Rule::address_list, input)?)?;
180
181 /// Make estimation of correct parsing easier
182 ///
183 /// Remove all common characters, the lengths of this output should be roughly equal for what
184 /// we put in and what we plan to put out.
185 fn normalise(input: &str) -> String {
186 [",", "\"", "'", "<", ">"]
187 .iter()
188 .fold(input.to_string(), |o, p| o.replace(p, ""))
189 .replace(char::is_whitespace, "")
190 }
191
192 /// Comma separated values optimised for the way they are used in address lists
193 fn csv(input: &str) -> Vec<String> {
194 CSV.captures_iter(input)
195 .filter_map(|c| {
196 if let Some(c) = c.get(0) {
197 return Some(c.as_str().into());
198 }
199 None
200 })
201 .collect()
202 }
203
204 /// Break apart undelimited addresses if they are present and put them in the appropriate place
205 /// of the list
206 // TODO add a way to fish out both addresses from something like:
207 // one@example.org Firstname Surname <two@example.org>
208 fn expand_undelimited(mut input: Vec<String>) -> Vec<String> {
209 let mut output = <Vec<String>>::new();
210 for (r, i) in (0..input.len()).enumerate() {
211 let j = &input[i - r];
212 if j.contains('>') {
213 for s in j.split('>') {
214 if s.contains('<') {
215 output.push(format!("{}>", s));
216 } else if s.is_empty() {
217 // don't do anything with empty bits
218 } else {
219 output.push(s.into());
220 }
221 }
222 input.remove(i - r);
223 } else {
224 output.push(input.remove(i - r));
225 }
226 }
227 output
228 }
229
230 fn add_absent_contacts(input: &[String], output: &mut AddressList) -> Result<()> {
231 for contact in input.iter().map(parse_contact) {
232 let contact = contact?;
233 if let Contact::Email(_) = contact {
234 if !output.contains(&contact) {
235 output.add(contact);
236 }
237 }
238 }
239 Ok(())
240 }
241
242 let input_n = normalise(input);
243 let output_n = normalise(&format!("{}", output));
244
245 if input_n.len() > output_n.len() {
246 let input_c = csv(input);
247 // Due to the way some headers are malformed, the grammar cannot account for all ways in
248 // which data out there is separated, This check is for an educated guess about
249 // whether we have a ';' separated address list, and returns it if necessary
250 if let AddressList::Contacts(_) = output {
251 if input_n.contains(';') {
252 let sc_input = SSV.captures_iter(input).fold(String::from(""), |mut f, c| {
253 if let Some(cpt) = c.get(0) {
254 f.push_str(cpt.as_str());
255 f.push(',');
256 }
257 f
258 });
259 let mut sc_output = parse_pairs(Parser::parse(
260 Rule::address_list,
261 sc_input.trim_end_matches(','),
262 )?)?;
263 // If the semi-colon delimited output is bigger than the regular one we're likely
264 // a completely semi-colon separated list, however, we're still trying to find
265 // as many contacts as possible by looking for undelimited ones
266 if sc_output.len() > output.len() && sc_output.len() > input_c.len() {
267 let sc_output_n = normalise(&format!("{}", sc_output));
268 if input_n.len() > sc_output_n.len() {
269 let sc_input_c_a = expand_undelimited(csv(&sc_input));
270 add_absent_contacts(&sc_input_c_a, &mut sc_output)?;
271 }
272 return Ok(sc_output);
273 }
274 }
275 }
276
277 // Last resort, deal with split commas as individual contacts and build an AddressList from
278 // that
279 let input_c_a = expand_undelimited(input_c);
280 if input_c_a.len() > output.len() {
281 add_absent_contacts(&input_c_a, &mut output)?;
282 }
283 }
284 Ok(output)
285}
286
287/// Parse only a single [`Contact`], ignore the rest
288///
289/// Just like [`parse_address_list`], this function "fails" with
290/// [`Error::Empty`] when the supplied string is empty.
291///
292/// # Examples
293///
294/// Single contact:
295///
296/// ```rust
297/// # use email_address_list::*;
298/// let single = parse_contact("<retpoŝto+kontakto@example.org>").unwrap();
299///
300/// assert!(single.deep_eq(&Contact::new("retpoŝto+kontakto@example.org")));
301/// ```
302///
303/// Multiple contacts:
304///
305/// ```rust
306/// # use email_address_list::*;
307/// let multiple = parse_contact("courriel@example.org, exemple@example.org").unwrap();
308///
309/// assert!(multiple.deep_eq(&Contact::new("courriel@example.org")));
310/// ```
311///
312/// Not a contact:
313///
314/// ```rust
315/// # use email_address_list::*;
316/// match parse_contact("Mist").unwrap() {
317/// Contact::Garbage(_) => assert!(true),
318/// Contact::Email(_) => assert!(false),
319/// }
320/// ```
321///
322/// Empty input:
323///
324/// ```rust
325/// # use email_address_list::*;
326/// match parse_contact(",") {
327/// Err(error::Error::Empty) => assert!(true),
328/// Ok(_) | Err(_) => assert!(false),
329/// }
330/// ```
331///
332/// [`Contact`]: enum.Contact.html
333/// [`parse_address_list`]: fn.parse_address_list.html
334/// [`Error::Empty`]: error/enum.Error.html
335pub fn parse_contact<T>(contact: &T) -> Result<Contact>
336where
337 T: AsRef<str>,
338 T: ?Sized,
339{
340 let contact = check_empty(contact)?;
341 let mut pairs = Parser::parse(Rule::contact, contact)?;
342 if let Some(contact) = pairs.next() {
343 if let Some(c) = parse_contact_pair(contact) {
344 return c;
345 }
346 }
347 Err(Error::Empty)
348}