1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
//! # crumble
//!
//! A robust, minimal library for parsing MIME documents.
//!
//! Supports UTF-8, multipart documents, and nested documents. `crumble` assumes input is mostly
//! compliant and tries to parse input as best as possible.
//!
//! Output is a minimal AST-like structure, with no filtering. This structure should be further
//! processed to produce useful work. See for example [`crinkle`][1]. 
//!
//! # Example
//!
//! ```
//! use crumble::Message;
//! let message = load_mime_file(); // Example! The consumer must have some source of documents
//! let message = Message::new(&message)?; // Handle errors
//! ```
//! [1]: https://git.sr.ht/~happy_shredder/crinkle

#![feature(test)]
#[cfg(test)] mod tests;
#[macro_use] extern crate lazy_static;

use regex::Regex;
use std::fmt::Write;

/// Internal error type.
///
/// # Remarks
/// This library is permissive and tries to parse input as best as it can.
/// Hence very few errors are possible. Trying to parse non-MIME documents is undefined behaviour.
#[derive(Debug,PartialEq)]
pub enum Error {
    Unknown,
    InvalidString,
    ParseError,
}

impl std::error::Error for Error {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        Some(self)
    }
}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Error::Unknown => write!(f, "Error parsing message: Unknown error"),
            Error::InvalidString => write!(f, "Error parsing message: Invalid string"),
            Error::ParseError => write!(f, "Error parsing message: Invalid document"),
        }
    }
}

/// Wraps a String tuple for more literate usage and application of traits.
#[derive(Debug,PartialEq)]
pub struct Header {
    pub key: String,
    pub value: String,
}

impl Header {
    pub fn new(key: &str, value: &str) -> Header {
        Header {
            key: key.to_string(),
            value: value.to_string(),
        }
    }
}

impl std::string::ToString for Header {
    fn to_string(&self) -> String {
        format!("{}: {}", self.key, self.value)
    }
}

trait Search {
    fn search(&self, needle: &str) -> Option<String>;
}

impl Search for Vec<Header> {
    fn search(&self, needle: &str) -> Option<String> {
        for pair in self {
            if pair.key.to_lowercase() == needle.to_lowercase() {
                return Some(pair.value.to_string())
            }
        }
        None
    }
}

trait Boundary {
    fn boundary(&self) -> Option<String>;
}

impl Boundary for Vec<Header> {
    fn boundary(&self) -> Option<String> {
        match self.search("Content-Type") {
            None => return None,
            Some(ct) => {
                // https://tools.ietf.org/html/rfc1521#page-10
                // First split off the type/subtype from the parameters
                let tmp: Vec<&str> = ct.splitn(2, ";").collect();

                // Now search the parameters
                if tmp.len() > 1 {
                    let tmp = tmp[1];
                    let tmp: Vec<&str> = tmp.split(";").collect();
                    for p in tmp {
                        let parameter: Vec<&str> = p.split("=").collect();
                        let key = parameter[0].trim().to_lowercase();
                        let value = parameter[1].trim();
                        let value = value.replace("\"", ""); // strip quotes
                        let value = value.trim();

                        if key == "boundary" {
                            return Some(value.to_string())
                        }
                    }
                }
                return None
            }
        }
    }
}

// Wrapper to avoid error.
trait ToString {
    fn to_string(&self) -> String;
}

impl ToString for Vec<Header> {
    fn to_string(&self) -> String {
        let tmp: Vec<String> = self.iter().map(|x| x.to_string()).collect();
        let tmp = tmp.join("\n");
        String::from(tmp)
    }
}

/// Representation of a section of a MIME document.
///
/// MIME sections can be some text; a header and some text or data; or nested combinations.
/// This parser may produce empty sections.
#[derive(Debug, PartialEq)]
pub enum Section {
    Plain {body: Vec<u8>},
    Multipart {
        headers: Vec<Header>,
        body: Vec<Box<Section>>,
    },
    Empty,
}

impl std::string::ToString for Section {
    fn to_string(&self) -> String {
        let mut section_string = String::new();
        match self {
            Section::Plain {body} => write!(&mut section_string, "{}", std::str::from_utf8(body).unwrap()).expect("Error constructing string."),
            Section::Multipart {headers, body} => {
                write!(&mut section_string, "{}\n", headers.to_string()).expect("Error constructing string.");
                let boundary = headers.boundary();
                match &boundary {
                    None => (),
                    Some(b) => write!(&mut section_string, "--{}", b).expect("Error constructing string."),
                };
                for section in body {
                    write!(&mut section_string, "\n{}\n", section.to_string()).expect("Error constructing string.");
                    match &boundary {
                        None => (),
                        Some(b) => write!(&mut section_string, "--{}", b).expect("Error constructing string."),
                    };
                }
                match &boundary {
                    None => (),
                    Some(_b) => write!(&mut section_string, "--").expect("Error constructing string."),
                };
            },
            Section::Empty => (),
        }
        section_string
    }
}

impl Section {
    fn new(raw_section: &str) -> Result<Section, Box<dyn std::error::Error + 'static>> {
        // A section can either be just some plain text, or be split into headers/body.
        // That body is also section.
        // If the raw section has no headers, return it as plain
        // If it has headers, split off the headers and recurse

        // Catch leftover from multipart parsing
        if raw_section == "--\n" {
            return Ok(Section::Empty);
        }
        if raw_section == "--" {
            return Ok(Section::Empty);
        }

        // A section with headers has a different parsing pipeline than one without.
        if Section::has_headers(raw_section)? {
            Section::parse_multipart(raw_section)
        } else {
            Ok(Section::Plain {body: raw_section.as_bytes().to_vec()})
        }
    }

    fn has_headers(raw_message: &str) -> Result<bool, Box<dyn std::error::Error + 'static>> {
        // If there are headers there should be a content-type
        // Note that headers may be separated by a boundary (nested sections) or newlines (not
        // nested)
        lazy_static! {
            static ref RE: Regex = Regex::new(r"(Content-Type|Content-type|content-type): .+?").unwrap();
        }
        // Performance: Assume that the header is not too long and the boundary appears early
        if raw_message.len() > 3000 {
            Ok(RE.is_match(&raw_message[0..3000]))
        } else {
            Ok(RE.is_match(raw_message))
        }
    }

    fn has_boundary(raw_message: &str) -> Result<bool, Box<dyn std::error::Error + 'static>> {
        lazy_static! {
            static ref RE: Regex = Regex::new(r"(boundary|Boundary)=.+?").unwrap();
        }
        // Performance: Assume that the header is not too long and the boundary appears early
        if raw_message.len() > 3000 {
            Ok(RE.is_match(&raw_message[0..3000]))
        } else {
            Ok(RE.is_match(raw_message))
        }
    }

    fn parse_multipart(raw_section: &str) -> Result<Section, Box<dyn std::error::Error + 'static>> {
        // The body can contain a series of (possibly nested) sections
        // So check for a boundary.
        // If there is a boundary, split the body and iterate.
        // Otherwise, just return a single-entry Vec

        if Section::has_boundary(raw_section)? {
            lazy_static! {
                static ref RE: Regex = Regex::new(r#"(?m)(boundary|Boundary)=(("|')(?P<boundary1>[[:print:]]+?)("|')|(?P<boundary2>[[:print:]]+?($|;)))"#).unwrap();
            }
            let boundary = match RE.captures(raw_section) {
                Some(c) => {
                    let boundary1 = match c.name("boundary1") {
                        Some(s) => s.as_str().to_string(),
                        None => String::new()
                    };
                    let boundary2 = match c.name("boundary2") {
                        Some(s) => s.as_str().to_string(),
                        None => String::new()
                    };
                    if boundary1.len() > 0 {
                        boundary1
                    } else if boundary2.len() > 0 {
                        boundary2
                    } else {
                        String::new()
                    }
                },
                None => String::new(),
            };
            if boundary.len() == 0 {
                return Err(Box::new(Error::ParseError));
            }
            // Each section is separated by --<boundary>, and finishes with --<boundary>--
            let boundary = format!("--{}", boundary);
            let raw_sections: Vec<&str> = raw_section.split(boundary.as_str()).collect();

            let raw_headers = raw_sections[0];
            let headers = parse_headers(raw_headers)?;

            let mut sections = Vec::new();
            let raw_sections = &raw_sections[1..raw_sections.len() - 1]; // Drop empty section at tail

            for section in raw_sections {
                // Recursively construct sections
                let section = Section::new(&section)?;
                sections.push(Box::new(section));
            }

            Ok(Section::Multipart {
                headers: headers,
                body: sections,
            })
        } else {
            // Separate out headers
            lazy_static! {
                static ref RE: Regex = Regex::new(r"\n{2,}|\r{2,}").unwrap();
            }
            let split: Vec<&str> = RE.splitn(raw_section, 2).collect();
            let raw_headers = split[0];
            let headers = parse_headers(raw_headers)?;

            // Process body
            let body = split[1];
            let body = Section::new(&body)?;
            let sections = vec![Box::new(body)];

            Ok(Section::Multipart {
                headers: headers,
                body: sections,
            })
        }
    }
}

/// Representation of a MIME document.
///
/// MIME documents have a large initial key-value header, followed by one or more text/data sections.
/// A section can be some plain text; a header with text or data; or some nested combination.
#[derive(Debug)]
pub struct Message {
    pub headers: Vec<Header>,
    pub sections: Vec<Section>,
}

impl std::string::ToString for Message {
    fn to_string(&self) -> String {
        let mut message = String::new();
        write!(&mut message, "{}\n\n", self.headers.to_string()).expect("Error constructing string.");
        let boundary = self.headers.boundary();
        match &boundary {
            None => (),
            Some(b) => write!(&mut message, "--{}", b).expect("Error constructing string."),
        };
        for section in &self.sections {
            write!(&mut message, "\n{}\n", section.to_string()).expect("Error constructing string.");
            match &boundary {
                None => (),
                Some(b) => write!(&mut message, "--{}", b).expect("Error constructing string."),
            };
        }
        match boundary {
            None => (),
            Some(_b) => write!(&mut message, "--").expect("Error constructing string."),
        };
        message
    }
}

impl Message {
    /// Parse a MIME document and return structured representation.
    /// Performance should be reasonable: provided tests take between 5 and 200μs per document.
    pub fn new(raw_message: &str) -> Result<Message, Box<dyn std::error::Error + 'static>> {
        // Multipart and plain messages require entirely different parsing pathways
        if Message::is_multipart(raw_message)? {
            Message::parse_multipart(raw_message)
        } else {
            Message::parse_plain(raw_message)
        }
    }

    fn is_multipart(raw_message: &str) -> Result<bool, Box<dyn std::error::Error + 'static>> {
        lazy_static! {
            static ref RE: Regex = Regex::new(r"(Content-Type|Content-type|content-type): multipart.+?").unwrap();
        }
        Ok(RE.is_match(&raw_message))
    }

    fn parse_plain(raw_message: &str) -> Result<Message, Box<dyn std::error::Error + 'static>> {
        // Plain messages separate the headers from the body with more than 2 newlines
        lazy_static! {
            static ref RE: Regex = Regex::new(r"\n{2,}|\r{2,}").unwrap();
        }
        let split: Vec<&str> = RE.splitn(raw_message, 2).collect();

        if split.len() != 2 || split[0].len() == 0 || split[1].len() == 0 {
            return Err(Box::new(Error::InvalidString));
        }

        let raw_headers = split[0];
        let headers = parse_headers(raw_headers)?;

        // Everything after the header is by definition the body. There is only one section.
        let tmp = split[1];
        let sections = vec![Section::new(&tmp)?];

        Ok(Message {
            headers: headers,
            sections: sections,
        })
    }

    fn parse_multipart(raw_message: &str) -> Result<Message, Box<dyn std::error::Error + 'static>> {
        // Multipart messages separate parts using a boundary string, defined in the main headers
        // Any reasonable string after a `boundary="` is the boundary
        lazy_static! {
            static ref RE: Regex = Regex::new(r#"(Boundary|boundary)=("|')(?P<boundary>[[:print:]]+?)("|')"#).unwrap();
        }
        let b = match RE.captures(raw_message) {
            Some(c) => c["boundary"].to_string(),
            None => return Err(Box::new(Error::InvalidString)),
        };
        let boundary = format!("--{}", b);
        let raw_parts: Vec<&str> = raw_message.split(boundary.as_str()).collect();

        let raw_headers = raw_parts[0];
        let headers = parse_headers(raw_headers)?;

        let mut sections = Vec::new();
        let raw_parts = &raw_parts[1..raw_parts.len()];

        // Parse each section
        for section in raw_parts {
            let section = Section::new(section)?; // Note that this constructor will recursively build sections, as required
            sections.push(section);
        }

        Ok(Message {
            headers: headers,
            sections: sections,
        })
    }
}

// Find keys and values for each header
fn parse_headers(raw_headers: &str) -> Result<Vec<Header>, Box<dyn std::error::Error + 'static>> {
    // A MIME key is a string of letters|numbers|-|_, followed by a :
    // It starts on it's own line (i.e. after a \n)
    // While the spec requires a header to be all on its own line,
    // parsers in the wild (e.g. GMail) will split headers across mutliple lines.
    // Hence, a value is everything between two keys.
    // So, find the positions of all the keys -> extract key String
    // Then, infer the position of text between subsequent keys -> extract value String

    lazy_static! {
        static ref RE: Regex = Regex::new(r"(?m)^[0-9A-Za-z_\-]+:").unwrap();
    }
    let mut header_indices: Vec<(usize,usize)> = Vec::new();

    // Construct a vector of key positions
    for header in RE.find_iter(raw_headers) {
        header_indices.push((header.start(), header.end() - 1)); // Strip off :
    }

    // Convert key positions to key and value strings
    let mut headers: Vec<Header> = Vec::new();
    for (index, header) in header_indices.iter().enumerate() {
        let key = String::from(&raw_headers[header.0..header.1]);
        let key = key.trim();
        let key = String::from(key);
        let key = key.to_lowercase();

        // The final value is not between two keys: it is final key to end of string
        if index < header_indices.len() - 1 {
            let value = String::from(&raw_headers[header.1 + 2..header_indices[index + 1].0]);  // Correct for :
            let value = value.trim();
            headers.push(Header::new(&key, &value));
        } else {
            let value = String::from(&raw_headers[header.1 + 2..]);
            let value = value.trim();
            headers.push(Header::new(&key, &value));
        }
    }

    Ok(headers)
}