whatsapp_export_parser/
parser.rs

1use crate::{Attachment, Body, DirectMessage, Message, Metadata, Span};
2use chrono::NaiveDateTime;
3use std::{
4    borrow::Cow,
5    fmt::{self, Display, Formatter},
6};
7
8/// Try to parse some WhatsApp messages from an export file.
9pub fn parse(src: &str) -> Parsed {
10    let cursor = Cursor::new(src);
11    let mut errors = Vec::new();
12
13    let messages = parse_file(cursor, |d| errors.push(d));
14
15    Parsed { messages, errors }
16}
17
18/// The outcome of parsing an exported WhatsApp chat.
19#[derive(Debug, Clone, PartialEq)]
20#[non_exhaustive]
21pub struct Parsed {
22    /// The messages that were found.
23    pub messages: Vec<Message>,
24    /// Any parse errors that may have occurred.
25    pub errors: Vec<ParseError>,
26}
27
/// A lightweight, copyable view into the text that hasn't been parsed yet.
///
/// `rest` is the remaining input and `index` is the byte offset of `rest`
/// from the position the [`Cursor`] was created at.
#[derive(Debug, Copy, Clone, PartialEq)]
struct Cursor<'src> {
    rest: &'src str,
    index: usize,
}

impl<'src> Cursor<'src> {
    /// Create a [`Cursor`] positioned at the very start of `src`.
    const fn new(src: &'src str) -> Self { Cursor { rest: src, index: 0 } }

    /// Has all of the input been consumed?
    fn is_empty(&self) -> bool { self.rest.is_empty() }

    /// The number of bytes left to read.
    fn len(&self) -> usize { self.rest.len() }

    /// Splits off the text in front of the first character for which
    /// `predicate` returns `true`.
    ///
    /// The predicate is invoked once per character, in order, and is not
    /// called again after it has returned `true`. If it never returns `true`
    /// the entire remaining input is split off.
    ///
    /// Returns `None` when nothing would be consumed, i.e. the input is
    /// empty or the very first character satisfies the predicate.
    fn split_at<P>(self, mut predicate: P) -> Option<(&'src str, Self)>
    where
        P: FnMut(char) -> bool,
    {
        let bytes_read = self
            .rest
            .char_indices()
            .find(|&(_, c)| predicate(c))
            .map(|(offset, _)| offset)
            .unwrap_or_else(|| self.rest.len());

        if bytes_read == 0 {
            None
        } else {
            Some(self.split(bytes_read))
        }
    }

    /// Splits off the first `index` bytes, returning them together with a
    /// [`Cursor`] pointing just past them.
    fn split(self, index: usize) -> (&'src str, Self) {
        (&self.rest[..index], self.advance(index))
    }

    /// Moves to the first character after the current line that is neither
    /// `'\n'` nor `'\r'`, or to the end of input if there is no such
    /// character.
    fn skip_to_next_line(self) -> Self {
        // first get to the newline that terminates the current line
        let (_, at_line_end) = self.rest_of_line();

        // then swallow the run of line terminators that follows it
        match at_line_end.split_at(|c| !(c == '\n' || c == '\r')) {
            Some((_, next_line)) => next_line,
            None => at_line_end.eof(),
        }
    }

    /// Returns the text up to (but not including) the next `'\n'`, and a
    /// [`Cursor`] pointing at that `'\n'` (or at the end of input when there
    /// are no more newlines).
    fn rest_of_line(self) -> (&'src str, Cursor<'src>) {
        if self.rest.starts_with('\n') {
            // we are already sitting on the newline, so the line is empty
            return ("", self);
        }

        match self.split_at(|c| c == '\n') {
            Some(line_and_cursor) => line_and_cursor,
            None => (self.rest, self.eof()),
        }
    }

    /// Moves the [`Cursor`] forward by `amount` bytes.
    fn advance(self, amount: usize) -> Self {
        let Cursor { rest, index } = self;

        Cursor {
            rest: &rest[amount..],
            index: index + amount,
        }
    }

    /// Moves the [`Cursor`] to the end of the input.
    fn eof(self) -> Self { self.advance(self.rest.len()) }
}
108
109/// Parse an entire file.
110///
111/// In technical jargon, this is a naive `LL(k)` parser with arbitrary length
112/// lookahead, and based on the following grammar:
113///
114/// ```bnf
115/// file         := message*
116/// message      := metadata ":" body
117/// metadata     := timestamp "-" sender
118/// body         := attachment | direct_message
119/// attachment   := NAME_OR_PATH  "(file attached)"
120/// timestamp    := DATETIME
121/// sender       := NAME_OR_PATH
122///
123/// NAME_OR_PATH := /[\w\d.\s-]+/
124/// (* a timestamp formatted with the user's locale *)
125/// DATETIME     := "%d/%m/%y, %H:%M" | "%d/%m/%y, %I:%M %P"
126/// ```
127///
128/// Each rule of the grammar gets its own `parse_*` function (e.g. the `message`
129/// rule is parsed with [`parse_message()`]).
130///
131/// Instead of mutating any internal state, each function is given the input
132/// text and location ([`Cursor`]), and will return the parsed item and a new
133/// [`Cursor`] representing the unparsed part of the input.
134fn parse_file<E>(mut cursor: Cursor<'_>, mut on_error: E) -> Vec<Message>
135where
136    E: FnMut(ParseError),
137{
138    let mut messages = Vec::new();
139
140    while !cursor.is_empty() {
141        match parse_message(cursor) {
142            Ok((msg, new_cursor)) => {
143                messages.push(msg);
144                cursor = new_cursor;
145            },
146            Err(diag) => {
147                on_error(diag);
148            },
149        }
150
151        // make sure the next call will start at the beginning of
152        // the next line
153        cursor = cursor.skip_to_next_line();
154    }
155
156    messages
157}
158
159fn parse_message(
160    cursor: Cursor<'_>,
161) -> Result<(Message, Cursor<'_>), ParseError> {
162    let start = cursor.index;
163
164    let (meta, cursor) =
165        parse_metadata(cursor).map_err(|d| d.namespaced("metadata"))?;
166
167    let cursor = skip_character_surrounded_by_space(cursor, ':')?;
168    let (body, cursor) = parse_body(cursor);
169
170    let end = cursor.index;
171    let span = Span::new(start, end);
172    let msg = Message { meta, body, span };
173
174    Ok((msg, cursor))
175}
176
177fn parse_metadata(
178    cursor: Cursor<'_>,
179) -> Result<(Metadata, Cursor<'_>), ParseError> {
180    let start = cursor.index;
181
182    let (timestamp, cursor) = parse_timestamp(cursor)?;
183    let cursor = skip_character_surrounded_by_space(cursor, '-')?;
184    let (sender, cursor) = parse_sender(cursor)?;
185
186    let end = cursor.index;
187    let span = Span::new(start, end);
188    let meta = Metadata {
189        timestamp,
190        sender: String::from(sender),
191        span,
192    };
193
194    Ok((meta, cursor))
195}
196
197fn parse_body(cursor: Cursor<'_>) -> (Body, Cursor<'_>) {
198    if let Some((attachment, cursor)) = parse_attachment(cursor) {
199        (Body::from(attachment), cursor)
200    } else {
201        let (dm, cursor) = parse_direct_message(cursor);
202        (Body::from(dm), cursor)
203    }
204}
205
206fn parse_attachment(cursor: Cursor<'_>) -> Option<(Attachment, Cursor<'_>)> {
207    let (rest_of_line, end_of_line) = cursor.rest_of_line();
208
209    if rest_of_line.find(" (file attached)").is_none() {
210        // couldn't find the magic string for attachments
211        return None;
212    }
213
214    let start = cursor.index;
215    let (name, _) = parse_attachment_name(cursor)?;
216    let end = start + name.len();
217
218    let attachment = Attachment {
219        name: String::from(name),
220        span: Span::new(start, end),
221    };
222
223    Some((attachment, end_of_line))
224}
225
226fn parse_direct_message(cursor: Cursor<'_>) -> (DirectMessage, Cursor<'_>) {
227    // Unlike every other rule, a direct message may take up multiple lines.
228    //
229    // We can work around this by being a bit sneaky... Keep reading content
230    // until we encounter something that parses correctly as metadata (i.e.
231    // it's the start of the next message) then backtrack to the start of
232    // that line, accepting everything in between as the message body.
233    //
234    // because direct messages are arbitrary text, it's actually impossible
235    // for this rule to fail.
236    let start = cursor.index;
237
238    let (text, cursor) = to_end_of_direct_message(cursor);
239
240    // to provide better spans, we'll also skip past leading whitespace
241    let text_without_leading_whitespace = text.trim_start();
242    let bytes_skipped = text.len() - text_without_leading_whitespace.len();
243    let span = Span::new(start + bytes_skipped, cursor.index);
244
245    let msg = DirectMessage {
246        content: String::from(text),
247        span,
248    };
249
250    (msg, cursor)
251}
252
253fn to_end_of_direct_message(cursor: Cursor<'_>) -> (&'_ str, Cursor<'_>) {
254    let start = cursor.index;
255
256    // everything else on this line is part of the message
257    let mut scanning_ahead = cursor.skip_to_next_line();
258
259    // // look for the start of the next message
260    while !scanning_ahead.is_empty() && parse_metadata(scanning_ahead).is_err()
261    {
262        scanning_ahead = scanning_ahead.skip_to_next_line();
263    }
264
265    // this moves backwards over the newline characters at the
266    // end of the message. We want this rule to match *only* the message
267    // body and our previous backtracking was a bit eager.
268    let bytes_read = scanning_ahead.index - start;
269    let text_to_start_of_next_message = &cursor.rest[..bytes_read];
270    let bytes_to_end_of_message =
271        text_to_start_of_next_message.trim_end().len();
272
273    cursor.split(bytes_to_end_of_message)
274}
275
276fn parse_attachment_name(cursor: Cursor<'_>) -> Option<(&'_ str, Cursor<'_>)> {
277    parse_name_or_path(cursor).ok()
278}
279
280fn parse_timestamp(
281    cursor: Cursor<'_>,
282) -> Result<(NaiveDateTime, Cursor<'_>), ParseError> {
283    // everything from the start of a line to the "-" is part of the
284    // timestamp.
285    let (candidate, _) = match cursor.split_at(|c| c == '-') {
286        Some(s) => s,
287        None => return Err(ParseError::new("timestamp", cursor.index)),
288    };
289
290    match parse_australian_timestamp(candidate.trim()) {
291        Some(ts) => {
292            // move the cursor to the "-"
293            let cursor = cursor.advance(candidate.len());
294            Ok((ts, cursor))
295        },
296        None => Err(ParseError::new("timestamp", cursor.index)),
297    }
298}
299
300fn parse_sender(
301    cursor: Cursor<'_>,
302) -> Result<(&'_ str, Cursor<'_>), ParseError> {
303    parse_name_or_path(cursor)
304}
305
306fn parse_name_or_path(
307    cursor: Cursor<'_>,
308) -> Result<(&'_ str, Cursor<'_>), ParseError> {
309    match cursor.split_at(|c| !is_valid_name_or_path_character(c)) {
310        Some((name, cursor)) => {
311            let name = name.trim_end();
312            Ok((name, cursor))
313        },
314        None => Err(ParseError::new("name or path", cursor.index)),
315    }
316}
317
318fn skip_character_surrounded_by_space(
319    cursor: Cursor<'_>,
320    letter: char,
321) -> Result<Cursor<'_>, ParseError> {
322    let mut current_state = State::SkippingWhitespaceBefore;
323
324    match cursor.split_at(whitespace_skipper(&mut current_state, letter)) {
325        Some((_, cursor)) if current_state == State::Done => Ok(cursor),
326        // anything other than State::Done is an error
327        _ => Err(ParseError::new(
328            format!("skip a '{}' surrounded by whitespace", letter),
329            cursor.index,
330        )),
331    }
332}
333
/// The states the [`whitespace_skipper()`] state machine can be in.
#[derive(Debug, Copy, Clone, PartialEq)]
enum State {
    /// Reading the (optional) whitespace in front of the letter.
    SkippingWhitespaceBefore,
    /// The letter has just been read.
    EncounteredLetter,
    /// Reading the whitespace after the letter.
    SkippingWhitespaceAfter,
    /// The first non-whitespace character after the letter was reached.
    Done,
    /// An unexpected character was encountered.
    Error,
}

/// Get a predicate for use with [`Cursor::split_at()`] which drives a small
/// state machine over: optional whitespace, then `letter`, then at least one
/// whitespace character.
///
/// The predicate returns `true` (i.e. "stop here") as soon as the machine
/// reaches [`State::Done`] or [`State::Error`]. [`State::Done`] is only
/// entered on the first non-whitespace character *after* the trailing
/// whitespace, so input which ends before such a character leaves the
/// machine in one of the intermediate states.
///
/// Whitespace is whatever [`char::is_whitespace()`] accepts, which includes
/// newlines. The final state is written to `current_state` so the caller can
/// inspect it afterwards.
fn whitespace_skipper(
    current_state: &mut State,
    letter: char,
) -> impl FnMut(char) -> bool + '_ {
    fn transition(state: State, c: char, letter: char) -> State {
        match (state, c.is_whitespace()) {
            (State::SkippingWhitespaceBefore, true) => {
                State::SkippingWhitespaceBefore
            },
            (State::SkippingWhitespaceBefore, false) if c == letter => {
                State::EncounteredLetter
            },
            (State::SkippingWhitespaceBefore, false) => State::Error,

            (State::EncounteredLetter, true) => State::SkippingWhitespaceAfter,
            (State::EncounteredLetter, false) => State::Error,

            (State::SkippingWhitespaceAfter, true) => {
                State::SkippingWhitespaceAfter
            },
            (State::SkippingWhitespaceAfter, false) => State::Done,

            // terminal states never change
            (State::Done, _) | (State::Error, _) => state,
        }
    }

    move |c: char| {
        let next = transition(*current_state, c, letter);
        *current_state = next;

        match next {
            State::Done | State::Error => true,
            State::SkippingWhitespaceBefore
            | State::EncounteredLetter
            | State::SkippingWhitespaceAfter => false,
        }
    }
}
384
385/// Tries to parse a timestamp in typical australian forms.
386fn parse_australian_timestamp(src: &str) -> Option<NaiveDateTime> {
387    let forms = &["%d/%m/%y, %H:%M", "%d/%m/%y, %I:%M %P"];
388
389    for form in forms {
390        if let Ok(timestamp) = NaiveDateTime::parse_from_str(src, form) {
391            return Some(timestamp);
392        }
393    }
394
395    None
396}
397
/// Is `c` allowed to appear in a `NAME_OR_PATH`?
///
/// Accepts whitespace, alphanumeric characters, and a handful of punctuation
/// characters (`-`, `_`, `.`, and `+`).
fn is_valid_name_or_path_character(c: char) -> bool {
    const PUNCTUATION: [char; 4] = ['-', '_', '.', '+'];

    c.is_whitespace() || c.is_alphanumeric() || PUNCTUATION.contains(&c)
}
408
/// An error that can occur while parsing.
///
/// It records which grammar production the parser was trying to match and
/// the byte offset at which the attempt was made. It implements
/// [`std::error::Error`] so it can be boxed or propagated like any other
/// error type.
#[derive(Debug, Clone, PartialEq)]
pub struct ParseError {
    /// The name of the production that was being parsed (possibly prefixed
    /// with the names of enclosing productions, separated by `.`).
    production_name: Cow<'static, str>,
    /// The byte offset the error was encountered at.
    location: usize,
}

impl ParseError {
    /// What the parser was trying to parse at the time.
    pub fn production_name(&self) -> &str { &self.production_name }

    /// The byte offset this error was encountered at.
    pub fn index(&self) -> usize { self.location }

    /// Create a new [`ParseError`] for the named production at `location`.
    fn new<S: Into<Cow<'static, str>>>(
        production_name: S,
        location: usize,
    ) -> Self {
        ParseError {
            production_name: production_name.into(),
            location,
        }
    }

    /// Get a copy of this error whose production name is prefixed with
    /// `new_name` and a `.` (e.g. `timestamp` becomes `metadata.timestamp`).
    fn namespaced<S: AsRef<str>>(&self, new_name: S) -> Self {
        ParseError::new(
            format!("{}.{}", new_name.as_ref(), self.production_name),
            self.location,
        )
    }
}

impl Display for ParseError {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "expected {} at index {}",
            self.production_name, self.location
        )
    }
}

// Public error types should be usable wherever a `dyn Error` is expected.
// The default method implementations are sufficient because a `ParseError`
// has no underlying source.
impl std::error::Error for ParseError {}
450
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;

    /// Shorthand for a timestamp on the given day with zero seconds.
    fn at(
        year: i32,
        month: u32,
        day: u32,
        hour: u32,
        minute: u32,
    ) -> NaiveDateTime {
        NaiveDate::from_ymd(year, month, day).and_hms(hour, minute, 0)
    }

    /// Build a [`Body::DirectMessage`] with the given content and span.
    fn direct_message<S: Into<String>>(content: S, span: Span) -> Body {
        let content = content.into();
        Body::DirectMessage(DirectMessage { content, span })
    }

    /// Build a [`Body::Attachment`] with the given name and span.
    fn attachment<S: Into<String>>(name: S, span: Span) -> Body {
        let name = name.into();
        Body::Attachment(Attachment { name, span })
    }

    #[test]
    fn parse_several_common_timestamp_formats() {
        let cases = [
            ("31/10/19, 16:26", at(2019, 10, 31, 16, 26)),
            ("31/10/19, 16:16", at(2019, 10, 31, 16, 16)),
            ("22/2/20, 3:58 pm", at(2020, 2, 22, 15, 58)),
            ("22/2/20, 3:37 pm", at(2020, 2, 22, 15, 37)),
        ];

        for &(src, expected) in cases.iter() {
            let parsed = parse_australian_timestamp(src).unwrap();
            assert_eq!(parsed, expected);
        }
    }

    #[test]
    fn cursor_split_at() {
        let start = Cursor::new("Hello World. asdf");

        let (text, remainder) = start.split_at(|c| c == '.').unwrap();

        assert_eq!(text, "Hello World");
        let expected = Cursor {
            rest: ". asdf",
            index: text.len(),
        };
        assert_eq!(remainder, expected);
    }

    #[test]
    fn known_messages() {
        let cases = vec![
            (
                "31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is",
                Message {
                    meta: Metadata {
                        timestamp: at(2019, 10, 31, 16, 16),
                        sender: String::from("Michael-F-Bryan"),
                        span: Span::new(0, 33),
                    },
                    body: direct_message(
                        "I figured out what the problem is",
                        Span::new(35, 68),
                    ),
                    span: Span::new(0, 68),
                },
            ),
            (
                "31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)",
                Message {
                    meta: Metadata {
                        timestamp: at(2019, 10, 31, 14, 13),
                        sender: String::from("Michael-F-Bryan"),
                        span: Span::new(0, 33),
                    },
                    body: attachment(
                        "IMG-20191031-WA0005.jpg",
                        Span::new(35, 58),
                    ),
                    span: Span::new(0, 74),
                },
            ),
        ];

        for (src, expected) in cases {
            let (message, remainder) =
                parse_message(Cursor::new(src)).unwrap();

            assert_eq!(message, expected);
            let at_end = Cursor {
                rest: "",
                index: src.len(),
            };
            assert_eq!(remainder, at_end);
        }
    }

    #[test]
    fn multiline_direct_message() {
        let src = "31/10/19, 14:13 - Michael-F-Bryan: this is a\nreally\nlong\nmessage";
        let expected_body = direct_message(
            "this is a\nreally\nlong\nmessage",
            Span::new(35, src.len()),
        );

        let parsed = parse(src);

        assert!(parsed.errors.is_empty());
        assert_eq!(parsed.messages.len(), 1);
        assert_eq!(parsed.messages[0].body, expected_body);
    }

    #[test]
    fn skip_over_unparseable_lines() {
        let src = r#"
31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is
31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)
this is some garbage content!

$and more garbage (note: the previous line was skipped because it was empty, not message or garbage)
"#;

        let parsed = parse(src);

        println!("{:#?}", parsed);
        assert_eq!(parsed.messages.len(), 2);
        assert_eq!(parsed.errors.len(), 2);
    }

    #[test]
    fn skip_cursor_to_next_newline() {
        let start = Cursor::new("some text\n\nasdf");

        let expected = Cursor {
            rest: "asdf",
            index: 11,
        };
        assert_eq!(start.skip_to_next_line(), expected);
    }

    #[test]
    fn skip_to_next_line_with_no_more_newlines() {
        let src = "some text";

        let expected = Cursor {
            rest: "",
            index: src.len(),
        };
        assert_eq!(Cursor::new(src).skip_to_next_line(), expected);
    }

    #[test]
    fn skip_to_next_line_with_leading_newlines() {
        let start = Cursor::new("\nsome text");

        let expected = Cursor {
            rest: "some text",
            index: 1,
        };
        assert_eq!(start.skip_to_next_line(), expected);
    }

    #[test]
    fn rest_of_line_at_eof() {
        let src = "some text";
        let start = Cursor::new(src);

        let (line, remainder) = start.rest_of_line();

        assert_eq!(line, src);
        assert_eq!(remainder, start.eof());
    }

    #[test]
    fn some_known_senders() {
        let senders = [
            "Michael",
            "Michael-F-Bryan",
            "Michael Bryan",
            "+60 12-345 6789",
        ];

        for &src in senders.iter() {
            let (sender, remainder) = parse_sender(Cursor::new(src)).unwrap();

            assert_eq!(sender, src);
            let at_end = Cursor {
                rest: "",
                index: src.len(),
            };
            assert_eq!(remainder, at_end);
        }
    }

    #[test]
    fn split_at_when_all_characters_succeed() {
        let src = "Michael";
        let start = Cursor::new(src);

        let split = start.split_at(|c| !is_valid_name_or_path_character(c));
        let (text, remainder) = split.unwrap();

        assert_eq!(text, src);
        assert_eq!(remainder, start.eof());
    }
}