1use crate::{Attachment, Body, DirectMessage, Message, Metadata, Span};
2use chrono::NaiveDateTime;
3use std::{
4 borrow::Cow,
5 fmt::{self, Display, Formatter},
6};
7
8pub fn parse(src: &str) -> Parsed {
10 let cursor = Cursor::new(src);
11 let mut errors = Vec::new();
12
13 let messages = parse_file(cursor, |d| errors.push(d));
14
15 Parsed { messages, errors }
16}
17
/// The outcome of [`parse()`]: the messages that were recognised plus the
/// errors reported along the way.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Parsed {
    /// The messages that were parsed successfully.
    pub messages: Vec<Message>,
    /// The errors reported for input that could not be parsed as a message.
    pub errors: Vec<ParseError>,
}
27
/// A cheap, copyable view of the text that still has to be parsed.
///
/// `rest` is the unread text and `index` is the number of bytes consumed
/// since the cursor was created with [`Cursor::new`].
#[derive(Debug, Copy, Clone, PartialEq)]
struct Cursor<'src> {
    rest: &'src str,
    index: usize,
}

impl<'src> Cursor<'src> {
    /// Creates a cursor positioned at the very start of `src`.
    const fn new(src: &'src str) -> Self {
        Cursor {
            rest: src,
            index: 0,
        }
    }

    /// Is there no text left to read?
    fn is_empty(&self) -> bool { self.len() == 0 }

    /// The number of unread bytes.
    fn len(&self) -> usize { self.rest.len() }

    /// Reads characters until `predicate` returns `true` (or the text runs
    /// out), returning the text read and a cursor positioned just after it.
    ///
    /// The predicate is invoked once per character, in order, and never
    /// again after it has returned `true`. Returns `None` when nothing would
    /// be read.
    fn split_at<P>(self, mut predicate: P) -> Option<(&'src str, Self)>
    where
        P: FnMut(char) -> bool,
    {
        let bytes_read = self
            .rest
            .char_indices()
            .find(|&(_, c)| predicate(c))
            .map_or(self.len(), |(offset, _)| offset);

        if bytes_read == 0 {
            None
        } else {
            Some(self.split(bytes_read))
        }
    }

    /// Splits off the first `index` bytes, returning them together with a
    /// cursor over whatever follows.
    fn split(self, index: usize) -> (&'src str, Self) {
        (&self.rest[..index], self.advance(index))
    }

    /// Moves past the remainder of the current line and any run of `'\n'` /
    /// `'\r'` characters that follows it.
    fn skip_to_next_line(self) -> Self {
        let (_, at_line_end) = self.rest_of_line();

        match at_line_end.split_at(|c| !(c == '\n' || c == '\r')) {
            Some((_, next_line)) => next_line,
            // no line break was consumed, so fall back to the end of input
            None => at_line_end.eof(),
        }
    }

    /// Returns the text up to (but not including) the next `'\n'`, plus a
    /// cursor sitting on that `'\n'` (or at the end of the input if there
    /// isn't one).
    fn rest_of_line(self) -> (&'src str, Cursor<'src>) {
        if self.rest.starts_with('\n') {
            // already sitting on the line break; nothing to read
            ("", self)
        } else {
            match self.split_at(|c| c == '\n') {
                Some(parts) => parts,
                None => (self.rest, self.eof()),
            }
        }
    }

    /// Moves the cursor forward by `amount` bytes.
    fn advance(self, amount: usize) -> Self {
        let Cursor { rest, index } = self;

        Cursor {
            rest: &rest[amount..],
            index: index + amount,
        }
    }

    /// A cursor positioned at the end of the remaining text.
    fn eof(self) -> Self { self.advance(self.rest.len()) }
}
108
109fn parse_file<E>(mut cursor: Cursor<'_>, mut on_error: E) -> Vec<Message>
135where
136 E: FnMut(ParseError),
137{
138 let mut messages = Vec::new();
139
140 while !cursor.is_empty() {
141 match parse_message(cursor) {
142 Ok((msg, new_cursor)) => {
143 messages.push(msg);
144 cursor = new_cursor;
145 },
146 Err(diag) => {
147 on_error(diag);
148 },
149 }
150
151 cursor = cursor.skip_to_next_line();
154 }
155
156 messages
157}
158
159fn parse_message(
160 cursor: Cursor<'_>,
161) -> Result<(Message, Cursor<'_>), ParseError> {
162 let start = cursor.index;
163
164 let (meta, cursor) =
165 parse_metadata(cursor).map_err(|d| d.namespaced("metadata"))?;
166
167 let cursor = skip_character_surrounded_by_space(cursor, ':')?;
168 let (body, cursor) = parse_body(cursor);
169
170 let end = cursor.index;
171 let span = Span::new(start, end);
172 let msg = Message { meta, body, span };
173
174 Ok((msg, cursor))
175}
176
177fn parse_metadata(
178 cursor: Cursor<'_>,
179) -> Result<(Metadata, Cursor<'_>), ParseError> {
180 let start = cursor.index;
181
182 let (timestamp, cursor) = parse_timestamp(cursor)?;
183 let cursor = skip_character_surrounded_by_space(cursor, '-')?;
184 let (sender, cursor) = parse_sender(cursor)?;
185
186 let end = cursor.index;
187 let span = Span::new(start, end);
188 let meta = Metadata {
189 timestamp,
190 sender: String::from(sender),
191 span,
192 };
193
194 Ok((meta, cursor))
195}
196
197fn parse_body(cursor: Cursor<'_>) -> (Body, Cursor<'_>) {
198 if let Some((attachment, cursor)) = parse_attachment(cursor) {
199 (Body::from(attachment), cursor)
200 } else {
201 let (dm, cursor) = parse_direct_message(cursor);
202 (Body::from(dm), cursor)
203 }
204}
205
206fn parse_attachment(cursor: Cursor<'_>) -> Option<(Attachment, Cursor<'_>)> {
207 let (rest_of_line, end_of_line) = cursor.rest_of_line();
208
209 if rest_of_line.find(" (file attached)").is_none() {
210 return None;
212 }
213
214 let start = cursor.index;
215 let (name, _) = parse_attachment_name(cursor)?;
216 let end = start + name.len();
217
218 let attachment = Attachment {
219 name: String::from(name),
220 span: Span::new(start, end),
221 };
222
223 Some((attachment, end_of_line))
224}
225
226fn parse_direct_message(cursor: Cursor<'_>) -> (DirectMessage, Cursor<'_>) {
227 let start = cursor.index;
237
238 let (text, cursor) = to_end_of_direct_message(cursor);
239
240 let text_without_leading_whitespace = text.trim_start();
242 let bytes_skipped = text.len() - text_without_leading_whitespace.len();
243 let span = Span::new(start + bytes_skipped, cursor.index);
244
245 let msg = DirectMessage {
246 content: String::from(text),
247 span,
248 };
249
250 (msg, cursor)
251}
252
253fn to_end_of_direct_message(cursor: Cursor<'_>) -> (&'_ str, Cursor<'_>) {
254 let start = cursor.index;
255
256 let mut scanning_ahead = cursor.skip_to_next_line();
258
259 while !scanning_ahead.is_empty() && parse_metadata(scanning_ahead).is_err()
261 {
262 scanning_ahead = scanning_ahead.skip_to_next_line();
263 }
264
265 let bytes_read = scanning_ahead.index - start;
269 let text_to_start_of_next_message = &cursor.rest[..bytes_read];
270 let bytes_to_end_of_message =
271 text_to_start_of_next_message.trim_end().len();
272
273 cursor.split(bytes_to_end_of_message)
274}
275
/// Parses an attachment's name using [`parse_name_or_path`], discarding the
/// error and returning `None` when no name could be read.
fn parse_attachment_name(cursor: Cursor<'_>) -> Option<(&'_ str, Cursor<'_>)> {
    parse_name_or_path(cursor).ok()
}
279
280fn parse_timestamp(
281 cursor: Cursor<'_>,
282) -> Result<(NaiveDateTime, Cursor<'_>), ParseError> {
283 let (candidate, _) = match cursor.split_at(|c| c == '-') {
286 Some(s) => s,
287 None => return Err(ParseError::new("timestamp", cursor.index)),
288 };
289
290 match parse_australian_timestamp(candidate.trim()) {
291 Some(ts) => {
292 let cursor = cursor.advance(candidate.len());
294 Ok((ts, cursor))
295 },
296 None => Err(ParseError::new("timestamp", cursor.index)),
297 }
298}
299
/// Parses the sender of a message by delegating to [`parse_name_or_path`].
fn parse_sender(
    cursor: Cursor<'_>,
) -> Result<(&'_ str, Cursor<'_>), ParseError> {
    parse_name_or_path(cursor)
}
305
306fn parse_name_or_path(
307 cursor: Cursor<'_>,
308) -> Result<(&'_ str, Cursor<'_>), ParseError> {
309 match cursor.split_at(|c| !is_valid_name_or_path_character(c)) {
310 Some((name, cursor)) => {
311 let name = name.trim_end();
312 Ok((name, cursor))
313 },
314 None => Err(ParseError::new("name or path", cursor.index)),
315 }
316}
317
318fn skip_character_surrounded_by_space(
319 cursor: Cursor<'_>,
320 letter: char,
321) -> Result<Cursor<'_>, ParseError> {
322 let mut current_state = State::SkippingWhitespaceBefore;
323
324 match cursor.split_at(whitespace_skipper(&mut current_state, letter)) {
325 Some((_, cursor)) if current_state == State::Done => Ok(cursor),
326 _ => Err(ParseError::new(
328 format!("skip a '{}' surrounded by whitespace", letter),
329 cursor.index,
330 )),
331 }
332}
333
/// The states of the machine used to skip a separator character that is
/// surrounded by whitespace.
#[derive(Debug, Copy, Clone, PartialEq)]
enum State {
    /// Consuming whitespace in front of the separator.
    SkippingWhitespaceBefore,
    /// The separator itself has just been seen.
    EncounteredLetter,
    /// Consuming whitespace after the separator.
    SkippingWhitespaceAfter,
    /// The first non-whitespace character after the separator was reached.
    Done,
    /// A character was seen which doesn't fit the expected shape.
    Error,
}

impl State {
    /// Computes the state that follows `self` after seeing `c`, where
    /// `letter` is the separator being looked for.
    fn advance(self, c: char, letter: char) -> State {
        match (self, c.is_whitespace()) {
            (State::SkippingWhitespaceBefore, true) => {
                State::SkippingWhitespaceBefore
            },
            (State::SkippingWhitespaceBefore, false) if c == letter => {
                State::EncounteredLetter
            },
            (State::SkippingWhitespaceBefore, false) => State::Error,
            (State::EncounteredLetter, true) => State::SkippingWhitespaceAfter,
            (State::EncounteredLetter, false) => State::Error,
            (State::SkippingWhitespaceAfter, true) => {
                State::SkippingWhitespaceAfter
            },
            (State::SkippingWhitespaceAfter, false) => State::Done,
            // terminal states never change
            (State::Done, _) | (State::Error, _) => self,
        }
    }

    /// Has the machine stopped, successfully or otherwise?
    fn is_terminal(self) -> bool {
        match self {
            State::Done | State::Error => true,
            State::SkippingWhitespaceBefore
            | State::EncounteredLetter
            | State::SkippingWhitespaceAfter => false,
        }
    }
}

/// Creates a predicate which feeds each character it is given into the
/// state machine stored in `current_state`.
///
/// The predicate returns `true` once the machine reaches [`State::Done`] or
/// [`State::Error`]. The final state stays in `current_state`, so the caller
/// can tell the two outcomes apart afterwards.
fn whitespace_skipper(
    current_state: &mut State,
    letter: char,
) -> impl FnMut(char) -> bool + '_ {
    move |c: char| {
        *current_state = current_state.advance(c, letter);
        current_state.is_terminal()
    }
}
384
385fn parse_australian_timestamp(src: &str) -> Option<NaiveDateTime> {
387 let forms = &["%d/%m/%y, %H:%M", "%d/%m/%y, %I:%M %P"];
388
389 for form in forms {
390 if let Ok(timestamp) = NaiveDateTime::parse_from_str(src, form) {
391 return Some(timestamp);
392 }
393 }
394
395 None
396}
397
/// Can `c` appear inside a name or path?
///
/// Whitespace, alphanumeric characters, and the punctuation `-`, `_`, `.`
/// and `+` are accepted; everything else is rejected.
fn is_valid_name_or_path_character(c: char) -> bool {
    match c {
        '-' | '_' | '.' | '+' => true,
        other => other.is_whitespace() || other.is_alphanumeric(),
    }
}
408
/// An error recording which production the parser expected and the index at
/// which it was expected.
///
/// Implements [`std::error::Error`], so it can be boxed as a `dyn Error` and
/// propagated with `?` alongside other error types.
#[derive(Debug, Clone, PartialEq)]
pub struct ParseError {
    production_name: Cow<'static, str>,
    location: usize,
}

impl ParseError {
    /// The name of the production that was expected.
    ///
    /// Namespaced errors use dots to separate the parts, e.g.
    /// `metadata.timestamp`.
    pub fn production_name(&self) -> &str { &self.production_name }

    /// The index recorded when this error was created.
    pub fn index(&self) -> usize { self.location }

    /// Creates an error saying `production_name` was expected at `location`.
    fn new<S: Into<Cow<'static, str>>>(
        production_name: S,
        location: usize,
    ) -> Self {
        ParseError {
            production_name: production_name.into(),
            location,
        }
    }

    /// Returns a copy of this error with `new_name` prefixed to the
    /// production name as `"{new_name}.{production_name}"`.
    fn namespaced<S: AsRef<str>>(&self, new_name: S) -> Self {
        ParseError::new(
            format!("{}.{}", new_name.as_ref(), self.production_name),
            self.location,
        )
    }
}

impl Display for ParseError {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "expected {} at index {}",
            self.production_name, self.location
        )
    }
}

// No underlying source error exists, so the trait's default methods suffice.
impl std::error::Error for ParseError {}
450
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;

    /// Builds the [`Body`] for a direct message with the given content.
    fn direct_message<S: Into<String>>(content: S, span: Span) -> Body {
        let content = content.into();
        Body::DirectMessage(DirectMessage { content, span })
    }

    /// Builds the [`Body`] for an attachment with the given name.
    fn attachment<S: Into<String>>(name: S, span: Span) -> Body {
        let name = name.into();
        Body::Attachment(Attachment { name, span })
    }

    #[test]
    fn parse_several_common_timestamp_formats() {
        let october_31st = NaiveDate::from_ymd(2019, 10, 31);
        let february_22nd = NaiveDate::from_ymd(2020, 2, 22);

        let cases = vec![
            ("31/10/19, 16:26", october_31st.and_hms(16, 26, 0)),
            ("31/10/19, 16:16", october_31st.and_hms(16, 16, 0)),
            ("22/2/20, 3:58 pm", february_22nd.and_hms(15, 58, 0)),
            ("22/2/20, 3:37 pm", february_22nd.and_hms(15, 37, 0)),
        ];

        for (src, expected) in cases {
            assert_eq!(parse_australian_timestamp(src), Some(expected));
        }
    }

    #[test]
    fn cursor_split_at() {
        let start = Cursor::new("Hello World. asdf");

        let (head, tail) = start.split_at(|c| c == '.').unwrap();

        assert_eq!(head, "Hello World");
        let expected_tail = Cursor {
            rest: ". asdf",
            index: head.len(),
        };
        assert_eq!(tail, expected_tail);
    }

    #[test]
    fn known_messages() {
        let october_31st = NaiveDate::from_ymd(2019, 10, 31);
        let sender = || String::from("Michael-F-Bryan");

        let text_message = (
            "31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is",
            Message {
                meta: Metadata {
                    timestamp: october_31st.and_hms(16, 16, 0),
                    sender: sender(),
                    span: Span::new(0, 33),
                },
                body: direct_message(
                    "I figured out what the problem is",
                    Span::new(35, 68),
                ),
                span: Span::new(0, 68),
            },
        );
        let file_message = (
            "31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)",
            Message {
                meta: Metadata {
                    timestamp: october_31st.and_hms(14, 13, 0),
                    sender: sender(),
                    span: Span::new(0, 33),
                },
                body: attachment("IMG-20191031-WA0005.jpg", Span::new(35, 58)),
                span: Span::new(0, 74),
            },
        );

        for (src, expected) in vec![text_message, file_message] {
            let (message, rest) = parse_message(Cursor::new(src)).unwrap();

            assert_eq!(message, expected);
            // the whole input should have been consumed
            let at_end = Cursor {
                rest: "",
                index: src.len(),
            };
            assert_eq!(rest, at_end);
        }
    }

    #[test]
    fn multiline_direct_message() {
        let src = "31/10/19, 14:13 - Michael-F-Bryan: this is a\nreally\nlong\nmessage";
        let expected_body = direct_message(
            "this is a\nreally\nlong\nmessage",
            Span::new(35, src.len()),
        );

        let parsed = parse(src);

        assert!(parsed.errors.is_empty());
        assert_eq!(parsed.messages.len(), 1);
        assert_eq!(parsed.messages[0].body, expected_body);
    }

    #[test]
    fn skip_over_unparseable_lines() {
        let src = r#"
31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is
31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)
this is some garbage content!

$and more garbage (note: the previous line was skipped because it was empty, not message or garbage)
"#;

        let parsed = parse(src);

        println!("{:#?}", parsed);
        assert_eq!(parsed.messages.len(), 2);
        assert_eq!(parsed.errors.len(), 2);
    }

    #[test]
    fn skip_cursor_to_next_newline() {
        let next_line = Cursor::new("some text\n\nasdf").skip_to_next_line();

        let expected = Cursor {
            rest: "asdf",
            index: 11,
        };
        assert_eq!(next_line, expected);
    }

    #[test]
    fn skip_to_next_line_with_no_more_newlines() {
        let src = "some text";

        let next_line = Cursor::new(src).skip_to_next_line();

        let expected = Cursor {
            rest: "",
            index: src.len(),
        };
        assert_eq!(next_line, expected);
    }

    #[test]
    fn skip_to_next_line_with_leading_newlines() {
        let next_line = Cursor::new("\nsome text").skip_to_next_line();

        let expected = Cursor {
            rest: "some text",
            index: 1,
        };
        assert_eq!(next_line, expected);
    }

    #[test]
    fn rest_of_line_at_eof() {
        let src = "some text";
        let start = Cursor::new(src);

        let (line, after_line) = start.rest_of_line();

        assert_eq!(line, src);
        assert_eq!(after_line, start.eof());
    }

    #[test]
    fn some_known_senders() {
        let senders = vec![
            "Michael",
            "Michael-F-Bryan",
            "Michael Bryan",
            "+60 12-345 6789",
        ];

        for src in senders {
            let (sender, rest) = parse_sender(Cursor::new(src)).unwrap();

            assert_eq!(sender, src);
            let at_end = Cursor {
                rest: "",
                index: src.len(),
            };
            assert_eq!(rest, at_end);
        }
    }

    #[test]
    fn split_at_when_all_characters_succeed() {
        let src = "Michael";
        let start = Cursor::new(src);

        let split = start.split_at(|c| !is_valid_name_or_path_character(c));
        let (text, rest) = split.unwrap();

        assert_eq!(text, src);
        assert_eq!(rest, start.eof());
    }
}