use crate::{Attachment, Body, DirectMessage, Message, Metadata, Span};
use chrono::NaiveDateTime;
use std::{
borrow::Cow,
fmt::{self, Display, Formatter},
};
/// Parses a whole chat export.
///
/// Every message that can be understood is returned in [`Parsed::messages`];
/// each failed attempt to parse a message is recorded in [`Parsed::errors`]
/// instead of aborting the parse.
pub fn parse(src: &str) -> Parsed {
    let mut errors = Vec::new();
    let messages = parse_file(Cursor::new(src), |err| errors.push(err));

    Parsed { messages, errors }
}
/// The result of calling [`parse`]: the messages that were recognised plus
/// the errors reported along the way.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Parsed {
/// Messages that parsed successfully, in the order they were encountered.
pub messages: Vec<Message>,
/// One entry for every failed attempt to parse a message.
pub errors: Vec<ParseError>,
}
/// A cheap, copyable view over the part of the input that is still unparsed.
///
/// `rest` is the text that remains and `index` is the number of bytes that
/// have been consumed so far, which lets callers build spans relative to the
/// text the cursor was created from.
#[derive(Debug, Copy, Clone, PartialEq)]
struct Cursor<'src> {
    rest: &'src str,
    index: usize,
}

impl<'src> Cursor<'src> {
    /// Creates a cursor positioned at the start of `src`.
    const fn new(src: &'src str) -> Self {
        Cursor { rest: src, index: 0 }
    }

    /// Returns `true` when no text is left to consume.
    fn is_empty(&self) -> bool {
        self.rest.is_empty()
    }

    /// Number of bytes that are left to consume.
    fn len(&self) -> usize {
        self.rest.len()
    }

    /// Consumes leading characters until `predicate` returns `true`.
    ///
    /// The predicate is called once per character, in order, and one final
    /// time for the character that stops the scan (which is *not* consumed).
    /// Returns the consumed text together with a cursor placed just after it,
    /// or `None` when not a single character was consumed.
    fn split_at<P>(self, mut predicate: P) -> Option<(&'src str, Self)>
    where
        P: FnMut(char) -> bool,
    {
        let consumed: usize = self
            .rest
            .chars()
            .take_while(|&c| !predicate(c))
            .map(char::len_utf8)
            .sum();

        if consumed == 0 {
            None
        } else {
            Some(self.split(consumed))
        }
    }

    /// Splits off the first `index` bytes, returning them and a cursor
    /// positioned immediately after.
    fn split(self, index: usize) -> (&'src str, Self) {
        let head = &self.rest[..index];
        (head, self.advance(index))
    }

    /// Moves past the remainder of the current line and past every `\n` or
    /// `\r` character that directly follows it.
    fn skip_to_next_line(self) -> Self {
        let (_, line_end) = self.rest_of_line();

        match line_end.split_at(|c| !(c == '\n' || c == '\r')) {
            Some((_, next_line)) => next_line,
            // No line-break characters were consumed here.
            None => line_end.eof(),
        }
    }

    /// Returns the text up to (but excluding) the next `\n`, along with a
    /// cursor sitting on that `\n`, or at the end of input if there is none.
    fn rest_of_line(self) -> (&'src str, Cursor<'src>) {
        if self.rest.starts_with('\n') {
            // Already at the end of a line: nothing to hand back.
            return ("", self);
        }

        match self.split_at(|c| c == '\n') {
            Some(line_and_cursor) => line_and_cursor,
            None => (self.rest, self.eof()),
        }
    }

    /// Moves the cursor forward by `amount` bytes.
    fn advance(self, amount: usize) -> Self {
        let rest = &self.rest[amount..];
        let index = self.index + amount;
        Cursor { rest, index }
    }

    /// A cursor positioned at the very end of the remaining text.
    fn eof(self) -> Self {
        self.advance(self.rest.len())
    }
}
/// Parses messages one after another until the input is exhausted.
///
/// Failures are handed to `on_error` and parsing resumes on the following
/// line. After every attempt, successful or not, the cursor is moved to the
/// start of the next line.
fn parse_file<E>(mut cursor: Cursor<'_>, mut on_error: E) -> Vec<Message>
where
    E: FnMut(ParseError),
{
    let mut messages = Vec::new();

    while !cursor.is_empty() {
        let resume_from = match parse_message(cursor) {
            Ok((msg, after_message)) => {
                messages.push(msg);
                after_message
            },
            Err(diag) => {
                on_error(diag);
                // Nothing was consumed; retry from the next line.
                cursor
            },
        };

        cursor = resume_from.skip_to_next_line();
    }

    messages
}
/// Parses a single message of the form `<metadata>: <body>`.
///
/// Errors raised while parsing the metadata are prefixed with `metadata.` so
/// the caller can tell which part of the message was malformed.
fn parse_message(
    cursor: Cursor<'_>,
) -> Result<(Message, Cursor<'_>), ParseError> {
    let message_start = cursor.index;

    let (meta, after_meta) = match parse_metadata(cursor) {
        Ok(parsed) => parsed,
        Err(err) => return Err(err.namespaced("metadata")),
    };
    let body_start = skip_character_surrounded_by_space(after_meta, ':')?;
    let (body, after_body) = parse_body(body_start);

    let span = Span::new(message_start, after_body.index);
    Ok((Message { meta, body, span }, after_body))
}
/// Parses the `<timestamp> - <sender>` prefix that introduces a message.
///
/// The returned span runs from the start of the timestamp to the position
/// of the cursor after the sender has been consumed.
fn parse_metadata(
    cursor: Cursor<'_>,
) -> Result<(Metadata, Cursor<'_>), ParseError> {
    let begin = cursor.index;

    let (timestamp, after_timestamp) = parse_timestamp(cursor)?;
    let after_dash = skip_character_surrounded_by_space(after_timestamp, '-')?;
    let (sender, after_sender) = parse_sender(after_dash)?;

    let span = Span::new(begin, after_sender.index);
    let meta = Metadata {
        timestamp,
        sender: sender.to_owned(),
        span,
    };

    Ok((meta, after_sender))
}
/// Parses the body of a message: an attachment when the line looks like one,
/// otherwise a (possibly multi-line) direct message.
fn parse_body(cursor: Cursor<'_>) -> (Body, Cursor<'_>) {
    match parse_attachment(cursor) {
        Some((attachment, rest)) => (Body::from(attachment), rest),
        None => {
            let (dm, rest) = parse_direct_message(cursor);
            (Body::from(dm), rest)
        },
    }
}
/// Tries to parse an attachment line such as `photo.jpg (file attached)`.
///
/// Returns `None` when the current line does not contain the
/// ` (file attached)` marker or when no name can be read at the cursor. On
/// success the returned cursor sits at the end of the current line and the
/// attachment's span covers only the name.
fn parse_attachment(cursor: Cursor<'_>) -> Option<(Attachment, Cursor<'_>)> {
    let (line, end_of_line) = cursor.rest_of_line();
    if !line.contains(" (file attached)") {
        return None;
    }

    let (name, _) = parse_attachment_name(cursor)?;
    let name_start = cursor.index;
    let span = Span::new(name_start, name_start + name.len());

    let attachment = Attachment {
        name: name.to_owned(),
        span,
    };
    Some((attachment, end_of_line))
}
/// Parses a direct (text) message that starts at `cursor` and may continue
/// over several lines.
///
/// Leading whitespace is excluded from both the returned `content` and its
/// `span`, so the span always describes exactly the text stored in
/// `content`. The returned cursor is positioned just after the message text.
fn parse_direct_message(cursor: Cursor<'_>) -> (DirectMessage, Cursor<'_>) {
    let start = cursor.index;
    let (text, cursor) = to_end_of_direct_message(cursor);

    // Drop leading whitespace and shift the start of the span by the same
    // number of bytes so content and span stay in agreement.
    let content = text.trim_start();
    let bytes_skipped = text.len() - content.len();
    let span = Span::new(start + bytes_skipped, cursor.index);

    let msg = DirectMessage {
        content: String::from(content),
        span,
    };
    (msg, cursor)
}
/// Finds where the direct message starting at `cursor` ends.
///
/// Lines are scanned until one begins with valid message metadata (the start
/// of the next message) or the input runs out. Trailing whitespace before
/// that point is not part of the message. Returns the message text and a
/// cursor positioned directly after it.
fn to_end_of_direct_message(cursor: Cursor<'_>) -> (&'_ str, Cursor<'_>) {
    let mut lookahead = cursor.skip_to_next_line();
    loop {
        if lookahead.is_empty() || parse_metadata(lookahead).is_ok() {
            break;
        }
        lookahead = lookahead.skip_to_next_line();
    }

    let bytes_to_next_message = lookahead.index - cursor.index;
    let up_to_next_message = &cursor.rest[..bytes_to_next_message];
    let message_len = up_to_next_message.trim_end().len();

    cursor.split(message_len)
}
/// Reads an attachment's file name at `cursor`, or `None` if there isn't one.
fn parse_attachment_name(cursor: Cursor<'_>) -> Option<(&'_ str, Cursor<'_>)> {
    let parsed = parse_name_or_path(cursor);
    // Callers only need to know whether a name was found; drop the error.
    parsed.ok()
}
/// Parses the timestamp at the start of a message.
///
/// Everything up to the first `-` is treated as the candidate timestamp; it
/// is trimmed before being handed to [`parse_australian_timestamp`]. On
/// success the returned cursor sits on that `-`.
///
/// # Errors
///
/// Returns a `"timestamp"` error pointing at the original cursor position
/// when there is no candidate text or it is not a recognised timestamp.
fn parse_timestamp(
    cursor: Cursor<'_>,
) -> Result<(NaiveDateTime, Cursor<'_>), ParseError> {
    let position = cursor.index;
    let failure = || ParseError::new("timestamp", position);

    let (candidate, _) = cursor.split_at(|c| c == '-').ok_or_else(failure)?;
    let timestamp =
        parse_australian_timestamp(candidate.trim()).ok_or_else(failure)?;

    Ok((timestamp, cursor.advance(candidate.len())))
}
/// Parses the sender's name; senders follow the same rules as names and
/// paths.
fn parse_sender(
    cursor: Cursor<'_>,
) -> Result<(&'_ str, Cursor<'_>), ParseError> {
    let sender_and_cursor = parse_name_or_path(cursor)?;
    Ok(sender_and_cursor)
}
/// Consumes the longest run of characters accepted by
/// [`is_valid_name_or_path_character`].
///
/// Trailing whitespace is stripped from the returned text, but the returned
/// cursor is positioned after everything that was consumed (including that
/// whitespace).
///
/// # Errors
///
/// Returns a `"name or path"` error when the very first character is not
/// acceptable or the input is empty.
fn parse_name_or_path(
    cursor: Cursor<'_>,
) -> Result<(&'_ str, Cursor<'_>), ParseError> {
    cursor
        .split_at(|c| !is_valid_name_or_path_character(c))
        .map(|(raw, rest)| (raw.trim_end(), rest))
        .ok_or_else(|| ParseError::new("name or path", cursor.index))
}
/// Skips over `letter` together with the whitespace around it.
///
/// Accepts optional whitespace, then `letter`, then at least one whitespace
/// character. The returned cursor sits on the first non-whitespace character
/// that follows.
///
/// # Errors
///
/// Fails when an unexpected character is met, or when the input ends before
/// a non-whitespace character is found after the trailing whitespace. The
/// error points at the position where skipping started.
fn skip_character_surrounded_by_space(
    cursor: Cursor<'_>,
    letter: char,
) -> Result<Cursor<'_>, ParseError> {
    let mut state = State::SkippingWhitespaceBefore;
    let consumed = cursor.split_at(whitespace_skipper(&mut state, letter));

    if state == State::Done {
        if let Some((_, rest)) = consumed {
            return Ok(rest);
        }
    }

    Err(ParseError::new(
        format!("skip a '{}' surrounded by whitespace", letter),
        cursor.index,
    ))
}
/// States of the small state machine used to skip a single character that
/// may be preceded by whitespace and must be followed by whitespace.
#[derive(Debug, Copy, Clone, PartialEq)]
enum State {
/// Initial state: consuming whitespace that comes before the character.
SkippingWhitespaceBefore,
/// The character was just seen; the next character must be whitespace.
EncounteredLetter,
/// Consuming the whitespace that follows the character.
SkippingWhitespaceAfter,
/// A non-whitespace character was reached after the trailing whitespace.
Done,
/// An unexpected character was encountered.
Error,
}
fn whitespace_skipper(
current_state: &mut State,
letter: char,
) -> impl FnMut(char) -> bool + '_ {
fn next_state(current: State, c: char, letter: char) -> State {
match current {
State::SkippingWhitespaceBefore => {
if c.is_whitespace() {
State::SkippingWhitespaceBefore
} else if c == letter {
State::EncounteredLetter
} else {
State::Error
}
},
State::EncounteredLetter => {
if c.is_whitespace() {
State::SkippingWhitespaceAfter
} else {
State::Error
}
},
State::SkippingWhitespaceAfter => {
if c.is_whitespace() {
State::SkippingWhitespaceAfter
} else {
State::Done
}
},
State::Done | State::Error => current,
}
}
move |c: char| {
*current_state = next_state(*current_state, c, letter);
*current_state == State::Done || *current_state == State::Error
}
}
/// Parses a day-first timestamp such as `31/10/19, 16:26` or
/// `22/2/20, 3:58 pm`.
///
/// The 24-hour format is tried first, then the 12-hour format with an am/pm
/// suffix. Returns `None` if neither matches.
fn parse_australian_timestamp(src: &str) -> Option<NaiveDateTime> {
    const FORMATS: [&str; 2] = ["%d/%m/%y, %H:%M", "%d/%m/%y, %I:%M %P"];

    FORMATS
        .iter()
        .find_map(|format| NaiveDateTime::parse_from_str(src, format).ok())
}
/// Returns `true` for characters that may appear in a sender name or an
/// attachment file name: whitespace, alphanumerics, and `-`, `_`, `.`, `+`.
fn is_valid_name_or_path_character(c: char) -> bool {
    match c {
        '-' | '_' | '.' | '+' => true,
        other => other.is_whitespace() || other.is_alphanumeric(),
    }
}
/// An error produced when the parser could not recognise what it expected.
///
/// It records the name of the grammar production that was expected and the
/// byte index in the input at which the parser was looking for it.
#[derive(Debug, Clone, PartialEq)]
pub struct ParseError {
    production_name: Cow<'static, str>,
    location: usize,
}

impl ParseError {
    /// The name of the production the parser expected to find.
    pub fn production_name(&self) -> &str {
        &self.production_name
    }

    /// The byte index in the input where the error was raised.
    pub fn index(&self) -> usize {
        self.location
    }

    /// Creates an error for `production_name` at byte index `location`.
    fn new<S: Into<Cow<'static, str>>>(
        production_name: S,
        location: usize,
    ) -> Self {
        ParseError {
            production_name: production_name.into(),
            location,
        }
    }

    /// Returns a copy of this error whose production name is prefixed with
    /// `new_name` and a dot, e.g. `timestamp` becomes `metadata.timestamp`.
    fn namespaced<S: AsRef<str>>(&self, new_name: S) -> Self {
        ParseError::new(
            format!("{}.{}", new_name.as_ref(), self.production_name),
            self.location,
        )
    }
}

impl Display for ParseError {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "expected {} at index {}",
            self.production_name, self.location
        )
    }
}

// Lets callers box the error as `Box<dyn Error>` or propagate it with `?`
// into other error types. There is no underlying cause, so the default
// `source()` is sufficient.
impl std::error::Error for ParseError {}
// Unit tests for the cursor primitives and the individual parsing functions.
#[cfg(test)]
mod tests {
use super::*;
use chrono::NaiveDate;
// Helper: builds a `Body::DirectMessage` with the given content and span.
fn direct_message<S: Into<String>>(content: S, span: Span) -> Body {
Body::DirectMessage(DirectMessage {
content: content.into(),
span,
})
}
// Helper: builds a `Body::Attachment` with the given name and span.
fn attachment<S: Into<String>>(name: S, span: Span) -> Body {
Body::Attachment(Attachment {
name: name.into(),
span,
})
}
// Both the 24-hour and the 12-hour (am/pm) timestamp formats are accepted.
#[test]
fn parse_several_common_timestamp_formats() {
let inputs = vec![
(
"31/10/19, 16:26",
NaiveDate::from_ymd(2019, 10, 31).and_hms(16, 26, 0),
),
(
"31/10/19, 16:16",
NaiveDate::from_ymd(2019, 10, 31).and_hms(16, 16, 0),
),
(
"22/2/20, 3:58 pm",
NaiveDate::from_ymd(2020, 2, 22).and_hms(15, 58, 0),
),
(
"22/2/20, 3:37 pm",
NaiveDate::from_ymd(2020, 2, 22).and_hms(15, 37, 0),
),
];
for (src, should_be) in inputs {
let got = parse_australian_timestamp(src).unwrap();
assert_eq!(got, should_be);
}
}
// `split_at` returns the text before the matching character and leaves the
// cursor sitting on that character.
#[test]
fn cursor_split_at() {
let src = "Hello World. asdf";
let cursor = Cursor::new(src);
let (got, cursor) = cursor.split_at(|c| c == '.').unwrap();
assert_eq!(got, "Hello World");
assert_eq!(
cursor,
Cursor {
rest: ". asdf",
index: got.len(),
}
);
}
// End-to-end check of `parse_message` for one direct message and one
// attachment, including the exact spans and full consumption of the input.
#[test]
fn known_messages() {
let inputs = vec![
(
"31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is",
Message {
meta: Metadata {
timestamp: NaiveDate::from_ymd(2019, 10, 31).and_hms(16, 16, 0),
sender: String::from("Michael-F-Bryan"),
span: Span::new(0, 33),
},
body: direct_message("I figured out what the problem is", Span::new(35, 68)),
span: Span::new(0, 68),
},
),
(
"31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)",
Message {
meta: Metadata {
timestamp: NaiveDate::from_ymd(2019, 10, 31).and_hms(14, 13, 0),
sender: String::from("Michael-F-Bryan"),
span: Span::new(0, 33),
},
body: attachment("IMG-20191031-WA0005.jpg", Span::new(35, 58)),
span: Span::new(0, 74),
}
),
];
for (src, should_be) in inputs {
let cursor = Cursor::new(src);
let (got, cursor) = parse_message(cursor).unwrap();
assert_eq!(got, should_be);
assert_eq!(
cursor,
Cursor {
rest: "",
index: src.len(),
}
);
}
}
// A direct message keeps going across line breaks until the input ends.
#[test]
fn multiline_direct_message() {
let src = "31/10/19, 14:13 - Michael-F-Bryan: this is a\nreally\nlong\nmessage";
let body_should_be = direct_message(
"this is a\nreally\nlong\nmessage",
Span::new(35, src.len()),
);
let got = parse(src);
assert!(got.errors.is_empty());
assert_eq!(got.messages.len(), 1);
assert_eq!(got.messages[0].body, body_should_be);
}
// Two valid messages followed by two garbage lines must produce exactly two
// messages and two errors.
#[test]
fn skip_over_unparseable_lines() {
let src = r#"
31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is
31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)
this is some garbage content!
$and more garbage (note: the previous line was skipped because it was empty, not message or garbage)
"#;
let got = parse(src);
println!("{:#?}", got);
assert_eq!(got.messages.len(), 2);
assert_eq!(got.errors.len(), 2);
}
// Consecutive newlines are skipped as a group.
#[test]
fn skip_cursor_to_next_newline() {
let src = "some text\n\nasdf";
let cursor = Cursor::new(src);
let got = cursor.skip_to_next_line();
assert_eq!(
got,
Cursor {
rest: "asdf",
index: 11,
}
);
}
// Without any newline, skipping to the next line lands at end of input.
#[test]
fn skip_to_next_line_with_no_more_newlines() {
let src = "some text";
let cursor = Cursor::new(src);
let got = cursor.skip_to_next_line();
assert_eq!(
got,
Cursor {
rest: "",
index: src.len()
}
);
}
// A cursor already sitting on a newline only skips that newline.
#[test]
fn skip_to_next_line_with_leading_newlines() {
let src = "\nsome text";
let cursor = Cursor::new(src);
let got = cursor.skip_to_next_line();
assert_eq!(
got,
Cursor {
rest: "some text",
index: 1,
}
);
}
// When there is no newline, `rest_of_line` returns everything that is left.
#[test]
fn rest_of_line_at_eof() {
let src = "some text";
let cursor = Cursor::new(src);
let (line, got) = cursor.rest_of_line();
assert_eq!(line, src);
assert_eq!(got, cursor.eof());
}
// Sender names may contain spaces, dashes, digits and a leading `+`.
#[test]
fn some_known_senders() {
let inputs = vec![
"Michael",
"Michael-F-Bryan",
"Michael Bryan",
"+60 12-345 6789",
];
for src in inputs {
let cursor = Cursor::new(src);
let (got_sender, got_cursor) = parse_sender(cursor).unwrap();
assert_eq!(got_sender, src);
assert_eq!(
got_cursor,
Cursor {
rest: "",
index: src.len(),
}
);
}
}
// If the predicate never fires, `split_at` consumes the whole input.
#[test]
fn split_at_when_all_characters_succeed() {
let src = "Michael";
let cursor = Cursor::new(src);
let (got_text, got_cursor) = cursor
.split_at(|c| !is_valid_name_or_path_character(c))
.unwrap();
assert_eq!(got_text, src);
assert_eq!(got_cursor, cursor.eof());
}
}