use crate::{Attachment, Body, DirectMessage, Message, Metadata, Span};
use chrono::NaiveDateTime;
use std::{
    borrow::Cow,
    fmt::{self, Display, Formatter},
};

/// Parse the text of an exported WhatsApp chat.
///
/// Parsing is best-effort: every message that could be understood ends up in
/// [`Parsed::messages`], and every failure is recorded in [`Parsed::errors`]
/// instead of aborting the whole parse.
pub fn parse(src: &str) -> Parsed {
    let mut errors = Vec::new();
    let messages = parse_file(Cursor::new(src), |err| errors.push(err));

    Parsed { messages, errors }
}

/// The outcome of parsing an exported WhatsApp chat.
///
/// Both fields are always populated: a failure to parse one part of the
/// input is recorded in [`Parsed::errors`] and does not prevent other
/// messages from being returned in [`Parsed::messages`].
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub struct Parsed {
    /// The messages that were found.
    pub messages: Vec<Message>,
    /// Any parse errors that may have occurred.
    pub errors: Vec<ParseError>,
}

/// A cheap, copyable view of the input that has not been consumed yet.
///
/// Parsing functions never mutate a [`Cursor`]; they take one by value and
/// hand back a new one describing whatever is left.
#[derive(Debug, Copy, Clone, PartialEq)]
struct Cursor<'src> {
    /// The text that still needs to be parsed.
    rest: &'src str,
    /// Byte offset of `rest` from the start of the original input.
    index: usize,
}

impl<'src> Cursor<'src> {
    /// Creates a [`Cursor`] positioned at the very start of `src`.
    const fn new(src: &'src str) -> Self { Cursor { rest: src, index: 0 } }

    /// Has all of the input been consumed?
    fn is_empty(&self) -> bool { self.rest.is_empty() }

    /// The number of bytes that are still unconsumed.
    fn len(&self) -> usize { self.rest.len() }

    /// Splits off the longest non-empty prefix for which `predicate` keeps
    /// returning `false`.
    ///
    /// The predicate is invoked once per character, in order, and is never
    /// called again after it first returns `true` (stateful predicates rely
    /// on this). The matching character itself is *not* consumed.
    ///
    /// Returns `None` when nothing would be consumed, i.e. the input is
    /// empty or the very first character satisfies the predicate.
    fn split_at<P>(self, mut predicate: P) -> Option<(&'src str, Self)>
    where
        P: FnMut(char) -> bool,
    {
        let split_point = self
            .rest
            .char_indices()
            .find(|&(_, c)| predicate(c))
            .map_or(self.rest.len(), |(offset, _)| offset);

        if split_point == 0 {
            return None;
        }

        Some(self.split(split_point))
    }

    /// Consumes the first `index` bytes, returning them together with a
    /// [`Cursor`] over the remainder.
    ///
    /// Panics if `index` is out of bounds or not on a character boundary.
    fn split(self, index: usize) -> (&'src str, Self) {
        let (consumed, _) = self.rest.split_at(index);
        (consumed, self.advance(index))
    }

    /// Moves to the first character after the current line's terminator,
    /// swallowing any run of `'\n'`/`'\r'` characters (and therefore any
    /// blank lines) along the way.
    fn skip_to_next_line(self) -> Self {
        let (_, at_line_end) = self.rest_of_line();

        // If no line terminators can be consumed we are at the end of input.
        at_line_end
            .split_at(|c| !(c == '\n' || c == '\r'))
            .map(|(_, next_line)| next_line)
            .unwrap_or_else(|| at_line_end.eof())
    }

    /// Returns everything up to (but excluding) the next `'\n'`, plus a
    /// [`Cursor`] sitting on that `'\n'` (or at the end of input when there
    /// are no more newlines).
    fn rest_of_line(self) -> (&'src str, Cursor<'src>) {
        match self.rest.find('\n') {
            // there is nothing left on this line
            Some(0) => ("", self),
            Some(newline) => self.split(newline),
            None => (self.rest, self.eof()),
        }
    }

    /// Skips `amount` bytes of input.
    ///
    /// Panics if `amount` is out of bounds or not on a character boundary.
    fn advance(self, amount: usize) -> Self {
        let rest = &self.rest[amount..];
        let index = self.index + amount;

        Cursor { rest, index }
    }

    /// A [`Cursor`] with nothing left to read, positioned at the end of the
    /// input.
    fn eof(self) -> Self { self.advance(self.rest.len()) }
}

/// Parse every message in a file, reporting failures through `on_error`.
///
/// This is a naive `LL(k)` parser with arbitrary length lookahead, built
/// around the following grammar:
///
/// ```bnf
/// file         := message*
/// message      := metadata ":" body
/// metadata     := timestamp "-" sender
/// body         := attachment | direct_message
/// attachment   := NAME_OR_PATH "(file attached)"
/// timestamp    := DATETIME
/// sender       := NAME_OR_PATH
///
/// NAME_OR_PATH := /[\w\s.+-]+/
/// (* a timestamp formatted with the user's locale *)
/// DATETIME     := "%d/%m/%y, %H:%M" | "%d/%m/%y, %I:%M %P"
/// ```
///
/// Every grammar rule has a matching `parse_*` function (e.g. `message` is
/// handled by [`parse_message()`]).
///
/// None of those functions mutate shared state. Each one receives a
/// [`Cursor`] describing where it should start reading and returns the item
/// it parsed together with a new [`Cursor`] over the unparsed remainder.
///
/// When a message fails to parse, the error is passed to `on_error` and
/// parsing resumes at the start of the following line.
fn parse_file<E>(mut cursor: Cursor<'_>, mut on_error: E) -> Vec<Message>
where
    E: FnMut(ParseError),
{
    let mut messages = Vec::new();

    while !cursor.is_empty() {
        // On failure we resume from where this attempt started, so only the
        // offending line is thrown away.
        let resume_from = match parse_message(cursor) {
            Ok((msg, after_message)) => {
                messages.push(msg);
                after_message
            },
            Err(diag) => {
                on_error(diag);
                cursor
            },
        };

        // the next attempt must begin at the start of a line
        cursor = resume_from.skip_to_next_line();
    }

    messages
}

/// Parse a single `message := metadata ":" body`.
///
/// Errors raised while parsing the metadata are prefixed with `metadata.` so
/// the caller can tell which part of the message was malformed.
fn parse_message(
    cursor: Cursor<'_>,
) -> Result<(Message, Cursor<'_>), ParseError> {
    let message_start = cursor.index;

    let (meta, after_meta) = match parse_metadata(cursor) {
        Ok(parsed) => parsed,
        Err(diag) => return Err(diag.namespaced("metadata")),
    };

    let body_start = skip_character_surrounded_by_space(after_meta, ':')?;
    let (body, remaining) = parse_body(body_start);

    let message = Message {
        meta,
        body,
        span: Span::new(message_start, remaining.index),
    };

    Ok((message, remaining))
}

/// Parse a message header, `metadata := timestamp "-" sender`.
///
/// The returned [`Cursor`] sits on the first character that cannot be part
/// of the sender's name.
fn parse_metadata(
    cursor: Cursor<'_>,
) -> Result<(Metadata, Cursor<'_>), ParseError> {
    let header_start = cursor.index;

    let (timestamp, at_dash) = parse_timestamp(cursor)?;
    let at_sender = skip_character_surrounded_by_space(at_dash, '-')?;
    let (sender, remaining) = parse_sender(at_sender)?;

    let meta = Metadata {
        timestamp,
        sender: sender.to_owned(),
        span: Span::new(header_start, remaining.index),
    };

    Ok((meta, remaining))
}

/// Parse a message body, `body := attachment | direct_message`.
///
/// Attachments are tried first; anything that isn't an attachment is treated
/// as a direct message, so this rule cannot fail.
fn parse_body(cursor: Cursor<'_>) -> (Body, Cursor<'_>) {
    match parse_attachment(cursor) {
        Some((attachment, remaining)) => (Body::from(attachment), remaining),
        None => {
            let (dm, remaining) = parse_direct_message(cursor);
            (Body::from(dm), remaining)
        },
    }
}

/// Try to parse `attachment := NAME_OR_PATH "(file attached)"`.
///
/// Returns `None` when the current line isn't an attachment, i.e. when
///
/// - the line doesn't contain the `" (file attached)"` marker at all,
/// - the line doesn't start with a valid name, or
/// - the marker doesn't come directly after the name (for example an
///   ordinary message that merely mentions `"(file attached)"` somewhere
///   later on the line).
///
/// On success the returned [`Cursor`] is positioned at the end of the line
/// and the attachment's span covers just the file name.
fn parse_attachment(cursor: Cursor<'_>) -> Option<(Attachment, Cursor<'_>)> {
    const MARKER: &str = "(file attached)";

    let (rest_of_line, end_of_line) = cursor.rest_of_line();

    if !rest_of_line.contains(" (file attached)") {
        // couldn't find the magic string for attachments
        return None;
    }

    let start = cursor.index;
    let (name, after_name) = parse_attachment_name(cursor)?;

    // The name stops at the first character that isn't valid in a name. For
    // a genuine attachment that character is the marker's opening bracket;
    // anything else means the marker belongs to ordinary message text.
    //
    // Because the marker is known to be on this line, the name can never
    // have run past the end of the line.
    if !after_name.rest.starts_with(MARKER) {
        return None;
    }

    let end = start + name.len();

    let attachment = Attachment {
        name: String::from(name),
        span: Span::new(start, end),
    };

    Some((attachment, end_of_line))
}

/// Parse a `direct_message`, the free-form text of a normal chat message.
///
/// Unlike every other rule, a direct message may take up multiple lines. We
/// work around this by reading content until we find a line that parses as
/// metadata (i.e. the start of the next message), and accepting everything
/// before that line as the message body.
///
/// Because direct messages are arbitrary text, it is impossible for this
/// rule to fail.
///
/// Leading whitespace is excluded from both the content and the span, so the
/// span always covers exactly the bytes stored in the content.
fn parse_direct_message(cursor: Cursor<'_>) -> (DirectMessage, Cursor<'_>) {
    let start = cursor.index;

    let (text, cursor) = to_end_of_direct_message(cursor);

    // to provide better spans, we'll also skip past leading whitespace
    let content = text.trim_start();
    let bytes_skipped = text.len() - content.len();
    let span = Span::new(start + bytes_skipped, cursor.index);

    let msg = DirectMessage {
        content: String::from(content),
        span,
    };

    (msg, cursor)
}

/// Splits off the text of a (possibly multi-line) direct message.
///
/// The message runs until the next line that parses as metadata, or until
/// the end of input. Trailing whitespace, including the newline(s) before
/// the next message, is left unconsumed.
fn to_end_of_direct_message(cursor: Cursor<'_>) -> (&'_ str, Cursor<'_>) {
    // everything else on this line is part of the message
    let mut next_message = cursor.skip_to_next_line();

    // keep going, one line at a time, until something looks like the start
    // of the next message
    loop {
        if next_message.is_empty() || parse_metadata(next_message).is_ok() {
            break;
        }

        next_message = next_message.skip_to_next_line();
    }

    // The scan above stops at the *start* of the next message, so it went
    // past the line terminators that end this one. Drop them (and any other
    // trailing whitespace) so only the message body is consumed.
    let bytes_scanned = next_message.index - cursor.index;
    let body_len = cursor.rest[..bytes_scanned].trim_end().len();

    cursor.split(body_len)
}

/// Parse the file name of an attachment.
///
/// Thin wrapper around [`parse_name_or_path()`] which discards the error,
/// yielding `None` when the input doesn't start with a valid name.
fn parse_attachment_name(cursor: Cursor<'_>) -> Option<(&'_ str, Cursor<'_>)> {
    parse_name_or_path(cursor).ok()
}

/// Parse the `timestamp` at the start of a message.
///
/// Everything from the cursor up to the first `'-'` is treated as the
/// timestamp candidate; surrounding whitespace is ignored when interpreting
/// it. On success the returned [`Cursor`] sits on that `'-'`.
///
/// Fails when nothing precedes the first `'-'` (or the input is empty), or
/// when the candidate isn't in one of the supported timestamp formats.
fn parse_timestamp(
    cursor: Cursor<'_>,
) -> Result<(NaiveDateTime, Cursor<'_>), ParseError> {
    let (candidate, at_dash) = cursor
        .split_at(|c| c == '-')
        .ok_or_else(|| ParseError::new("timestamp", cursor.index))?;

    let timestamp = parse_australian_timestamp(candidate.trim())
        .ok_or_else(|| ParseError::new("timestamp", cursor.index))?;

    Ok((timestamp, at_dash))
}

/// Parse the `sender` of a message.
///
/// Thin wrapper around [`parse_name_or_path()`]; a sender follows exactly
/// the same rules as any other name.
fn parse_sender(
    cursor: Cursor<'_>,
) -> Result<(&'_ str, Cursor<'_>), ParseError> {
    parse_name_or_path(cursor)
}

/// Parse a `NAME_OR_PATH`, the longest run of characters accepted by
/// [`is_valid_name_or_path_character()`].
///
/// Trailing whitespace is trimmed from the returned name, but the returned
/// [`Cursor`] is positioned after it, on the first character that can't be
/// part of a name. Fails when not even one valid character is available.
fn parse_name_or_path(
    cursor: Cursor<'_>,
) -> Result<(&'_ str, Cursor<'_>), ParseError> {
    cursor
        .split_at(|c| !is_valid_name_or_path_character(c))
        .map(|(raw_name, remaining)| (raw_name.trim_end(), remaining))
        .ok_or_else(|| ParseError::new("name or path", cursor.index))
}

/// Skips over `letter` together with the whitespace around it.
///
/// On success the returned [`Cursor`] sits on the first non-whitespace
/// character following `letter`. It is an error if `letter` isn't the first
/// non-whitespace character, if it isn't followed by whitespace, or if the
/// input ends before a non-whitespace character is seen after it.
fn skip_character_surrounded_by_space(
    cursor: Cursor<'_>,
    letter: char,
) -> Result<Cursor<'_>, ParseError> {
    let mut state = State::SkippingWhitespaceBefore;
    let skipped = cursor.split_at(whitespace_skipper(&mut state, letter));

    // the state machine must have run to completion for this to count
    if let (Some((_, remaining)), State::Done) = (skipped, state) {
        return Ok(remaining);
    }

    Err(ParseError::new(
        format!("skip a '{}' surrounded by whitespace", letter),
        cursor.index,
    ))
}

/// The states used by the [`whitespace_skipper()`] state machine.
#[derive(Debug, Copy, Clone, PartialEq)]
enum State {
    /// Initial state: consuming whitespace while waiting for the letter.
    SkippingWhitespaceBefore,
    /// The letter has just been seen; the next character must be whitespace.
    EncounteredLetter,
    /// Consuming the whitespace that follows the letter.
    SkippingWhitespaceAfter,
    /// The first non-whitespace character after the letter has been reached.
    Done,
    /// A character was encountered that isn't allowed in the current state.
    Error,
}

/// Get a predicate for use with [`Cursor::split_at()`] which matches the
/// equivalent of the regex, "\s*" + letter + "\s*"".
fn whitespace_skipper(
    current_state: &mut State,
    letter: char,
) -> impl FnMut(char) -> bool + '_ {
    fn next_state(current: State, c: char, letter: char) -> State {
        match current {
            State::SkippingWhitespaceBefore => {
                if c.is_whitespace() {
                    State::SkippingWhitespaceBefore
                } else if c == letter {
                    State::EncounteredLetter
                } else {
                    State::Error
                }
            },
            State::EncounteredLetter => {
                if c.is_whitespace() {
                    State::SkippingWhitespaceAfter
                } else {
                    State::Error
                }
            },
            State::SkippingWhitespaceAfter => {
                if c.is_whitespace() {
                    State::SkippingWhitespaceAfter
                } else {
                    State::Done
                }
            },
            State::Done | State::Error => current,
        }
    }

    move |c: char| {
        *current_state = next_state(*current_state, c, letter);
        *current_state == State::Done || *current_state == State::Error
    }
}

/// Tries to parse a timestamp in typical australian forms.
///
/// Each known format is tried in turn (day/month/two-digit-year with either
/// a `%H:%M` or a `%I:%M %P` time) and the first successful parse wins.
/// Returns `None` if `src` matches none of them.
fn parse_australian_timestamp(src: &str) -> Option<NaiveDateTime> {
    const FORMS: [&str; 2] = ["%d/%m/%y, %H:%M", "%d/%m/%y, %I:%M %P"];

    FORMS
        .iter()
        .find_map(|form| NaiveDateTime::parse_from_str(src, form).ok())
}

/// Can `c` appear inside a `NAME_OR_PATH` (a sender or attachment name)?
///
/// Letters, digits and whitespace are accepted, along with the punctuation
/// characters `-`, `_`, `.` and `+`.
fn is_valid_name_or_path_character(c: char) -> bool {
    const PUNCTUATION: [char; 4] = ['-', '_', '.', '+'];

    c.is_whitespace() || c.is_alphanumeric() || PUNCTUATION.contains(&c)
}

/// An error that can occur while parsing.
///
/// Records which grammar production the parser was working on and where in
/// the input it gave up. Implements [`std::error::Error`], so it can be
/// boxed or propagated with `?` like any other error.
#[derive(Debug, Clone, PartialEq)]
pub struct ParseError {
    /// Name of the production being parsed, possibly namespaced with a
    /// `parent.child` prefix.
    production_name: Cow<'static, str>,
    /// Byte offset into the input at which the error was raised.
    location: usize,
}

impl ParseError {
    /// What the parser was trying to parse at the time.
    pub fn production_name(&self) -> &str { &self.production_name }

    /// The byte offset this error was encountered at.
    pub fn index(&self) -> usize { self.location }

    /// Creates a new error for `production_name` at byte offset `location`.
    ///
    /// Accepts both string literals (no allocation) and owned `String`s.
    fn new<S: Into<Cow<'static, str>>>(
        production_name: S,
        location: usize,
    ) -> Self {
        ParseError {
            production_name: production_name.into(),
            location,
        }
    }

    /// Returns a copy of this error with `new_name` prepended to the
    /// production name as `new_name.production_name`, keeping the location.
    fn namespaced<S: AsRef<str>>(&self, new_name: S) -> Self {
        ParseError::new(
            format!("{}.{}", new_name.as_ref(), self.production_name),
            self.location,
        )
    }
}

impl Display for ParseError {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "expected {} at index {}",
            self.production_name, self.location
        )
    }
}

// A parse error has no underlying cause, so the default `source()` is fine.
impl std::error::Error for ParseError {}

// Unit tests for the cursor primitives and the individual grammar rules.
// Expected spans are byte offsets into the literal inputs.
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::NaiveDate;

    /// Shorthand for building a [`Body::DirectMessage`].
    fn direct_message<S: Into<String>>(content: S, span: Span) -> Body {
        Body::DirectMessage(DirectMessage {
            content: content.into(),
            span,
        })
    }

    /// Shorthand for building a [`Body::Attachment`].
    fn attachment<S: Into<String>>(name: S, span: Span) -> Body {
        Body::Attachment(Attachment {
            name: name.into(),
            span,
        })
    }

    /// Both the 24-hour and the 12-hour (am/pm) forms should be understood.
    #[test]
    fn parse_several_common_timestamp_formats() {
        let inputs = vec![
            (
                "31/10/19, 16:26",
                NaiveDate::from_ymd(2019, 10, 31).and_hms(16, 26, 0),
            ),
            (
                "31/10/19, 16:16",
                NaiveDate::from_ymd(2019, 10, 31).and_hms(16, 16, 0),
            ),
            (
                "22/2/20, 3:58 pm",
                NaiveDate::from_ymd(2020, 2, 22).and_hms(15, 58, 0),
            ),
            (
                "22/2/20, 3:37 pm",
                NaiveDate::from_ymd(2020, 2, 22).and_hms(15, 37, 0),
            ),
        ];

        for (src, should_be) in inputs {
            let got = parse_australian_timestamp(src).unwrap();
            assert_eq!(got, should_be);
        }
    }

    /// The character matched by the predicate is left in the remainder.
    #[test]
    fn cursor_split_at() {
        let src = "Hello World. asdf";
        let cursor = Cursor::new(src);

        let (got, cursor) = cursor.split_at(|c| c == '.').unwrap();

        assert_eq!(got, "Hello World");
        assert_eq!(
            cursor,
            Cursor {
                rest: ". asdf",
                index: got.len(),
            }
        );
    }

    /// One direct message and one attachment, each consuming its whole input.
    #[test]
    fn known_messages() {
        let inputs = vec![
        (
            "31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is",
            Message {
                meta: Metadata {
                timestamp: NaiveDate::from_ymd(2019, 10, 31).and_hms(16, 16, 0),
                sender: String::from("Michael-F-Bryan"),
                span: Span::new(0, 33),
                },
                body: direct_message("I figured out what the problem is", Span::new(35, 68)),
                span: Span::new(0, 68),
            },
        ),
        (
            "31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)",
            Message {
                meta: Metadata {
                timestamp: NaiveDate::from_ymd(2019, 10, 31).and_hms(14, 13, 0),
                sender: String::from("Michael-F-Bryan"),
                span: Span::new(0, 33),
                },
                body: attachment("IMG-20191031-WA0005.jpg", Span::new(35, 58)),
                span: Span::new(0, 74),
            }
        ),
         ];

        for (src, should_be) in inputs {
            let cursor = Cursor::new(src);

            let (got, cursor) = parse_message(cursor).unwrap();

            assert_eq!(got, should_be);
            assert_eq!(
                cursor,
                Cursor {
                    rest: "",
                    index: src.len(),
                }
            );
        }
    }

    /// Lines that don't start a new message belong to the previous one.
    #[test]
    fn multiline_direct_message() {
        let src = "31/10/19, 14:13 - Michael-F-Bryan: this is a\nreally\nlong\nmessage";
        let body_should_be = direct_message(
            "this is a\nreally\nlong\nmessage",
            Span::new(35, src.len()),
        );

        let got = parse(src);

        assert!(got.errors.is_empty());
        assert_eq!(got.messages.len(), 1);
        assert_eq!(got.messages[0].body, body_should_be);
    }

    /// Garbage lines produce errors without stopping the valid messages
    /// from being parsed.
    #[test]
    fn skip_over_unparseable_lines() {
        let src = r#"
31/10/19, 16:16 - Michael-F-Bryan: I figured out what the problem is
31/10/19, 14:13 - Michael-F-Bryan: IMG-20191031-WA0005.jpg (file attached)
this is some garbage content!

$and more garbage (note: the previous line was skipped because it was empty, not message or garbage)
"#;

        let got = parse(src);

        println!("{:#?}", got);
        assert_eq!(got.messages.len(), 2);
        assert_eq!(got.errors.len(), 2);
    }

    /// Blank lines are swallowed when moving to the next line.
    #[test]
    fn skip_cursor_to_next_newline() {
        let src = "some text\n\nasdf";
        let cursor = Cursor::new(src);

        let got = cursor.skip_to_next_line();

        assert_eq!(
            got,
            Cursor {
                rest: "asdf",
                index: 11,
            }
        );
    }

    /// Without a newline, skipping to the next line reaches end of input.
    #[test]
    fn skip_to_next_line_with_no_more_newlines() {
        let src = "some text";
        let cursor = Cursor::new(src);

        let got = cursor.skip_to_next_line();

        assert_eq!(
            got,
            Cursor {
                rest: "",
                index: src.len()
            }
        );
    }

    /// A cursor already sitting on a newline only skips the newline.
    #[test]
    fn skip_to_next_line_with_leading_newlines() {
        let src = "\nsome text";
        let cursor = Cursor::new(src);

        let got = cursor.skip_to_next_line();

        assert_eq!(
            got,
            Cursor {
                rest: "some text",
                index: 1,
            }
        );
    }

    /// The last line of the input doesn't need a trailing newline.
    #[test]
    fn rest_of_line_at_eof() {
        let src = "some text";
        let cursor = Cursor::new(src);

        let (line, got) = cursor.rest_of_line();

        assert_eq!(line, src);
        assert_eq!(got, cursor.eof());
    }

    /// Names may contain spaces, dashes and a leading `+` (phone numbers).
    #[test]
    fn some_known_senders() {
        let inputs = vec![
            "Michael",
            "Michael-F-Bryan",
            "Michael Bryan",
            "+60 12-345 6789",
        ];

        for src in inputs {
            let cursor = Cursor::new(src);
            let (got_sender, got_cursor) = parse_sender(cursor).unwrap();

            assert_eq!(got_sender, src);
            assert_eq!(
                got_cursor,
                Cursor {
                    rest: "",
                    index: src.len(),
                }
            );
        }
    }

    /// If the predicate never matches, the entire input is split off.
    #[test]
    fn split_at_when_all_characters_succeed() {
        let src = "Michael";
        let cursor = Cursor::new(src);

        let (got_text, got_cursor) = cursor
            .split_at(|c| !is_valid_name_or_path_character(c))
            .unwrap();

        assert_eq!(got_text, src);
        assert_eq!(got_cursor, cursor.eof());
    }
}