// w-pgn 0.1.2
//
// A PGN parser written in Rust.
use std::{iter::Peekable, str::Chars};

use crate::{token::Token, Nag};

/// States of the tokenizer's state machine.
///
/// Each variant names the kind of token whose characters are currently being
/// collected; `Default` means no token is in progress.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Between tokens: the next character decides what comes next.
    Default,
    /// Collecting a symbol (tag name, move number, move text, result).
    Symbol,
    /// Inside a double-quoted string.
    String,
    /// Collecting a `$`-prefixed numeric annotation glyph.
    Nag,
    /// Inside a `{ ... }` comment.
    CommentCurlyBracket,
    /// Inside a `;` comment that runs to the end of the line.
    CommentSemicolon,
}

/// Character-driven state machine that splits PGN text into [`Token`]s.
///
/// The whole input is tokenized when the value is constructed; the result is
/// stored in `tokens`.
pub struct Tokenizer<'a> {
    /// Characters of the source text that have not been read yet.
    input: Peekable<Chars<'a>>,

    /// State the machine is currently in.
    current_state: State,

    /// Text collected so far for the token in progress; `None` when nothing
    /// has been buffered.
    temporary_buffer: Option<String>,

    /// Most recent character read from `input`; handed out again when
    /// `reconsume` is set.
    current_input_character: Option<char>,

    /// When `true`, the next read returns `current_input_character` again
    /// instead of advancing `input`.
    reconsume: bool,

    /// Tokens produced from the input, in source order.
    pub tokens: Vec<Token>,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `input` and immediately tokenizes all of it.
    ///
    /// The resulting tokens are available in the `tokens` field (or by
    /// iterating the tokenizer) as soon as this returns.
    pub fn new(input: &'a str) -> Self {
        let mut tokenizer = Tokenizer {
            input: input.chars().peekable(),
            current_state: State::Default,
            temporary_buffer: None,
            current_input_character: None,
            reconsume: false,
            tokens: Vec::new(),
        };

        tokenizer.tokenize();

        tokenizer
    }

    /// Runs the state machine over the whole input, appending to `tokens`.
    ///
    /// Characters that do not start or continue any token (whitespace, `.`
    /// between tokens, `!`, `?`, ...) are skipped.
    fn tokenize(&mut self) {
        while let Some(c) = self.consume() {
            match self.current_state {
                State::Default => match c {
                    '[' => self.tokens.push(Token::LeftBracket),
                    ']' => self.tokens.push(Token::RightBracket),
                    '(' => self.tokens.push(Token::LeftParenthesis),
                    ')' => self.tokens.push(Token::RightParenthesis),
                    // `*` is a complete termination marker on its own; it is
                    // not a symbol character, so it must be handled here.
                    '*' => self.tokens.push(Token::Termination(c.to_string())),
                    ';' => self.switch_to(State::CommentSemicolon),
                    '{' => self.switch_to(State::CommentCurlyBracket),
                    '"' => self.switch_to(State::String),
                    '$' => {
                        // The leading `$` is kept in the text handed to `Nag::from`.
                        self.append_to_temporary_buffer(c);
                        self.switch_to(State::Nag);
                    }
                    'a'..='z' | 'A'..='Z' | '0'..='9' => {
                        self.append_to_temporary_buffer(c);
                        self.switch_to(State::Symbol);
                    }
                    _ => {}
                },
                State::Symbol => match c {
                    'a'..='z' | 'A'..='Z' | '0'..='9' | '_' | '+' | '#' | '=' | ':' | '-' | '/' => {
                        self.append_to_temporary_buffer(c);
                    }
                    '.' => {
                        // A period ends the symbol (e.g. a move number) and is
                        // itself discarded.
                        let symbol = self.get_temporary_buffer();
                        self.tokens.push(Token::Symbol(symbol));
                        self.switch_to(State::Default);
                    }
                    _ => {
                        // Any other character ends the symbol and must be
                        // looked at again from the default state.
                        self.flush_symbol();
                        self.reconsume_in(State::Default);
                    }
                },
                State::String => match c {
                    '"' => self.finish_string(),
                    '\\' => {
                        // `\"` and `\\` are escapes for a literal quote and
                        // backslash; any other backslash is kept verbatim.
                        let escaped = self.input.next_if(|&next| next == '"' || next == '\\');
                        self.append_to_temporary_buffer(escaped.unwrap_or(c));
                    }
                    _ => self.append_to_temporary_buffer(c),
                },
                State::CommentCurlyBracket => match c {
                    '}' => self.finish_comment(),
                    _ => self.append_to_temporary_buffer(c),
                },
                State::CommentSemicolon => match c {
                    '\n' => self.finish_line_comment(),
                    _ => self.append_to_temporary_buffer(c),
                },
                State::Nag => {
                    if c.is_ascii_digit() {
                        self.append_to_temporary_buffer(c);
                    } else {
                        self.finish_nag();
                        self.reconsume_in(State::Default);
                    }
                }
            }
        }

        // End of input: emit whatever token was still being collected, using
        // the token kind that matches the state we stopped in.
        match self.current_state {
            State::Default | State::Symbol => self.flush_symbol(),
            State::String => self.finish_string(),
            State::CommentCurlyBracket => self.finish_comment(),
            State::CommentSemicolon => self.finish_line_comment(),
            State::Nag => self.finish_nag(),
        }
    }

    /// Emits the buffered text as a `Termination` (game result) or `Symbol`
    /// token; does nothing when the buffer is empty.
    fn flush_symbol(&mut self) {
        let text = self.get_temporary_buffer();
        if text.is_empty() {
            return;
        }

        let is_termination = matches!(text.as_str(), "0-1" | "1-0" | "1/2-1/2");
        let token = if is_termination {
            Token::Termination(text)
        } else {
            Token::Symbol(text)
        };
        self.tokens.push(token);
    }

    /// Emits the buffered text as a `String` token and returns to the default state.
    fn finish_string(&mut self) {
        let string = self.get_temporary_buffer();
        self.tokens.push(Token::String(string));
        self.switch_to(State::Default);
    }

    /// Emits the buffered text as a `Comment` token and returns to the default state.
    fn finish_comment(&mut self) {
        let comment = self.get_temporary_buffer();
        self.tokens.push(Token::Comment(comment));
        self.switch_to(State::Default);
    }

    /// Like `finish_comment`, but first drops a trailing `\r` so that CRLF
    /// line endings do not leak into rest-of-line comments.
    fn finish_line_comment(&mut self) {
        if let Some(buffer) = &mut self.temporary_buffer {
            if buffer.ends_with('\r') {
                buffer.pop();
            }
        }
        self.finish_comment();
    }

    /// Emits the buffered text (including its leading `$`) as a `Nag` token.
    ///
    /// The caller decides which state to continue in.
    fn finish_nag(&mut self) {
        let nag = self.get_temporary_buffer();
        self.tokens.push(Token::Nag(Nag::from(nag.as_str())));
    }

    /// Appends `c` to the buffer of the token in progress, creating the
    /// buffer if needed.
    fn append_to_temporary_buffer(&mut self, c: char) {
        self.temporary_buffer.get_or_insert_with(String::new).push(c);
    }

    /// Takes the buffered text, leaving the buffer empty; returns an empty
    /// string when nothing was buffered.
    fn get_temporary_buffer(&mut self) -> String {
        self.temporary_buffer.take().unwrap_or_default()
    }

    /// Moves the state machine to `state`.
    fn switch_to(&mut self, state: State) {
        self.current_state = state;
    }

    /// Returns the next character to process, or `None` at end of input.
    ///
    /// If a reconsume was requested, the previously read character is
    /// returned again and the input is not advanced.
    fn consume(&mut self) -> Option<char> {
        if self.reconsume {
            self.reconsume = false;
            return self.current_input_character;
        }

        self.current_input_character = self.input.next();

        self.current_input_character
    }

    /// Requests that the current character be processed again on the next
    /// call to `consume`.
    fn reconsume(&mut self) {
        self.reconsume = true;
    }

    /// Switches to `state` and re-processes the current character there.
    fn reconsume_in(&mut self, state: State) {
        self.reconsume();
        self.switch_to(state);
    }
}

impl IntoIterator for Tokenizer<'_> {
    type Item = Token;
    type IntoIter = std::vec::IntoIter<Token>;

    fn into_iter(self) -> Self::IntoIter {
        self.tokens.into_iter()
    }
}

#[cfg(test)]
mod tests {

    use super::*;

    /// A tag pair is split into bracket, symbol, string and bracket tokens.
    #[test]
    fn test_tokenize_data() {
        let input = "[Event \"FIDE World Championship\"]";
        let tokenizer = Tokenizer::new(input);

        assert_eq!(
            tokenizer.tokens,
            vec![
                Token::LeftBracket,
                Token::Symbol("Event".to_string()),
                Token::String("FIDE World Championship".to_string()),
                Token::RightBracket,
            ]
        );
    }

    /// Smoke test: tokenizes `game.pgn` from the working directory and prints
    /// the tokens. Fails if the file cannot be read.
    #[test]
    fn test_can_read_pgn() {
        let buffer = std::fs::read_to_string("game.pgn")
            .expect("game.pgn must be readable from the working directory");

        let tokenizer = Tokenizer::new(&buffer);

        println!("{:?}", tokenizer.tokens);
    }

    /// A complete game given as a string literal: checks the first tag pair
    /// and that the game result is the final token.
    #[test]
    fn test_can_read_string() {
        let input = r#"[Event "ch-UZB 1st League 2014"]
        [Site "Tashkent UZB"]
        [Date "2014.03.01"]
        [Round "1.5"]
        [White "Abdusattorov,Nodirbek"]
        [Black "Alikulov,Elbek"]
        [Result "1-0"]
        [WhiteElo "2024"]
        [BlackElo "2212"]
        [ECO "B28"]
        
        1.e4 c5 2.Nf3 a6 3.d3 g6 4.g3 Bg7 5.Bg2 b5 6.O-O Bb7 7.c3 e5 8.a3 Ne7 9.b4 d6
        10.Nbd2 O-O 11.Nb3 Nd7 12.Be3 Rc8 13.Rc1 h6 14.Nfd2 f5 15.f4 Kh7 16.Qe2 cxb4
        17.axb4 exf4 18.Bxf4 Rxc3 19.Rxc3 Bxc3 20.Bxd6 Qb6+ 21.Bc5 Nxc5 22.bxc5 Qe6
        23.d4 Rd8 24.Qd3 Bxd2 25.Nxd2 fxe4 26.Nxe4 Nf5 27.d5 Qe5 28.g4 Ne7 29.Rf7+ Kg8
        30.Qf1 Nxd5 31.Rxb7 Qd4+ 32.Kh1 Rf8 33.Qg1 Ne3 34.Re7 a5 35.c6 a4 36.Qxe3 Qxe3
        37.Nf6+ Rxf6 38.Rxe3 Rd6 39.h4 Rd1+ 40.Kh2 b4 41.c7  1-0
        "#;

        let tokenizer = Tokenizer::new(input);

        println!("{:?}", tokenizer.tokens);

        assert_eq!(
            tokenizer.tokens[..4],
            [
                Token::LeftBracket,
                Token::Symbol("Event".to_string()),
                Token::String("ch-UZB 1st League 2014".to_string()),
                Token::RightBracket,
            ]
        );
        assert_eq!(
            tokenizer.tokens.last(),
            Some(&Token::Termination("1-0".to_string()))
        );
    }
}