1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
use std::str::Chars;

// making use of FamilySearch's GEDCOM Standard Release 5.5.1
// https://www.familysearch.org/wiki/en/GEDCOM
// gedcom_line: level + delim + [optional_xref_ID] + tag + [optional_line_value] + terminator
/// A lexical token produced by [`Tokenizer`] for one piece of a GEDCOM line.
#[derive(Debug, PartialEq)]
pub enum Token {
    /// Level number read at the start of a line.
    Level(u8),
    /// Word that follows the level (or the pointer, when one is present).
    Tag(String),
    /// Text that follows a tag, up to the end of the line.
    LineValue(String),
    /// Word starting with `@` found directly after the level.
    Pointer(String),
    /// The input has been exhausted.
    EOF,
    /// Initial state of a tokenizer, before any token has been read.
    None,
}

/// Character-level tokenizer that walks GEDCOM text one token at a time.
///
/// Each call to `next_token` replaces `current_token` with the next token.
pub struct Tokenizer<'a> {
    /// The token most recently produced by `next_token` (`Token::None` before the first call).
    pub current_token: Token,
    /// One-character lookahead; `'\0'` is used as the "input exhausted" marker.
    current_char: char,
    /// Remaining, not yet consumed, characters of the input.
    chars: Chars<'a>,
    /// Starts at 0 and is incremented every time a level token is read.
    pub line: u32,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `chars`, positioned before the first line.
    ///
    /// The lookahead starts out as a synthetic `'\n'` so that the first call to
    /// [`Tokenizer::next_token`] treats the input as the beginning of a line and reads a level.
    #[must_use]
    pub fn new(chars: Chars<'a>) -> Tokenizer<'a> {
        Tokenizer {
            current_char: '\n',
            current_token: Token::None,
            chars,
            line: 0,
        }
    }

    /// Returns `true` once the tokenizer has produced [`Token::EOF`].
    #[must_use]
    pub fn done(&self) -> bool {
        self.current_token == Token::EOF
    }

    /// Advances to the next token and stores it in `current_token`.
    ///
    /// The kind of token read depends on the previous one: a level is read after every line
    /// terminator, a pointer or tag after a level, a tag after a pointer and a line value after
    /// a tag. When the input is exhausted (including a trailing line terminator, trailing
    /// whitespace or completely empty input) the token becomes [`Token::EOF`].
    ///
    /// Because `'\0'` is the internal end-of-input marker, a literal NUL character in the input
    /// is treated as the end of the input.
    ///
    /// # Panics
    ///
    /// Panics when a line does not start with a level number that fits in a `u8`, or when the
    /// previous token does not allow any token to follow at the current position.
    pub fn next_token(&mut self) {
        if self.current_char == '\0' {
            self.current_token = Token::EOF;
            return;
        }

        // Lines may be terminated by "\r\n"; drop the carriage return so that the newline
        // check below sees the line feed.
        if self.current_char == '\r' {
            self.next_char();
        }

        // The level number is at the start of each line.
        if self.current_char == '\n' {
            self.next_char();
            // A terminator that ends the input does not start a new line.
            if self.current_char == '\0' {
                self.current_token = Token::EOF;
                return;
            }
            // Count the line first so that a malformed level is reported with its own number.
            self.line += 1;
            self.current_token = Token::Level(self.extract_number());
            return;
        }

        self.skip_whitespace();

        match self.current_char {
            // Trailing whitespace before the line terminator: move on to the next line.
            '\n' => {
                self.next_token();
                return;
            }
            // Trailing whitespace (or a lone carriage return) at the very end of the input.
            '\0' => {
                self.current_token = Token::EOF;
                return;
            }
            _ => {}
        }

        self.current_token = match self.current_token {
            Token::Level(_) => {
                if self.current_char == '@' {
                    Token::Pointer(self.extract_word())
                } else {
                    Token::Tag(self.extract_word())
                }
            }
            Token::Pointer(_) => Token::Tag(self.extract_word()),
            Token::Tag(_) => Token::LineValue(self.extract_value()),
            Token::LineValue(_) | Token::EOF | Token::None => panic!(
                "line {}: Tokenization error! {:?}",
                self.line, self.current_token
            ),
        };
    }

    /// Moves the lookahead to the next character, or to `'\0'` when the input is exhausted.
    fn next_char(&mut self) {
        self.current_char = self.chars.next().unwrap_or('\0');
    }

    /// Consumes a run of ASCII digits and returns it as a level number.
    ///
    /// Panics with the current line number when there are no digits or the value does not fit
    /// in a `u8`.
    fn extract_number(&mut self) -> u8 {
        let mut digits = String::new();
        while self.current_char.is_ascii_digit() {
            digits.push(self.current_char);
            self.next_char();
        }

        digits.parse::<u8>().unwrap_or_else(|err| {
            panic!(
                "line {}: Tokenization error! invalid level {:?}: {}",
                self.line, digits, err
            )
        })
    }

    /// Consumes characters up to the next whitespace character or the end of the input.
    fn extract_word(&mut self) -> String {
        let mut word = String::new();
        while !self.current_char.is_whitespace() && self.current_char != '\0' {
            word.push(self.current_char);
            self.next_char();
        }

        word
    }

    /// Consumes the rest of the current line and returns it without its terminator.
    ///
    /// Reading stops at the line feed or at the end of the input, so a final line without a
    /// terminator is handled. A carriage return directly before the stop position belongs to
    /// a "\r\n" terminator and is not part of the value.
    fn extract_value(&mut self) -> String {
        let mut value = String::new();
        while self.current_char != '\n' && self.current_char != '\0' {
            value.push(self.current_char);
            self.next_char();
        }

        if value.ends_with('\r') {
            value.pop();
        }

        value
    }

    /// Skips whitespace on the current line; never moves past a line feed.
    fn skip_whitespace(&mut self) {
        while self.current_char.is_whitespace() && self.current_char != '\n' {
            self.next_char();
        }
    }
}