use crate::GedcomError;
use std::io::BufRead;
use std::str::Chars;
/// One lexical unit produced by the tokenizers in this file.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// Level number parsed at the start of a line.
    Level(u8),
    /// Word read after a level (or after a pointer) that starts with neither `@` nor `_`.
    Tag(Box<str>),
    /// Remainder of the line that follows a tag or custom tag.
    LineValue(Box<str>),
    /// Word starting with `@` read directly after a level.
    Pointer(Box<str>),
    /// Word starting with `_` read directly after a level.
    CustomTag(Box<str>),
    /// The input is exhausted.
    EOF,
    /// No token yet: the initial state, and the placeholder left behind by `take_token`.
    None,
}

impl Token {
    /// Returns the text of a [`Token::Tag`]; every other variant yields `None`.
    #[inline]
    #[must_use]
    pub fn as_tag_str(&self) -> Option<&str> {
        if let Token::Tag(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Returns the text of a [`Token::LineValue`]; every other variant yields `None`.
    #[inline]
    #[must_use]
    pub fn as_line_value_str(&self) -> Option<&str> {
        if let Token::LineValue(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Returns the text of a [`Token::Pointer`]; every other variant yields `None`.
    #[inline]
    #[must_use]
    pub fn as_pointer_str(&self) -> Option<&str> {
        if let Token::Pointer(text) = self {
            Some(text)
        } else {
            None
        }
    }

    /// Returns the text of a [`Token::CustomTag`]; every other variant yields `None`.
    #[inline]
    #[must_use]
    pub fn as_custom_tag_str(&self) -> Option<&str> {
        if let Token::CustomTag(text) = self {
            Some(text)
        } else {
            None
        }
    }
}
/// Interface shared by the tokenizers in this file (`Tokenizer` and `StreamTokenizer`).
pub trait TokenizerTrait {
/// Borrows the token the tokenizer is currently positioned on.
fn current_token(&self) -> &Token;
/// Returns the tokenizer's line counter.
fn line(&self) -> u32;
/// Reports whether the current token is [`Token::EOF`].
fn done(&self) -> bool;
/// Advances to the next token, replacing the current one.
///
/// # Errors
/// Returns a [`GedcomError`] when the input cannot be tokenized.
fn next_token(&mut self) -> Result<(), GedcomError>;
/// Moves the current token out (leaving [`Token::None`] in its place) and advances.
///
/// # Errors
/// Returns a [`GedcomError`] when advancing fails.
fn take_token(&mut self) -> Result<Token, GedcomError>;
/// Advances once and returns the line value found there as an owned `String`;
/// an empty string is returned when the next token is already a level.
///
/// # Errors
/// Returns a [`GedcomError`] when advancing fails or an unexpected token is found.
fn take_line_value(&mut self) -> Result<String, GedcomError>;
/// Reads a line value and appends the values of following `CONT` / `CONC` tags,
/// stopping at a level token that is not greater than `level`.
///
/// # Errors
/// Returns a [`GedcomError`] when advancing fails or an unexpected token is found.
fn take_continued_text(&mut self, level: u8) -> Result<String, GedcomError>;
/// Returns a short description of the current position for diagnostics.
fn debug(&self) -> String;
}
/// Initial `String` capacity (in bytes) reserved when collecting a tag or custom-tag word.
const TAG_CAPACITY: usize = 8;
/// Initial `String` capacity (in bytes) reserved when collecting a line value.
const VALUE_CAPACITY: usize = 64;
/// Initial `String` capacity (in bytes) reserved when collecting a pointer word.
const POINTER_CAPACITY: usize = 16;
/// Tokenizer that walks an in-memory character iterator.
pub struct Tokenizer<'a> {
/// Token most recently produced; starts out as `Token::None`.
pub current_token: Token,
/// Character under the cursor; `'\0'` is used as the end-of-input sentinel.
current_char: char,
/// Remaining, not yet consumed input characters.
chars: Chars<'a>,
/// Number of level tokens produced so far (incremented once per parsed level).
pub line: u32,
}
impl<'a> Tokenizer<'a> {
#[must_use]
pub fn new(chars: Chars<'a>) -> Tokenizer<'a> {
Tokenizer {
current_char: '\n',
current_token: Token::None,
chars,
line: 0,
}
}
/// Reports whether the tokenizer has reached the end of its input,
/// i.e. whether the current token is `Token::EOF`.
#[inline]
#[must_use]
pub fn done(&self) -> bool {
    self.current_token == Token::EOF
}
/// Advances `current_token` to the next token of the input.
///
/// Which kind of token is produced depends on the previous one:
/// a level is read at the start of a line, a pointer / custom tag / tag after a level,
/// a tag after a pointer, and a line value after a tag or custom tag.
///
/// # Errors
/// Returns `GedcomError::ParseError` when a level number is expected but missing or
/// too large, or when the previous token cannot be followed by another token on the
/// same line.
pub fn next_token(&mut self) -> Result<(), GedcomError> {
// '\0' is the end-of-input sentinel installed by `next_char`.
if self.current_char == '\0' {
self.current_token = Token::EOF;
return Ok(());
}
// Start-of-line path: taken when no token is held (initial state, or the slot was
// emptied by `take_token`) or when the cursor sits on a line feed.
if matches!(self.current_token, Token::None) || self.current_char == '\n' {
// Skip U+FEFF (65279) characters while no token is held.
while matches!(self.current_token, Token::None)
&& (self.current_char as u32) == 65279_u32
{
self.next_char();
}
// Consume a CR or CRLF under the cursor.
if self.current_char == '\r' {
self.next_char();
if self.current_char == '\n' {
self.next_char();
}
}
// Consume a single LF; input that ends right after it is reported as EOF.
if self.current_char == '\n' {
self.next_char();
if self.current_char == '\0' {
self.current_token = Token::EOF;
return Ok(());
}
}
// NOTE(review): `line` is incremented only after the level parsed successfully,
// so a ParseError raised by `extract_number` carries the previous line's number.
self.current_token = Token::Level(self.extract_number()?);
self.line += 1;
return Ok(());
}
self.skip_whitespace();
// Only whitespace was left on the line: continue with the start-of-line path.
if self.current_char == '\n' {
self.next_token()?;
return Ok(());
}
// NOTE(review): `skip_whitespace` above already consumes '\r' (it is whitespace and
// not '\n'), so this branch looks unreachable — confirm before relying on it.
if self.current_char == '\r' {
self.next_char();
if self.current_char == '\n' {
self.next_char();
}
if self.current_char == '\0' {
self.current_token = Token::EOF;
return Ok(());
}
self.next_token()?;
return Ok(());
}
// Pick the next token kind from the previous token.
self.current_token = match self.current_token {
Token::Level(_) => {
// The first character of the word after the level decides its kind.
if self.current_char == '@' {
Token::Pointer(self.extract_word_with_capacity(POINTER_CAPACITY))
} else if self.current_char == '_' {
Token::CustomTag(self.extract_word_with_capacity(TAG_CAPACITY))
} else {
Token::Tag(self.extract_word_with_capacity(TAG_CAPACITY))
}
}
// The word following a pointer is always read as a plain tag.
Token::Pointer(_) => Token::Tag(self.extract_word_with_capacity(TAG_CAPACITY)),
Token::Tag(_) | Token::CustomTag(_) => {
// Nothing left before the line end / end of input: produce an empty value.
if self.current_char == '\n'
|| self.current_char == '\r'
|| self.current_char == '\0'
{
Token::LineValue("".into())
} else {
Token::LineValue(self.extract_value_with_capacity(VALUE_CAPACITY))
}
}
// Any other previous token (for example a `LineValue`) cannot be followed by
// more content on the same line.
_ => {
return Err(GedcomError::ParseError {
line: self.line,
message: format!("Tokenization error! {:?}", self.current_token),
})
}
};
Ok(())
}
/// Moves the current token out of the tokenizer and advances to the next one.
///
/// The token slot holds `Token::None` while `next_token` runs, so `next_token`
/// takes its start-of-line path and scans whatever follows as a level number.
///
/// # Errors
/// Propagates any error returned by `next_token`.
pub fn take_token(&mut self) -> Result<Token, GedcomError> {
    let taken = std::mem::replace(&mut self.current_token, Token::None);
    self.next_token().map(|()| taken)
}
/// Moves the cursor to the next input character; `'\0'` is stored once the
/// iterator is exhausted.
#[inline]
fn next_char(&mut self) {
    self.current_char = match self.chars.next() {
        Some(next) => next,
        None => '\0',
    };
}
/// Skips leading non-newline whitespace and parses a run of ASCII digits as a level.
///
/// # Errors
/// Returns `GedcomError::ParseError` when the cursor is not on an ASCII digit, or
/// when the parsed number does not fit into a `u8`.
#[inline]
fn extract_number(&mut self) -> Result<u8, GedcomError> {
    self.skip_whitespace();

    // Accumulate in a saturating `u32` so arbitrarily long digit runs cannot overflow.
    let mut level: u32 = 0;
    let mut saw_digit = false;
    while let Some(digit) = self.current_char.to_digit(10) {
        saw_digit = true;
        level = level.saturating_mul(10).saturating_add(digit);
        self.next_char();
    }

    if !saw_digit {
        return Err(GedcomError::ParseError {
            line: self.line,
            message: "Expected digit for level number".to_string(),
        });
    }

    match u8::try_from(level) {
        Ok(narrowed) => Ok(narrowed),
        Err(_) => Err(GedcomError::ParseError {
            line: self.line,
            message: format!("Level number too large: {level}"),
        }),
    }
}
/// Collects characters up to (not including) the next whitespace character or the
/// end of input, starting with a buffer of `capacity` bytes.
#[inline]
fn extract_word_with_capacity(&mut self, capacity: usize) -> Box<str> {
    let mut buf = String::with_capacity(capacity);
    loop {
        let c = self.current_char;
        if c == '\0' || c.is_whitespace() {
            break;
        }
        buf.push(c);
        self.next_char();
    }
    buf.into_boxed_str()
}
/// Collects characters up to (not including) the next `'\n'`, `'\r'` or the end of
/// input, starting with a buffer of `capacity` bytes.
#[inline]
fn extract_value_with_capacity(&mut self, capacity: usize) -> Box<str> {
    let mut buf = String::with_capacity(capacity);
    loop {
        let c = self.current_char;
        if matches!(c, '\n' | '\r' | '\0') {
            break;
        }
        buf.push(c);
        self.next_char();
    }
    buf.into_boxed_str()
}
/// Advances the cursor past whitespace, stopping at a line feed or any other
/// non-whitespace character.
#[inline]
fn skip_whitespace(&mut self) {
    loop {
        if !self.is_nonnewline_whitespace() {
            break;
        }
        self.next_char();
    }
}
/// Reports whether the character under the cursor is whitespace other than `'\n'`.
/// U+FEFF is treated as whitespace as well.
#[inline]
fn is_nonnewline_whitespace(&self) -> bool {
    match self.current_char {
        '\n' => false,
        '\u{FEFF}' => true,
        other => other.is_whitespace(),
    }
}
/// Returns a short `line N:` prefix built from the current line counter.
#[must_use]
pub fn debug(&self) -> String {
    let current = self.line;
    format!("line {}:", current)
}
/// Advances once and returns the line value found there.
///
/// If the token after advancing is a `LineValue`, its text is returned and the
/// tokenizer advances past it. If it is a `Level` (the line had no value), an empty
/// string is returned and the tokenizer stays on that level.
///
/// # Errors
/// Propagates errors from `next_token`, and returns `GedcomError::ParseError` when
/// the token after advancing is neither a `LineValue` nor a `Level`.
pub fn take_line_value(&mut self) -> Result<String, GedcomError> {
    self.next_token()?;

    if let Token::LineValue(text) = &self.current_token {
        let owned = String::from(&**text);
        self.next_token()?;
        return Ok(owned);
    }

    if matches!(self.current_token, Token::Level(_)) {
        return Ok(String::new());
    }

    Err(GedcomError::ParseError {
        line: self.line,
        message: format!("Expected LineValue, found {:?}", self.current_token),
    })
}
/// Reads a line value and joins the values of the `CONT` / `CONC` lines that follow.
///
/// `CONT` values are appended after a `'\n'`, `CONC` values are appended directly.
/// Reading stops at a level token not greater than `level`, at any other tag, or at
/// the end of input.
///
/// # Errors
/// Propagates errors from `take_line_value` / `next_token`, and returns
/// `GedcomError::ParseError` when a token other than a level, tag or EOF is found.
pub fn take_continued_text(&mut self, level: u8) -> Result<String, GedcomError> {
    let head = self.take_line_value()?;
    let mut text = String::with_capacity(head.len() + 16);
    text.push_str(&head);

    loop {
        // Decide what the current token means before touching the tokenizer again.
        let newline_first = match &self.current_token {
            Token::Level(depth) if *depth <= level => break,
            Token::Level(_) => {
                self.next_token()?;
                continue;
            }
            Token::Tag(tag) => match &**tag {
                "CONT" => true,
                "CONC" => false,
                _ => break,
            },
            Token::EOF => break,
            _ => {
                return Err(GedcomError::ParseError {
                    line: self.line,
                    message: format!("Unhandled Continuation Token: {:?}", self.current_token),
                })
            }
        };

        if newline_first {
            text.push('\n');
        }
        let piece = self.take_line_value()?;
        text.push_str(&piece);
    }

    Ok(text)
}
}
/// `TokenizerTrait` implementation for the in-memory tokenizer; every method forwards
/// to the inherent method or field of the same name.
impl TokenizerTrait for Tokenizer<'_> {
/// Borrows the `current_token` field.
#[inline]
fn current_token(&self) -> &Token {
&self.current_token
}
/// Returns the `line` field.
#[inline]
fn line(&self) -> u32 {
self.line
}
/// Forwards to the inherent `Tokenizer::done`; inherent methods take precedence over
/// trait methods in method-call syntax, so this does not recurse.
#[inline]
fn done(&self) -> bool {
self.done()
}
/// Forwards to the inherent `Tokenizer::next_token`.
#[inline]
fn next_token(&mut self) -> Result<(), GedcomError> {
Tokenizer::next_token(self)
}
/// Forwards to the inherent `Tokenizer::take_token`.
#[inline]
fn take_token(&mut self) -> Result<Token, GedcomError> {
Tokenizer::take_token(self)
}
/// Forwards to the inherent `Tokenizer::take_line_value`.
#[inline]
fn take_line_value(&mut self) -> Result<String, GedcomError> {
Tokenizer::take_line_value(self)
}
/// Forwards to the inherent `Tokenizer::take_continued_text`.
#[inline]
fn take_continued_text(&mut self, level: u8) -> Result<String, GedcomError> {
Tokenizer::take_continued_text(self, level)
}
/// Forwards to the inherent `Tokenizer::debug`.
#[inline]
fn debug(&self) -> String {
Tokenizer::debug(self)
}
}
/// Initial capacity (in bytes) of the reusable line buffer held by `StreamTokenizer`.
const LINE_BUFFER_CAPACITY: usize = 256;
/// Tokenizer that pulls its input one line at a time from a `BufRead` source.
pub struct StreamTokenizer<R: BufRead> {
/// Source the lines are read from.
reader: R,
/// Text of the line currently being tokenized; cleared and refilled for every line.
line_buffer: String,
/// Byte offset of the cursor within `line_buffer`.
line_pos: usize,
/// Character under the cursor; `'\0'` once the reader is exhausted.
current_char: char,
/// Token most recently produced; starts out as `Token::None`.
current_token: Token,
/// Number of level tokens produced so far.
line: u32,
/// Set once the reader reports that no more bytes are available.
eof: bool,
/// True until the first token has been requested.
initial: bool,
}
impl<R: BufRead> StreamTokenizer<R> {
pub fn new(reader: R) -> Result<Self, GedcomError> {
let mut tokenizer = Self {
reader,
line_buffer: String::with_capacity(LINE_BUFFER_CAPACITY),
line_pos: 0,
current_char: '\n', current_token: Token::None,
line: 0,
eof: false,
initial: true,
};
tokenizer.read_next_line()?;
if tokenizer.line_buffer.len() >= 2 {
let bytes = tokenizer.line_buffer.as_bytes();
if (bytes[0] == 0xFF && bytes[1] == 0xFE) || (bytes[0] == 0xFE && bytes[1] == 0xFF) {
return Err(GedcomError::EncodingError(
"Streaming parser requires UTF-8 input; UTF-16 BOM detected".to_string(),
));
}
}
if tokenizer.line_buffer.starts_with('\u{FEFF}') {
tokenizer.line_pos = '\u{FEFF}'.len_utf8();
}
tokenizer.load_current_char();
tokenizer.next_token()?;
Ok(tokenizer)
}
/// Replaces `line_buffer` with the next line from the reader and puts the cursor on
/// its first character.
///
/// The trailing `'\n'` delivered by `read_line` is not kept in the buffer. When no
/// more bytes are available, `eof` is set and the cursor becomes `'\0'`.
///
/// # Errors
/// Returns `GedcomError::IoError` when the underlying read fails.
fn read_next_line(&mut self) -> Result<(), GedcomError> {
    self.line_buffer.clear();
    self.line_pos = 0;
    match self.reader.read_line(&mut self.line_buffer) {
        Ok(0) => {
            self.eof = true;
            self.current_char = '\0';
        }
        Ok(_) => {
            // `load_current_char` already reports a '\n' whenever the cursor reaches
            // the end of the buffer. Keeping the terminator that `read_line` stores
            // would therefore yield two consecutive newlines per line, and the level
            // parser would fail on the second one.
            if self.line_buffer.ends_with('\n') {
                self.line_buffer.pop();
            }
            self.load_current_char();
        }
        Err(e) => {
            return Err(GedcomError::IoError(e.to_string()));
        }
    }
    Ok(())
}
/// Refreshes `current_char` from the buffer position.
///
/// Yields `'\0'` once the reader is exhausted, `'\n'` when the cursor is at or past
/// the end of the buffer, and otherwise the character starting at `line_pos`.
#[inline]
fn load_current_char(&mut self) {
    self.current_char = if self.eof {
        '\0'
    } else if self.line_pos < self.line_buffer.len() {
        let rest = &self.line_buffer[self.line_pos..];
        rest.chars().next().unwrap_or('\0')
    } else {
        '\n'
    };
}
/// Moves the cursor forward by one character.
///
/// While inside the buffer, the cursor advances by the UTF-8 width of the current
/// character; once it is at or past the end of the buffer, the next line is read.
/// After the reader is exhausted the cursor stays on `'\0'`.
///
/// # Errors
/// Propagates errors from `read_next_line`.
#[inline]
fn next_char(&mut self) -> Result<(), GedcomError> {
    if self.eof {
        self.current_char = '\0';
        return Ok(());
    }
    if self.line_pos < self.line_buffer.len() {
        self.line_pos += self.current_char.len_utf8();
        self.load_current_char();
        Ok(())
    } else {
        self.read_next_line()
    }
}
/// Advances the cursor past whitespace, stopping at a line feed or any other
/// non-whitespace character.
///
/// # Errors
/// Propagates errors from `next_char`.
#[inline]
fn skip_whitespace(&mut self) -> Result<(), GedcomError> {
    loop {
        if !self.is_nonnewline_whitespace() {
            return Ok(());
        }
        self.next_char()?;
    }
}
/// Reports whether the character under the cursor is whitespace other than `'\n'`.
/// U+FEFF is treated as whitespace as well.
#[inline]
fn is_nonnewline_whitespace(&self) -> bool {
    let c = self.current_char;
    if c == '\n' {
        return false;
    }
    c == '\u{FEFF}' || c.is_whitespace()
}
/// Skips leading non-newline whitespace and parses a run of ASCII digits as a level.
///
/// # Errors
/// Returns `GedcomError::ParseError` when the cursor is not on an ASCII digit or the
/// number does not fit into a `u8`; propagates errors from advancing the cursor.
fn extract_number(&mut self) -> Result<u8, GedcomError> {
    self.skip_whitespace()?;

    if self.current_char.is_ascii_digit() {
        // Saturating `u32` arithmetic keeps very long digit runs from overflowing.
        let mut level = 0_u32;
        loop {
            let digit = u32::from(self.current_char as u8 - b'0');
            level = level.saturating_mul(10).saturating_add(digit);
            self.next_char()?;
            if !self.current_char.is_ascii_digit() {
                break;
            }
        }
        u8::try_from(level).map_err(|_| GedcomError::ParseError {
            line: self.line,
            message: format!("Level number too large: {level}"),
        })
    } else {
        Err(GedcomError::ParseError {
            line: self.line,
            message: "Expected digit for level number".to_string(),
        })
    }
}
/// Collects characters up to (not including) the next whitespace character or the
/// end of input, starting with a buffer of `capacity` bytes.
///
/// # Errors
/// Propagates errors from `next_char`.
fn extract_word_with_capacity(&mut self, capacity: usize) -> Result<Box<str>, GedcomError> {
    let mut buf = String::with_capacity(capacity);
    loop {
        let c = self.current_char;
        if c == '\0' || c.is_whitespace() {
            return Ok(buf.into_boxed_str());
        }
        buf.push(c);
        self.next_char()?;
    }
}
/// Collects characters up to (not including) the next `'\n'`, `'\r'` or the end of
/// input, starting with a buffer of `capacity` bytes.
///
/// # Errors
/// Propagates errors from `next_char`.
fn extract_value_with_capacity(&mut self, capacity: usize) -> Result<Box<str>, GedcomError> {
    let mut buf = String::with_capacity(capacity);
    loop {
        let c = self.current_char;
        if matches!(c, '\n' | '\r' | '\0') {
            return Ok(buf.into_boxed_str());
        }
        buf.push(c);
        self.next_char()?;
    }
}
/// Advances `current_token` to the next token of the stream.
///
/// Which kind of token is produced depends on the previous one:
/// a level is read at the start of a line, a pointer / custom tag / tag after a level,
/// a tag after a pointer, and a line value after a tag or custom tag.
///
/// # Errors
/// Returns `GedcomError::ParseError` when a level number is expected but missing or
/// too large, or when the previous token cannot be followed by another token on the
/// same line; propagates I/O errors raised while advancing the cursor.
fn next_token_impl(&mut self) -> Result<(), GedcomError> {
// Reader exhausted and nothing left under the cursor.
if self.eof && self.current_char == '\0' {
self.current_token = Token::EOF;
return Ok(());
}
// Start-of-line path: taken on the very first call or when the cursor is on '\n'.
if self.initial || self.current_char == '\n' {
self.initial = false;
// Can only match on the first call; otherwise the cursor is known to be '\n' here.
if self.current_char == '\r' {
self.next_char()?;
if self.current_char == '\n' {
self.next_char()?;
}
}
// Consume a single line feed; input that ends right after it is reported as EOF.
if self.current_char == '\n' {
self.next_char()?;
if self.eof || self.current_char == '\0' {
self.current_token = Token::EOF;
return Ok(());
}
}
// Skip any U+FEFF (65279) characters in front of the level.
while self.current_char as u32 == 65279_u32 {
self.next_char()?;
}
if self.eof || self.current_char == '\0' {
self.current_token = Token::EOF;
return Ok(());
}
// NOTE(review): `line` is incremented only after the level parsed successfully,
// so a ParseError raised by `extract_number` carries the previous line's number.
self.current_token = Token::Level(self.extract_number()?);
self.line += 1;
return Ok(());
}
self.skip_whitespace()?;
// Only whitespace was left on the line: continue with the start-of-line path.
if self.current_char == '\n' {
self.next_token_impl()?;
return Ok(());
}
// NOTE(review): `skip_whitespace` above already consumes '\r' (it is whitespace and
// not '\n'), so this branch looks unreachable — confirm before relying on it.
if self.current_char == '\r' {
self.next_char()?;
if self.current_char == '\n' {
self.next_char()?;
}
if self.eof || self.current_char == '\0' {
self.current_token = Token::EOF;
return Ok(());
}
self.next_token_impl()?;
return Ok(());
}
// Pick the next token kind from the previous token.
self.current_token = match self.current_token {
Token::Level(_) => {
// The first character of the word after the level decides its kind.
if self.current_char == '@' {
Token::Pointer(self.extract_word_with_capacity(POINTER_CAPACITY)?)
} else if self.current_char == '_' {
Token::CustomTag(self.extract_word_with_capacity(TAG_CAPACITY)?)
} else {
Token::Tag(self.extract_word_with_capacity(TAG_CAPACITY)?)
}
}
// The word following a pointer is always read as a plain tag.
Token::Pointer(_) => Token::Tag(self.extract_word_with_capacity(TAG_CAPACITY)?),
Token::Tag(_) | Token::CustomTag(_) => {
// Nothing left before the line end / end of input: produce an empty value.
if self.current_char == '\n'
|| self.current_char == '\r'
|| self.current_char == '\0'
{
Token::LineValue("".into())
} else {
Token::LineValue(self.extract_value_with_capacity(VALUE_CAPACITY)?)
}
}
// Any other previous token (a `LineValue`, or the `Token::None` left behind by
// `take_token`) cannot be followed by more content on the same line.
_ => {
return Err(GedcomError::ParseError {
line: self.line,
message: format!("Tokenization error! {:?}", self.current_token),
})
}
};
Ok(())
}
}
/// `TokenizerTrait` implementation for the streaming tokenizer.
impl<R: BufRead> TokenizerTrait for StreamTokenizer<R> {
    /// Borrows the token the tokenizer is currently positioned on.
    #[inline]
    fn current_token(&self) -> &Token {
        &self.current_token
    }

    /// Returns the number of level tokens produced so far.
    #[inline]
    fn line(&self) -> u32 {
        self.line
    }

    /// Reports whether the current token is `Token::EOF`.
    #[inline]
    fn done(&self) -> bool {
        self.current_token == Token::EOF
    }

    /// Advances to the next token; see `next_token_impl`.
    fn next_token(&mut self) -> Result<(), GedcomError> {
        Self::next_token_impl(self)
    }

    /// Moves the current token out (leaving `Token::None` behind) and advances.
    fn take_token(&mut self) -> Result<Token, GedcomError> {
        let taken = std::mem::replace(&mut self.current_token, Token::None);
        self.next_token().map(|()| taken)
    }

    /// Advances once and returns the line value found there.
    ///
    /// A `LineValue` is returned as an owned string and stepped over; a `Level`
    /// (the line had no value) yields an empty string without advancing further.
    /// Any other token is reported as a `GedcomError::ParseError`.
    fn take_line_value(&mut self) -> Result<String, GedcomError> {
        self.next_token()?;

        let value = match &self.current_token {
            Token::LineValue(text) => String::from(&**text),
            Token::Level(_) => return Ok(String::new()),
            _ => {
                return Err(GedcomError::ParseError {
                    line: self.line,
                    message: format!("Expected LineValue, found {:?}", self.current_token),
                })
            }
        };

        self.next_token()?;
        Ok(value)
    }

    /// Reads a line value and joins the values of the `CONT` / `CONC` lines that follow.
    ///
    /// `CONT` values are appended after a `'\n'`, `CONC` values are appended directly.
    /// Reading stops at a level token not greater than `level`, at any other tag, or
    /// at the end of input; any other token is a `GedcomError::ParseError`.
    fn take_continued_text(&mut self, level: u8) -> Result<String, GedcomError> {
        let head = self.take_line_value()?;
        let mut text = String::with_capacity(head.len() + 16);
        text.push_str(&head);

        loop {
            // Decide what the current token means before touching the tokenizer again.
            let newline_first = match &self.current_token {
                Token::Level(depth) if *depth <= level => break,
                Token::Level(_) => {
                    self.next_token()?;
                    continue;
                }
                Token::Tag(tag) => match &**tag {
                    "CONT" => true,
                    "CONC" => false,
                    _ => break,
                },
                Token::EOF => break,
                _ => {
                    return Err(GedcomError::ParseError {
                        line: self.line,
                        message: format!("Unhandled Continuation Token: {:?}", self.current_token),
                    })
                }
            };

            if newline_first {
                text.push('\n');
            }
            let piece = self.take_line_value()?;
            text.push_str(&piece);
        }

        Ok(text)
    }

    /// Returns a short `line N:` prefix built from the current line counter.
    fn debug(&self) -> String {
        let current = self.line;
        format!("line {}:", current)
    }
}