use crate::core::{
CueStr,
error::{CueStrError, CueStrErrorKind},
};
/// U+FEFF, the zero width no-break space (also used as the byte order mark).
///
/// `char::is_whitespace` does not treat this character as whitespace, so the
/// tokenizer's whitespace skipping compares against it explicitly.
const ZWNBP: char = '\u{feff}';
/// A line/column location inside the tokenizer's input buffer.
///
/// Both coordinates are zero-based: a fresh tokenizer starts at line 0,
/// column 0, and the column is reset to 0 every time a line feed is consumed.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
pub struct Position {
    /// Number of line feeds consumed so far.
    pub line: usize,
    /// Number of characters (not bytes) consumed on the current line.
    pub column: usize,
}
/// A lexical unit produced by [`Tokenizer::next_token`].
#[derive(Debug)]
pub enum Token<'a> {
/// A quoted or unquoted run of text, borrowed from the input buffer.
Text { value: CueStr<'a> },
/// A single `\n` line feed.
LF,
}
impl<'a> From<CueStr<'a>> for Token<'a> {
fn from(value: CueStr<'a>) -> Self {
Self::Text { value }
}
}
/// A cursor over a borrowed string that splits it into [`Token`]s.
///
/// The tokenizer never copies the input: every text token it returns borrows
/// from `buffer`.
#[derive(Debug, Clone)]
pub struct Tokenizer<'a> {
    /// The complete input being tokenized.
    buffer: &'a str,
    /// Line/column of the cursor, counted in characters.
    position: Position,
    /// Byte offset of the cursor into `buffer`.
    cursor_index: usize,
}
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer positioned at the start of `buffer`
    /// (byte offset 0, line 0, column 0).
    pub const fn new(buffer: &'a str) -> Self {
        Self {
            buffer,
            position: Position { line: 0, column: 0 },
            cursor_index: 0,
        }
    }

    /// Returns the current line/column of the cursor.
    #[inline]
    pub const fn position(&self) -> &Position {
        &self.position
    }

    /// Returns the cursor's byte offset into the buffer.
    #[inline]
    pub const fn cursor_position(&self) -> usize {
        self.cursor_index
    }

    /// Returns a copy of the tokenizer's current state.
    ///
    /// Produces the same value as `clone()`, but is usable in `const` contexts.
    pub const fn snapshot(&self) -> Self {
        Self {
            buffer: self.buffer,
            // `Position` is `Copy`, so no field-by-field rebuild is needed.
            position: self.position,
            cursor_index: self.cursor_index,
        }
    }

    /// Returns the whole input buffer, regardless of the cursor position.
    #[inline]
    pub const fn as_raw_buffer(&self) -> &'a str {
        self.buffer
    }

    /// Reads the next token, or returns `Ok(None)` at the end of the input.
    ///
    /// Whitespace other than `'\n'` (and any U+FEFF) is skipped both before
    /// and after the token.
    ///
    /// # Errors
    ///
    /// Fails when a quoted string is malformed:
    /// * `MissingEndingQuote` if a line feed or the end of the input is reached
    ///   before the closing quote;
    /// * `UnescapedSpecialChar` if a backslash is followed by anything other
    ///   than `"` or `\`.
    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, CueStrError> {
        self.eat_whitespace();

        // `'\n'` and `'"'` are ASCII, so inspecting one byte is enough to
        // dispatch; `None` means the cursor has reached the end of the buffer.
        let next_byte = self.buffer.as_bytes().get(self.cursor_index).copied();
        let token: Token<'a> = match next_byte {
            None => return Ok(None),
            Some(b'\n') => self.line_feed(),
            Some(b'"') => self.quoted_str()?.into(),
            Some(_) => self.regular_str()?.into(),
        };

        self.eat_whitespace();
        Ok(Some(token))
    }

    /// Moves the cursor past `c` and counts it as one column.
    ///
    /// Must not be used for `'\n'`; line feeds go through `line_feed` so the
    /// line counter stays correct.
    #[inline]
    fn advance(&mut self, c: char) {
        self.cursor_index += c.len_utf8();
        self.position.column += 1;
    }

    /// Skips whitespace and U+FEFF characters, stopping at `'\n'` (which is a
    /// token of its own) or at the first other character.
    fn eat_whitespace(&mut self) {
        let buffer: &'a str = self.buffer;
        let skippable = buffer[self.cursor_index..]
            .chars()
            .take_while(|&c| c != '\n' && (c.is_whitespace() || c == ZWNBP));
        for c in skippable {
            self.advance(c);
        }
    }

    /// Consumes the `'\n'` under the cursor and moves to the start of the
    /// next line.
    #[inline]
    fn line_feed(&mut self) -> Token<'a> {
        self.cursor_index += '\n'.len_utf8();
        self.position.line += 1;
        self.position.column = 0;
        Token::LF
    }

    /// Scans a double-quoted string starting at the cursor.
    ///
    /// The returned slice includes both surrounding quotes. It is tagged as
    /// `QuotedTextWithEscape` when at least one backslash escape (`\"` or
    /// `\\`) was seen, and as `QuotedText` otherwise.
    ///
    /// A line feed is never consumed here: on error the cursor stops in front
    /// of it, so line/column stay in sync with the cursor and a later
    /// `next_token` call reports the line feed normally.
    ///
    /// # Errors
    ///
    /// * `MissingEndingQuote` if a line feed or the end of the input comes
    ///   before the closing quote;
    /// * `UnescapedSpecialChar` if a backslash is followed by anything other
    ///   than `"` or `\`.
    #[inline]
    fn quoted_str(&mut self) -> Result<CueStr<'a>, CueStrError> {
        let buffer: &'a str = self.buffer;
        let start = self.cursor_index;
        let mut chars = buffer[start..].chars();
        let mut has_escape = false;

        // Opening quote: `next_token` only dispatches here when it is present.
        if let Some(quote) = chars.next() {
            self.advance(quote);
        }

        loop {
            match chars.next() {
                Some('"') => {
                    self.advance('"');
                    let raw = &buffer[start..self.cursor_index];
                    return Ok(if has_escape {
                        CueStr::QuotedTextWithEscape(raw)
                    } else {
                        CueStr::QuotedText(raw)
                    });
                }
                Some('\\') => {
                    self.advance('\\');
                    has_escape = true;
                    match chars.next() {
                        Some(escaped @ ('"' | '\\')) => self.advance(escaped),
                        Some(other) => {
                            // Leave a line feed in place (see the doc comment).
                            if other != '\n' {
                                self.advance(other);
                            }
                            return Err(CueStrError::new(
                                CueStrErrorKind::UnescapedSpecialChar,
                            ));
                        }
                        None => {
                            return Err(CueStrError::new(CueStrErrorKind::MissingEndingQuote));
                        }
                    }
                }
                Some('\n') | None => {
                    return Err(CueStrError::new(CueStrErrorKind::MissingEndingQuote));
                }
                Some(c) => self.advance(c),
            }
        }
    }

    /// Scans an unquoted run of text: everything from the cursor up to, but
    /// not including, the next whitespace character or the end of the input.
    ///
    /// Never fails; the `Result` keeps the signature parallel to `quoted_str`.
    #[inline]
    fn regular_str(&mut self) -> Result<CueStr<'a>, CueStrError> {
        let buffer: &'a str = self.buffer;
        let start = self.cursor_index;
        for c in buffer[start..].chars().take_while(|c| !c.is_whitespace()) {
            self.advance(c);
        }
        Ok(CueStr::Text(&buffer[start..self.cursor_index]))
    }
}