use unicode_normalization::UnicodeNormalization;
use crate::{Token, Transcriber};
use super::{ParseAction, TengwarMode};
/// Lowercase one character in `const` context.
///
/// `char::to_lowercase` is not callable in a `const fn`, so the uppercase
/// non-ASCII letters handled by the transcription modes are translated
/// through an explicit table; any other character falls back to
/// `char::to_ascii_lowercase`, which leaves non-ASCII input unchanged.
const fn to_lower(c: char) -> char {
    match c {
        // Acute-accented vowels.
        'Á' => 'á', 'É' => 'é', 'Í' => 'í', 'Ó' => 'ó', 'Ú' => 'ú', 'Ý' => 'ý',
        // Circumflex-accented vowels.
        'Â' => 'â', 'Ê' => 'ê', 'Î' => 'î', 'Ô' => 'ô', 'Û' => 'û', 'Ŷ' => 'ŷ',
        // Diaeresis-marked vowels.
        'Ä' => 'ä', 'Ë' => 'ë', 'Ï' => 'ï', 'Ö' => 'ö', 'Ü' => 'ü', 'Ÿ' => 'ÿ',
        // Ligatures.
        'Æ' => 'æ', 'Œ' => 'œ',
        // Non-ASCII consonant letters.
        'Ñ' => 'ñ', 'Ð' => 'ð', 'Þ' => 'þ', 'Θ' => 'θ', 'Φ' => 'φ',
        'Ɣ' => 'ɣ', 'Ʒ' => 'ʒ',
        // Everything else: ASCII uppercase is lowered, all other characters
        // pass through untouched.
        _ => c.to_ascii_lowercase(),
    }
}
/// Outcome of one call to [`Tokenizer::step`].
#[derive(Clone, Debug)]
enum Step {
    /// Progress was made (window narrowed or input consumed), but no token
    /// is ready yet; step again.
    Incomplete,
    /// The input is exhausted and the mode holds no partial token.
    Exhausted,
    /// A finished token was produced.
    Complete(Token),
}
/// Iterator adapter that parses a character sequence into [`Token`]s,
/// driven by a [`TengwarMode`] implementation.
#[derive(Debug)]
pub struct Tokenizer<M: TengwarMode> {
    /// The original input characters, casing preserved.
    chars: Vec<char>,
    /// Lowercased copy of `chars` (via `to_lower`), used for matching.
    lower: Vec<char>,
    /// Index of the next character to be consumed.
    head: usize,
    /// Current width of the lookahead window starting at `head`.
    size: usize,
    /// Number of upcoming characters to emit verbatim as [`Token::Char`].
    skip: usize,
    /// The mode implementation driving the parse.
    pub mode: M,
    /// One-token lookahead buffer used by the [`Iterator`] impl so the mode
    /// can finalize each token against its successor.
    next: Option<Token>,
}
impl<M: TengwarMode> Tokenizer<M> {
pub fn new(chars: Vec<char>, mode: M) -> Self {
let size: usize = chars.len().min(M::MAX_CHUNK);
let mut lower = chars.clone();
for char in &mut lower {
*char = to_lower(*char);
}
Self {
chars,
lower,
head: 0,
size,
skip: 0,
mode,
next: None,
}
}
pub fn from_str(s: impl AsRef<str>) -> Self
where M: Default,
{
Self::with_mode(s, M::default())
}
pub fn with_mode(s: impl AsRef<str>, mode: M) -> Self {
Self::new(s.as_ref().nfc().collect(), mode)
}
pub fn into_transcriber(self) -> Transcriber<M> { self.into() }
pub fn window(&self) -> &[char] {
let end: usize = self.chars.len().min(self.head + self.size);
&self.chars[self.head..end]
}
pub fn window_lower(&self) -> &[char] {
let end: usize = self.lower.len().min(self.head + self.size);
&self.lower[self.head..end]
}
}
impl<M: TengwarMode> Tokenizer<M> {
    /// Move the read head forward by `n` characters and reset the lookahead
    /// window to its maximum width (clamped by the total input length).
    fn advance_head(&mut self, n: usize) {
        self.head += n;
        self.size = self.chars.len().min(M::MAX_CHUNK);
    }

    /// Shrink the lookahead window by one, so the next `process` call sees
    /// a shorter chunk.
    #[inline]
    fn narrow_window(&mut self) { self.size -= 1; }

    /// Increase the count of upcoming characters to pass through verbatim.
    #[inline]
    fn skip_count_add(&mut self, n: usize) { self.skip += n; }

    /// Consume one unit of the pending skip count.
    #[inline]
    fn skip_count_dec(&mut self) { self.skip -= 1; }

    /// Run one iteration of the tokenizing state machine.
    ///
    /// Returns [`Step::Complete`] when a token was produced,
    /// [`Step::Incomplete`] when another iteration is needed, and
    /// [`Step::Exhausted`] when the input is finished and the mode holds no
    /// partial token.
    fn step(&mut self) -> Step {
        // Snapshot the cursor state; `mode` stays mutably borrowed below.
        let data: &[char] = &self.lower;
        let mode: &mut M = &mut self.mode;
        let head: usize = self.head;
        let size: usize = self.size;
        let skip: usize = self.skip;
        let len: usize = data.len();

        if len <= head {
            // Input exhausted: flush whatever the mode still holds.
            match mode.finish_current() {
                Some(token) => Step::Complete(token),
                None => Step::Exhausted,
            }
        } else {
            if 0 < skip {
                // Characters marked for verbatim pass-through. Flush any
                // pending token from the mode first, then emit the
                // ORIGINAL-case character (not the lowered copy).
                if let Some(token) = mode.finish_current() {
                    // advance_head(0) only resets the window width.
                    self.advance_head(0);
                    Step::Complete(token)
                } else {
                    self.advance_head(1);
                    self.skip_count_dec();
                    Step::Complete(Token::Char(self.chars[head]))
                }
            }
            else if 0 < size {
                // Offer the current (lowercased) chunk to the mode.
                let end: usize = len.min(head + size);
                let chunk: &[char] = &data[head..end];

                match mode.process(chunk) {
                    ParseAction::MatchedNone => {
                        // No match at this width; retry with a shorter chunk.
                        self.narrow_window();
                        Step::Incomplete
                    }
                    ParseAction::MatchedPart(len) => {
                        // Partial match: consume it and keep stepping.
                        self.advance_head(len);
                        Step::Incomplete
                    }
                    ParseAction::MatchedToken { token, len } => {
                        // Full match: consume the matched input and emit.
                        self.advance_head(len);
                        Step::Complete(token)
                    }
                    ParseAction::Skip(n) => {
                        // Mode requests `n` chars passed through verbatim;
                        // flush its pending token first, if any.
                        let finished: Option<Token> = mode.finish_current();
                        self.skip_count_add(n);
                        match finished {
                            Some(token) => Step::Complete(token),
                            None => Step::Incomplete,
                        }
                    }
                    ParseAction::Escape { len_seq, n_skip } => {
                        // Escape sequence: consume its `len_seq` chars, then
                        // pass the following `n_skip` chars through verbatim.
                        self.advance_head(len_seq);
                        self.skip_count_add(n_skip);
                        Step::Incomplete
                    }
                }
            }
            else if let Some(token) = mode.finish_current() {
                // Window narrowed to zero with a partial token pending:
                // emit it and reset the window (head does not move).
                self.advance_head(0);
                Step::Complete(token)
            }
            else if let Some((token, len)) = mode.find_secondary(&data[head..]) {
                // Nothing matched at any width; give the mode one last
                // chance at a secondary match over the remaining input.
                self.advance_head(len);
                Step::Complete(token)
            }
            else {
                // Completely unmatched: emit the original char unchanged.
                self.advance_head(1);
                Step::Complete(Token::Char(self.chars[head]))
            }
        }
    }

    /// Repeat [`Self::step`] until a token is produced or input runs out.
    fn step_to_next(&mut self) -> Option<Token> {
        loop {
            match self.step() {
                Step::Incomplete => continue,
                Step::Exhausted => break None,
                Step::Complete(token) => break Some(token),
            }
        }
    }
}
impl<M: TengwarMode> Iterator for Tokenizer<M> {
    type Item = Token;

    /// Yield the next [`Token`], letting the mode finalize it against one
    /// token of lookahead.
    fn next(&mut self) -> Option<Self::Item> {
        // Use the buffered lookahead if there is one, otherwise parse a
        // fresh token; bail out when the input is exhausted.
        let mut current: Token = self.next.take().or_else(|| self.step_to_next())?;

        // Parse one token ahead so the mode can adjust `current` based on
        // what follows, then stash the lookahead for the next call.
        let lookahead: Option<Token> = self.step_to_next();
        self.mode.finalize(&mut current, lookahead.as_ref());
        self.next = lookahead;

        Some(current)
    }
}