use crate::contractions;
use crate::kind::Kind;
use crate::lex::{self, Lexicon, is_apostrophe};
use std::io::{self, BufRead, Bytes};
/// Coarse classification of a piece of input produced by the parser.
///
/// Equality on this enum is total, so `Eq` is derived alongside `PartialEq`;
/// this lets the type be used wherever a full equivalence relation is required.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Chunk {
    /// Word-like content: alphanumeric characters and apostrophes.
    Text,
    /// Any character that is neither word-like nor a boundary (punctuation etc.).
    Symbol,
    /// Whitespace, control characters, and zero-width separators.
    Boundary,
}
/// Decodes a byte stream into `char`s one at a time.
struct CharSplitter<R>
where
    R: BufRead,
{
    /// Byte-wise iterator over the underlying reader.
    bytes: Bytes<R>,
    /// Scratch buffer holding the bytes of the character currently being decoded.
    code: Vec<u8>,
}
/// Streaming parser that turns a reader into a sequence of classified chunks.
///
/// Iterating over a `Parser` yields `(Chunk, String, Kind)` triples, or an
/// `io::Error` when the underlying character source fails.
pub struct Parser<R>
where
    R: BufRead,
{
    /// Lexicon consulted when classifying words.
    lex: &'static Lexicon,
    /// Source of decoded characters.
    splitter: CharSplitter<R>,
    /// Characters of the word currently being accumulated.
    text: String,
    /// Chunks already produced but not yet handed out by the iterator.
    chunks: Vec<Result<(Chunk, String, Kind), io::Error>>,
}
impl<R> CharSplitter<R>
where
    R: BufRead,
{
    /// Creates a splitter that decodes characters from `r`.
    fn new(r: R) -> Self {
        CharSplitter {
            bytes: r.bytes(),
            // A UTF-8 encoded scalar value never needs more than four bytes.
            code: Vec::with_capacity(4),
        }
    }

    /// Reads and decodes the next UTF-8 character from the reader.
    ///
    /// Returns `None` at end of input, `Some(Ok(c))` for a decoded character,
    /// and `Some(Err(_))` for either an I/O error from the reader or an
    /// invalid / truncated UTF-8 sequence.
    ///
    /// Decoding stops as soon as the bytes read so far can no longer become a
    /// valid character, so a stray invalid byte does not cause the valid
    /// characters that follow it to be consumed as part of the same error.
    /// The byte that exposes an invalid multi-byte sequence has already been
    /// consumed when the error is detected and is reported with that sequence.
    fn next_char(&mut self) -> Option<Result<char, io::Error>> {
        self.code.clear();
        while self.code.len() < 4 {
            match self.bytes.next() {
                Some(Err(e)) => return Some(Err(e)),
                Some(Ok(b)) => {
                    self.code.push(b);
                    match std::str::from_utf8(&self.code) {
                        Ok(s) => {
                            if let Some(c) = s.chars().next() {
                                return Some(Ok(c));
                            }
                        }
                        // `error_len() == Some(_)` means the bytes are definitely
                        // invalid; reading more would only swallow later input.
                        Err(e) if e.error_len().is_some() => break,
                        // `None` means the sequence is merely incomplete so far.
                        Err(_) => {}
                    }
                }
                None => {
                    if self.code.is_empty() {
                        return None;
                    }
                    // Input ended in the middle of a multi-byte sequence.
                    break;
                }
            }
        }
        Some(Err(io::Error::other("Invalid UTF-8")))
    }
}
/// Exposes the splitter as an iterator over decoded characters.
impl<R: BufRead> Iterator for CharSplitter<R> {
    type Item = io::Result<char>;

    /// Yields the next decoded character, delegating to `next_char`.
    fn next(&mut self) -> Option<Self::Item> {
        Self::next_char(self)
    }
}
impl Chunk {
    /// Classifies a single character.
    ///
    /// Boundary characters take precedence; alphanumerics and apostrophes are
    /// `Text`; everything else is a `Symbol`.
    fn from_char(c: char) -> Self {
        match c {
            ch if is_boundary(ch) => Chunk::Boundary,
            ch if ch.is_alphanumeric() || is_apostrophe(ch) => Chunk::Text,
            _ => Chunk::Symbol,
        }
    }
}
/// Returns `true` for characters that separate chunks: whitespace, control
/// characters, the zero-width space (U+200B) and the byte-order mark (U+FEFF).
fn is_boundary(c: char) -> bool {
    match c {
        '\u{200B}' | '\u{FEFF}' => true,
        other => other.is_whitespace() || other.is_control(),
    }
}
/// Returns `true` when a `.` may be appended to `word` as part of the word
/// itself (e.g. while building `U.S.`).
///
/// The word must be non-empty, must not already end with a dot, and must
/// consist solely of uppercase letters and dots.
fn is_dot_appendable(word: &str) -> bool {
    if word.is_empty() || word.ends_with('.') {
        return false;
    }
    word.chars().all(|ch| ch == '.' || ch.is_uppercase())
}
/// Iterates over the chunks parsed from the reader, in input order.
impl<R: BufRead> Iterator for Parser<R> {
    type Item = Result<(Chunk, String, Kind), io::Error>;

    /// Returns the next queued chunk, reading more input when the queue is empty.
    ///
    /// Yields `None` once a read attempt produces no chunks at all.
    fn next(&mut self) -> Option<Self::Item> {
        if self.chunks.is_empty() {
            self.read_chunk();
        }
        if self.chunks.is_empty() {
            return None;
        }
        // Front removal keeps chunks in the order they were produced.
        // NOTE(review): `Vec::remove(0)` shifts the remaining elements; a
        // `VecDeque` would avoid that if the queue ever grows large.
        Some(self.chunks.remove(0))
    }
}
impl<R> Parser<R>
where
    R: BufRead,
{
    /// Creates a parser over `reader` that classifies words with the built-in lexicon.
    pub fn new(reader: R) -> Self {
        Parser {
            lex: lex::builtin(),
            splitter: CharSplitter::new(reader),
            text: String::new(),
            chunks: Vec::new(),
        }
    }

    /// Reads characters until at least one chunk boundary is reached and
    /// queues the resulting chunks.
    ///
    /// Stops after a boundary character, a stand-alone symbol, a read error,
    /// or end of input. On a read error the error is queued immediately and
    /// any partially accumulated word stays in `self.text` for the next call.
    fn read_chunk(&mut self) {
        // A `for` loop would keep `self.splitter` borrowed across the calls
        // to the `push_*` helpers, so the iterator is advanced manually.
        while let Some(item) = self.splitter.next() {
            let c = match item {
                Ok(c) => c,
                Err(e) => {
                    self.chunks.push(Err(e));
                    return;
                }
            };
            match Chunk::from_char(c) {
                Chunk::Text => self.text.push(c),
                Chunk::Boundary => {
                    self.push_text();
                    self.push_boundary(c);
                    return;
                }
                Chunk::Symbol => {
                    // A hyphen joins a word in progress (but never doubles up),
                    // and a dot joins an all-caps abbreviation in progress.
                    let joins_word = match c {
                        '-' => !self.text.is_empty() && !self.text.ends_with('-'),
                        '.' => is_dot_appendable(&self.text),
                        _ => false,
                    };
                    if joins_word {
                        self.text.push(c);
                        continue;
                    }
                    self.push_text();
                    self.push_symbol(c);
                    return;
                }
            }
        }
        // End of input: flush whatever word was still being accumulated.
        self.push_text();
    }

    /// Flushes the accumulated word (if any) into the chunk queue.
    ///
    /// A word longer than two characters whose only dot is a trailing one is
    /// emitted as the word followed by a separate `.` symbol.
    fn push_text(&mut self) {
        let mut word = std::mem::take(&mut self.text);
        if word.is_empty() {
            return;
        }
        let trailing_period = word.ends_with('.')
            && word.chars().count() > 2
            && word.chars().filter(|&ch| ch == '.').count() == 1;
        if trailing_period {
            word.pop();
        }
        self.push_chunk(Chunk::Text, word);
        if trailing_period {
            self.push_symbol('.');
        }
    }

    /// Queues a single-character symbol chunk.
    fn push_symbol(&mut self, c: char) {
        self.push_chunk(Chunk::Symbol, c.to_string());
    }

    /// Queues a single-character boundary chunk.
    fn push_boundary(&mut self, c: char) {
        self.push_chunk(Chunk::Boundary, c.to_string());
    }

    /// Queues `txt`, splitting it on hyphens when it is not a known unit.
    ///
    /// Single characters, lexicon entries, and text without hyphens or
    /// apostrophes are queued unchanged under `chunk`. Anything else is split
    /// on `-`: each non-empty part is queued as `Chunk::Text` (with
    /// contraction-aware classification) and the hyphens as symbols.
    fn push_chunk(&mut self, chunk: Chunk, txt: String) {
        let keep_whole = txt.chars().count() == 1
            || self.lex.contains(&txt)
            || !txt.chars().any(is_splittable);
        if keep_whole {
            self.push_word(chunk, txt);
            return;
        }
        for (idx, part) in txt.split('-').enumerate() {
            if idx > 0 {
                self.push_word(Chunk::Symbol, String::from('-'));
            }
            self.push_word_check_contraction(part);
        }
    }

    /// Queues a non-empty `word` as text, classifying it as a possible contraction.
    fn push_word_check_contraction(&mut self, word: &str) {
        if word.is_empty() {
            return;
        }
        let kind = self.contraction_kind(word);
        self.chunks.push(Ok((Chunk::Text, word.to_owned(), kind)));
    }

    /// Classifies `word`, looking inside contractions when it has an apostrophe.
    ///
    /// Lexicon entries are `Kind::Lexicon`. For a word containing an
    /// apostrophe, every non-empty part returned by `contractions::split` is
    /// classified; if any part is unknown the whole word is unknown, otherwise
    /// the kind of the last part is returned. Words without an apostrophe are
    /// classified via `Kind::from`.
    fn contraction_kind(&self, word: &str) -> Kind {
        if self.lex.contains(word) {
            return Kind::Lexicon;
        }
        if !word.chars().any(is_apostrophe) {
            return Kind::from(word);
        }
        // Stays `Unknown` when the split yields no non-empty parts.
        let mut last = Kind::Unknown;
        for part in contractions::split(word) {
            if part.is_empty() {
                continue;
            }
            let kind = self.word_kind(&part);
            if kind == Kind::Unknown {
                return Kind::Unknown;
            }
            last = kind;
        }
        last
    }

    /// Classifies a plain word: lexicon entries first, otherwise `Kind::from`.
    fn word_kind(&self, word: &str) -> Kind {
        if !self.lex.contains(word) {
            return Kind::from(word);
        }
        Kind::Lexicon
    }

    /// Classifies `word` and queues it under the given chunk type.
    fn push_word(&mut self, chunk: Chunk, word: String) {
        let kind = self.word_kind(&word);
        let entry = (chunk, word, kind);
        self.chunks.push(Ok(entry));
    }
}
/// Returns `true` for characters that may require a word to be split further:
/// a hyphen or any apostrophe variant.
fn is_splittable(c: char) -> bool {
    match c {
        '-' => true,
        other => is_apostrophe(other),
    }
}