#![feature(core)]
#![feature(io)]
use std::vec::Vec;
use std::iter::Iterator;
use std::io;
use std::io::{Read, ReadExt};
/// Streaming tokenizer: splits the characters of any `Read` source into
/// tokens delimited by a caller-supplied set of separator characters.
///
/// NOTE(review): relies on `io::Chars`/the unstable `io` feature, which
/// were removed from std — this type only builds on an old nightly.
pub struct Tokenizer<R: Read> {
    /// Char-by-char decoded view over the underlying reader.
    chars: io::Chars<R>,
    /// Characters that delimit tokens; they are never part of a token.
    separators: Vec<char>,
    /// Scratch buffer holding the token currently being assembled.
    current: String,
}
impl<R: Read> Tokenizer<R> {
    /// Builds a tokenizer over `reader` that splits on any character
    /// contained in `separators`.
    pub fn new(reader: R, separators: Vec<char>) -> Tokenizer<R> {
        // Take ownership of the reader as a char-decoding iterator up front.
        let chars = reader.chars();
        Tokenizer {
            chars: chars,
            separators: separators,
            current: String::new(),
        }
    }
}
// NOTE(review): `'a` is not constrained by the trait, the `Self` type, or any
// associated type, so this impl is rejected by the compiler (E0207). Yielding
// `&str` borrowed from `self.current` out of `next(&mut self)` is a "lending
// iterator" shape that `std::iter::Iterator` cannot express; yielding an
// owned `String` (e.g. via `mem::replace` of the buffer) would fix it, at the
// cost of changing `Item` — which the SentenceSplitter caller below consumes.
impl<'a, R: Read> Iterator for Tokenizer<R> {
type Item = Result<&'a str, io::CharsError>;
// Yields the next separator-delimited token, or the I/O/decoding error that
// interrupted it. Runs of consecutive separators produce no empty tokens.
fn next(&mut self) -> Option<Result<&str, io::CharsError>> {
// Reuse the scratch buffer; any slice returned by a previous call would be
// invalidated here — exactly why the borrowed Item cannot be sound.
self.current.clear();
'main: loop {
match self.chars.next() {
// Underlying stream exhausted; fall through to flush the last token.
None => break,
Some(res) => {
match res {
Ok(c) => {
// Linear scan of the separator set for each char.
for &t in self.separators.iter() {
if c == t {
// A separator terminates the pending token, if any was
// accumulated; otherwise just keep scanning past it.
if !self.current.is_empty() {
return Some(Ok(self.current.as_slice()));
}
continue 'main; }
}
// Not a separator: the char belongs to the current token.
self.current.push(c);
}
// Propagate read/UTF-8 errors to the caller immediately.
Err(e) => return Some(Err(e)),
}
}
}
}
// EOF: emit whatever was buffered as the final token, if anything.
if !self.current.is_empty() {
Some(Ok(self.current.as_slice()))
} else {
None }
}
// NOTE(review): forwarding the char-level hint overstates the lower bound —
// there are at most as many tokens as chars, but possibly far fewer, so only
// the upper bound is honest here. A correct hint would be (0, chars upper).
fn size_hint(&self) -> (usize, Option<usize>) {
self.chars.size_hint()
}
}
/// Groups the tokens of a `Tokenizer` into sentences, ending a sentence at
/// any token carrying one of the configured terminator suffixes, with
/// special handling for quoted spans.
pub struct SentenceSplitter<'a, R: Read> {
    /// Source of whitespace-free tokens to be regrouped.
    tokenizer: Tokenizer<R>,
    /// Suffixes (e.g. ".", "!", "?") that end a sentence.
    terminators: Vec<&'a str>,
    /// Markers (e.g. "\"") that open/close a quoted span.
    quote_types: Vec<&'a str>,
    /// Scratch buffer holding the sentence currently being assembled.
    current: String,
}
impl<'a, R: Read> SentenceSplitter<'a, R> {
    /// Builds a splitter over `source` that closes a sentence on any
    /// `terminators` suffix and tracks quoted spans opened by `quote_types`.
    pub fn new(
        source: Tokenizer<R>,
        terminators: Vec<&'a str>,
        quote_types: Vec<&'a str>,
    ) -> SentenceSplitter<'a, R> {
        SentenceSplitter {
            quote_types: quote_types,
            terminators: terminators,
            tokenizer: source,
            current: String::new(),
        }
    }
}
// NOTE(review): same defect as the Tokenizer impl — `'b` is unconstrained
// (E0207) and the Item borrows `self.current` across `next(&mut self)` calls,
// which std's `Iterator` cannot express. An owned-`String` Item would fix it.
impl <'a, 'b, R: Read> Iterator for SentenceSplitter<'a, R> {
type Item = Result<&'b str, io::CharsError>;
// Accumulates tokens (space-joined) until one ends with a terminator, the
// source is exhausted, or an error arrives.
fn next(&mut self) -> Option<Result<&str, io::CharsError>> {
self.current.clear();
// Non-empty while inside a quoted span; holds the marker to close on.
// Quote state does not persist across calls — a quote left open when a
// sentence is returned is forgotten on the next call.
let mut quote = "";
'main: loop {
match self.tokenizer.next() {
Some(res) => {
match res {
Ok(s) => {
self.current.push_str(s);
if !quote.is_empty() {
// Inside a quote: terminators are ignored; the sentence closes
// only when a token ends with the (identical) closing marker.
if s.ends_with(quote) {
return Some(Ok(self.current.as_slice()))
}
}
else {
for &qt in self.quote_types.iter() {
if s.starts_with(qt) {
// NOTE(review): a single token that both starts and ends with
// the marker (e.g. `"word"`) ends the sentence immediately,
// even with no terminator — confirm this is intended.
if s.ends_with(qt) { return Some(Ok(self.current.as_slice()));
}
// Enter quote mode and keep accumulating.
quote = qt;
self.current.push_str(" ");
continue 'main;
}
}
// A trailing ".." is treated as an ellipsis, not a sentence end,
// so "..."-style tokens never match the "." terminator below.
if s.ends_with("..") { self.current.push_str(" ");
continue;
}
for &t in self.terminators.iter() {
if s.ends_with(t) {
return Some(Ok(self.current.as_slice()));
}
}
}
// Token did not close the sentence: join with a single space.
self.current.push_str(" ");
}
// Propagate tokenizer-level I/O errors unchanged.
Err(e) => return Some(Err(e)),
}
},
None => {
// Source exhausted: flush a trailing, unterminated sentence if any.
if self.current.len() != 0 {
return Some(Ok(self.current.as_slice()));
} else {
return None;
}
}
}
}
}
// NOTE(review): forwards the token-level hint; the lower bound overstates
// the number of sentences (many tokens per sentence), so only the upper
// bound is meaningful.
fn size_hint(&self) -> (usize, Option<usize>) {
self.tokenizer.size_hint()
}
}