use std::marker::PhantomData;
use token::Token;
use trainer::TrainingData;
use prelude::{DefinesNonPrefixCharacters, DefinesNonWordCharacters, DefinesPunctuation,
DefinesSentenceEndings};
// Bit flags for the scanner state while searching for a period context.
const STATE_SENT_END: u8 = 0b00000001; // a sentence-ending character was seen
const STATE_TOKN_BEG: u8 = 0b00000010; // whitespace following the ending was seen
const STATE_CAPT_TOK: u8 = 0b00000100; // the token after the ending is being captured
const STATE_UPDT_STT: u8 = 0b10000000; // the slice start must be advanced at the next character
const STATE_UPDT_RET: u8 = 0b01000000; // `pos` must be rewound to the token start before returning
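/// Iterates over the "period contexts" of a document: slices surrounding
/// each potential sentence-ending character, along with the byte offsets
/// needed to resume scanning.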
struct PeriodContextTokenizer<'a, P> {
doc: &'a str,
pos: usize,
params: PhantomData<P>,
}
impl<'a, P> PeriodContextTokenizer<'a, P>
where
P: DefinesNonWordCharacters + DefinesSentenceEndings,
{
#[inline(always)]
pub fn new(doc: &'a str) -> PeriodContextTokenizer<'a, P> {
        PeriodContextTokenizer {
            doc,
            pos: 0,
            params: PhantomData,
        }
}
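    /// Scans ahead from the current position to determine whether the text
    /// following a non-word character still forms a token. Returns `None` if
    /// whitespace is encountered first, otherwise `Some(pos)` where `pos` is
    /// the position of the next qualifying sentence-ending character (or the
    /// end of the document).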
fn lookahead_is_token(&self) -> Option<usize> {
let mut pos = self.pos;
while pos < self.doc.len() {
            let mut iter = self.doc[pos..].chars();
            let cur = iter.next().unwrap();
match cur {
c if c.is_whitespace() => return None,
c if P::is_sentence_ending(&c) => {
if let Some(nxt) = iter.next() {
if nxt.is_whitespace() || P::is_nonword_char(&nxt) {
break;
}
} else {
break;
}
}
_ => (),
}
pos += cur.len_utf8();
}
Some(pos)
}
}
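// Each item is `(slice, tok_start, ws_start, slice_end, len)`: the period
// context slice, the byte offset of the token following the ending, the
// offset of the whitespace preceding that token, the slice end, and the byte
// length of the last character processed.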
impl<'a, P> Iterator for PeriodContextTokenizer<'a, P>
where
P: DefinesNonWordCharacters + DefinesSentenceEndings,
{
type Item = (&'a str, usize, usize, usize, usize);
fn next(&mut self) -> Option<(&'a str, usize, usize, usize, usize)> {
let mut astart = self.pos;
let mut wstart = self.pos;
let mut nstart = self.pos;
let mut state: u8 = 0;
while self.pos < self.doc.len() {
let cur = self.doc[self.pos..].chars().next().unwrap();
            // Emits the current period context; if another sentence ending was
            // found inside the captured token, rewinds `pos` so it is rescanned.
            macro_rules! return_token(
() => (
{
let end = self.pos;
if state & STATE_UPDT_RET != 0 { self.pos = nstart; }
return Some((
&self.doc[astart..end],
nstart,
wstart,
end,
cur.len_utf8()));
}
)
);
match cur {
                // A sentence-ending character was found.
                c if P::is_sentence_ending(&c) => {
state |= STATE_SENT_END;
if state & STATE_UPDT_STT != 0 {
astart = self.pos;
state ^= STATE_UPDT_STT;
}
if state & STATE_CAPT_TOK != 0 {
state |= STATE_UPDT_RET;
}
}
                // Not yet past a sentence ending: track the slice start
                // across whitespace.
                c if state & STATE_SENT_END == 0 => {
if c.is_whitespace() {
state |= STATE_UPDT_STT;
} else if state & STATE_UPDT_STT != 0 {
astart = self.pos;
state ^= STATE_UPDT_STT;
}
}
                // Past a sentence ending, but the following token has not begun.
                c if state & STATE_SENT_END != 0 && state & STATE_TOKN_BEG == 0 => {
if c.is_whitespace() {
state |= STATE_TOKN_BEG;
wstart = self.pos;
} else if P::is_nonword_char(&c) {
self.pos += c.len_utf8();
nstart = self.pos;
match self.lookahead_is_token() {
Some(x) => self.pos = x,
None => return_token!(),
}
} else if !P::is_sentence_ending(&c) {
state ^= STATE_SENT_END;
}
}
                // Whitespace after the ending was seen: the first
                // non-whitespace character begins the captured token.
                c if state & STATE_SENT_END != 0 && state & STATE_TOKN_BEG != 0
                    && state & STATE_CAPT_TOK == 0 =>
                {
if !c.is_whitespace() {
nstart = self.pos;
state |= STATE_CAPT_TOK;
}
}
                // The captured token ended: emit the period context.
                c if state & STATE_CAPT_TOK != 0 && c.is_whitespace() => return_token!(),
_ => (),
}
self.pos += cur.len_utf8();
}
None
}
}
// Bit flags for the scanner state while building word tokens.
const NEWLINE_START: u8 = 0b00000001; // the token is preceded by a newline
const PARAGPH_START: u8 = 0b00000010; // the token is preceded by a blank line
const CAPTURE_START: u8 = 0b00000100; // a token is currently being captured
const CAPTURE_COMMA: u8 = 0b00001000; // the captured token currently ends with a comma
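/// Iterates over the word tokens of a document, recognizing multi-character
/// sequences (ellipses, dash runs) and recording whether each token starts
/// a new line or paragraph.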
pub struct WordTokenizer<'a, P> {
pos: usize,
doc: &'a str,
params: PhantomData<P>,
}
impl<'a, P> WordTokenizer<'a, P>
where
P: DefinesNonPrefixCharacters + DefinesNonWordCharacters,
{
#[inline(always)]
pub fn new(doc: &'a str) -> WordTokenizer<'a, P> {
        WordTokenizer {
            pos: 0,
            doc,
            params: PhantomData,
        }
}
}
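// Tokenization proceeds in two phases per character: first a check for
// multi-character sequences, then a flag-driven state machine over the
// remaining characters.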
impl<'a, P> Iterator for WordTokenizer<'a, P>
where
P: DefinesNonPrefixCharacters + DefinesNonWordCharacters,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
let mut state = if self.pos == 0 { NEWLINE_START } else { 0u8 };
let mut start = self.pos;
let mut is_ellipsis = false;
macro_rules! return_token(
() => (
{
                    // A trailing comma was captured: back up one byte so the
                    // comma is excluded from the emitted token.
                    if state & CAPTURE_COMMA != 0 {
                        self.pos -= 1;
                    }
return Some(Token::new(
&self.doc[start..self.pos],
is_ellipsis,
state & PARAGPH_START != 0,
state & NEWLINE_START != 0));
}
)
);
while self.pos < self.doc.len() {
let cur = self.doc[self.pos..].chars().next().unwrap();
match cur {
                // '.' or '-' may begin a multi-character sequence (an ellipsis
                // or a run of dashes) that is emitted as a token of its own.
                '.' | '-' => {
                    if let Some(s) = is_multi_char(self.doc, self.pos) {
                        // Flush any partially captured token first.
                        if state & CAPTURE_START != 0 || state & CAPTURE_COMMA != 0 {
                            return_token!()
                        }
                        start = self.pos;
                        is_ellipsis = s.ends_with('.');
                        self.pos += s.len();
                        return_token!()
                    }
                }
_ => (),
}
match cur {
                // Currently capturing a token.
                c if state & CAPTURE_START != 0 => {
match c {
_ if c.is_whitespace() || P::is_nonword_char(&c) => return_token!(),
_ if c.is_alphanumeric() => {
if state & CAPTURE_COMMA != 0 {
state ^= CAPTURE_COMMA;
}
}
',' => {
state |= CAPTURE_COMMA;
}
_ => {
if state & CAPTURE_COMMA != 0 {
state ^= CAPTURE_COMMA;
}
}
}
}
                // A character that can begin a token: start capturing.
                c if state & CAPTURE_START == 0 && !c.is_whitespace() && !P::is_nonprefix_char(&c) => {
start = self.pos;
state |= CAPTURE_START;
}
                // A non-prefix, non-whitespace character: emit it as a
                // single-character token.
                c if !c.is_whitespace() => {
start = self.pos;
self.pos += c.len_utf8();
return_token!()
}
                // The first newline marks a line start; a second newline before
                // any token is captured marks a paragraph start.
                '\n' if state & NEWLINE_START == 0 => state |= NEWLINE_START,
                '\n' => state |= PARAGPH_START,
_ => (),
}
self.pos += cur.len_utf8();
}
        // Flush the token still being captured at end of input.
        if state & CAPTURE_START != 0 {
return_token!()
}
None
}
}
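/// Iterates over the `(start, end)` byte offsets of the sentences in a
/// document, using trained data to decide which period contexts truly end
/// a sentence.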
pub struct SentenceByteOffsetTokenizer<'a, P> {
doc: &'a str,
data: &'a TrainingData,
iter: PeriodContextTokenizer<'a, P>,
last: usize,
params: PhantomData<P>,
}
impl<'a, P> SentenceByteOffsetTokenizer<'a, P>
where
P: DefinesNonPrefixCharacters
+ DefinesNonWordCharacters
+ DefinesPunctuation
+ DefinesSentenceEndings,
{
#[inline(always)]
pub fn new(doc: &'a str, data: &'a TrainingData) -> SentenceByteOffsetTokenizer<'a, P> {
        SentenceByteOffsetTokenizer {
            doc,
            iter: PeriodContextTokenizer::new(doc),
            data,
            last: 0,
            params: PhantomData,
        }
}
}
impl<'a, P> Iterator for SentenceByteOffsetTokenizer<'a, P>
where
P: DefinesNonPrefixCharacters
+ DefinesNonWordCharacters
+ DefinesPunctuation
+ DefinesSentenceEndings,
{
type Item = (usize, usize);
fn next(&mut self) -> Option<(usize, usize)> {
        while let Some((slice, tok_start, ws_start, slice_end, len)) = self.iter.next() {
            let mut prv = None;
            let mut has_sentence_break = false;
            // Annotate the word tokens within the period context to decide
            // whether one of them ends a sentence.
            for mut t in WordTokenizer::<P>::new(slice) {
                ::util::annotate_first_pass::<P>(&mut t, self.data);
                if let Some(mut p) = prv {
                    annotate_second_pass::<P>(&mut t, &mut p, self.data);
                    if p.is_sentence_break() {
                        has_sentence_break = true;
                        break;
                    }
                }
                prv = Some(t);
            }
            if has_sentence_break {
                // A sentence break was found: emit the span from the end of
                // the previous sentence up to the break.
                let start = self.last;
                return if tok_start == slice_end {
                    self.last = slice_end - len;
                    Some((start, self.last))
                } else {
                    self.last = tok_start;
                    Some((start, ws_start))
                };
            }
}
        // Emit the trailing sentence once, then advance `pos` past the end of
        // the document so later calls return `None`.
        if self.iter.pos == self.doc.len() {
self.iter.pos += 1;
Some((self.last, self.doc.len()))
} else {
None
}
}
}
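/// Iterates over the sentences of a document as string slices.
///
/// A minimal usage sketch (mirroring the tests below; assumes trained
/// English data is available):
///
/// ```ignore
/// let data = TrainingData::english();
/// for sentence in SentenceTokenizer::<::params::Standard>::new(doc, &data) {
///     println!("{}", sentence);
/// }
/// ```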
pub struct SentenceTokenizer<'a, P> {
doc: &'a str,
iter: SentenceByteOffsetTokenizer<'a, P>,
params: PhantomData<P>,
}
impl<'a, P> SentenceTokenizer<'a, P>
where
P: DefinesNonPrefixCharacters
+ DefinesNonWordCharacters
+ DefinesPunctuation
+ DefinesSentenceEndings,
{
#[inline(always)]
pub fn new(doc: &'a str, data: &'a TrainingData) -> SentenceTokenizer<'a, P> {
        SentenceTokenizer {
            doc,
            iter: SentenceByteOffsetTokenizer::new(doc, data),
            params: PhantomData,
        }
}
}
impl<'a, P> Iterator for SentenceTokenizer<'a, P>
where
P: DefinesNonPrefixCharacters
+ DefinesNonWordCharacters
+ DefinesPunctuation
+ DefinesSentenceEndings,
{
type Item = &'a str;
#[inline]
fn next(&mut self) -> Option<&'a str> {
self.iter.next().map(|(start, end)| &self.doc[start..end])
}
}
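/// Applies the orthographic heuristic to a token: `Some(true)` if its
/// orthographic context indicates it starts a sentence, `Some(false)` if it
/// does not, and `None` when the evidence is inconclusive.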
fn orthographic_heuristic<P>(tok: &Token, data: &TrainingData) -> Option<bool>
where
P: DefinesPunctuation,
{
use prelude::{BEG_LC, MID_UC, ORT_LC, ORT_UC};
    if P::is_punctuation(&tok.tok().chars().next().unwrap()) {
Some(false)
} else {
let ctxt = data.get_orthographic_context(tok.typ_without_break_or_period());
if tok.is_uppercase() && (ctxt & ORT_LC != 0) && (ctxt & MID_UC == 0) {
Some(true)
} else if tok.is_lowercase() && ((ctxt & ORT_UC != 0) || (ctxt & BEG_LC == 0)) {
Some(false)
} else {
None
}
}
}
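/// Performs the second annotation pass over a pair of adjacent tokens,
/// using collocation, sentence-starter, and orthographic evidence to decide
/// whether `prv` is an abbreviation or a sentence break.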
fn annotate_second_pass<P>(cur: &mut Token, prv: &mut Token, data: &TrainingData)
where
P: DefinesPunctuation,
{
use prelude::ORT_LC;
    // Tokens forming a known collocation never split a sentence.
    if data.contains_collocation(prv.typ_without_period(), cur.typ_without_break_or_period()) {
prv.set_is_abbrev(true);
prv.set_is_sentence_break(false);
return;
}
    // An abbreviation or ellipsis can still end a sentence if the next
    // token looks like a sentence starter.
    if (prv.is_abbrev() || prv.is_ellipsis()) && !prv.is_initial() {
if orthographic_heuristic::<P>(cur, data).unwrap_or(false) {
prv.set_is_sentence_break(true);
return;
}
if cur.is_uppercase() && data.contains_sentence_starter(cur.typ_without_break_or_period()) {
prv.set_is_sentence_break(true);
return;
}
}
    // Initials and numbers are usually abbreviations rather than sentence
    // endings unless the orthographic evidence says otherwise.
    if prv.is_initial() || prv.is_numeric() {
let ortho_dec = orthographic_heuristic::<P>(cur, data);
if !ortho_dec.unwrap_or(true) {
prv.set_is_sentence_break(false);
prv.set_is_abbrev(true);
return;
}
let ctxt = data.get_orthographic_context(cur.typ_without_break_or_period());
if ortho_dec.is_none() && prv.is_initial() && cur.is_uppercase() && ctxt & ORT_LC == 0 {
prv.set_is_sentence_break(false);
prv.set_is_abbrev(true);
}
}
}
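/// Checks whether the character at byte offset `start` begins a
/// multi-character sequence such as an ellipsis ("..", ". . .") or a run of
/// dashes ("--"). Returns the matched slice, or `None` for a lone character.
/// Operates on bytes; callers only pass positions of ASCII '.' or '-'.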
fn is_multi_char(doc: &str, start: usize) -> Option<&str> {
let mut end = start;
let mut prv = doc.as_bytes()[start];
end += 1;
while end < doc.len() {
let c = doc.as_bytes()[end];
        match c {
            // A run of dashes: "--", "---", ...
            b'-' if prv == b'-' => (),
            // An ellipsis: "..", ". . ." (periods optionally space-separated).
            b'.' if prv == b'.' || prv == b' ' => (),
            b' ' if prv == b'.' => (),
            _ => {
                // Do not include a trailing space in the matched slice.
                if prv == b' ' {
                    end -= 1;
                }
                break;
            }
        }
prv = c;
end += 1;
}
if end - start > 1 {
Some(&doc[start..end])
} else {
None
}
}
#[test]
fn periodctxt_tokenizer_compare_nltk() {
use prelude::Standard;
for (expected, raw, file) in super::get_test_scenarios("test/word-periodctxt/", "test/raw/") {
let iter: PeriodContextTokenizer<Standard> = PeriodContextTokenizer::new(&raw[..]);
println!(" running periodctxt tests for '{:?}'", file);
for ((t, _, _, _, _), e) in iter.zip(expected) {
let t = t.replace("\n", r"\n").replace("\r", "");
let e = e.replace("\r", "");
assert!(t == e, "{} - you: [{}] != exp: [{}]", file, t, e);
}
}
}
#[test]
fn smoke_test_is_multi_char_pass() {
let docs = vec![". . .", "..", "--", "---", ". . . . .", ".. .."];
for d in docs.iter() {
assert!(is_multi_char(*d, 0).is_some(), "failed {}", *d);
}
}
#[test]
fn word_tokenizer_compare_nltk() {
use prelude::Standard;
for (expected, raw, file) in super::get_test_scenarios("test/word-training", "test/raw/") {
let iter: WordTokenizer<Standard> = WordTokenizer::new(&raw[..]);
println!(" running wordtok tests for {:?}", file);
for (t, e) in iter.zip(expected) {
assert!(
t.typ().to_lowercase() == e.trim(),
"{} - you: [{}] != exp: [{}]",
file,
t.typ().to_lowercase(),
e.trim()
);
}
}
}
#[cfg(test)]
fn train_on_document(data: &mut TrainingData, doc: &str) {
use trainer::Trainer;
let trainer: Trainer<::prelude::Standard> = Trainer::new();
    trainer.train(doc, data);
}
#[test]
fn sentence_tokenizer_compare_nltk_train_on_document() {
let cases = super::get_test_scenarios("test/sentence/", "test/raw/");
for (expected, raw, file) in cases {
println!(" running sentencetok tests for {:?}", file);
let mut data = TrainingData::new();
train_on_document(&mut data, &raw[..]);
let iter: SentenceTokenizer<::prelude::Standard> = SentenceTokenizer::new(&raw[..], &data);
for (t, e) in iter.zip(expected.iter()) {
let s = format!("[{}]", t)
.replace("\"", "\\\"")
.replace("\n", "\\n")
.replace("\r", "");
assert!(
s == e.trim(),
"{} - you: [{}] != exp: [{}]",
file,
s,
e.trim()
);
}
}
}
#[test]
fn sentence_tokenizer_issue_5_test() {
let data = TrainingData::english();
let doc = "this is a great sentence! this is a sad sentence.";
let mut iter = SentenceTokenizer::<::params::Standard>::new(doc, &data);
assert_eq!(iter.next().unwrap(), "this is a great sentence!");
assert_eq!(iter.next().unwrap(), "this is a sad sentence.");
}
#[test]
fn sentence_tokenizer_issue_8_test() {
let data = TrainingData::english();
let doc = "this is a great sentence! this is a sad sentence.)...";
let _: Vec<_> = SentenceTokenizer::<::params::Standard>::new(doc, &data).collect();
}
macro_rules! bench_word_tokenizer(
($name:ident, $doc:expr) => (
#[bench] fn $name(b: &mut ::test::Bencher) {
b.iter(|| {
let t: WordTokenizer<::prelude::Standard> = WordTokenizer::new($doc);
let _: Vec<Token> = t.collect();
})
}
)
);
bench_word_tokenizer!(
word_tokenizer_bench_short,
include_str!("../test/raw/sigma-wiki.txt")
);
bench_word_tokenizer!(
word_tokenizer_bench_medium,
include_str!("../test/raw/npr-article-01.txt")
);
bench_word_tokenizer!(
word_tokenizer_bench_long,
include_str!("../test/raw/the-sayings-of-confucius.txt")
);
bench_word_tokenizer!(
word_tokenizer_bench_very_long,
include_str!("../test/raw/pride-and-prejudice.txt")
);
macro_rules! bench_sentence_tokenizer(
($name:ident, $doc:expr) => (
#[bench] fn $name(b: &mut ::test::Bencher) {
let doc = $doc;
b.iter(|| {
let mut data = TrainingData::new();
train_on_document(&mut data, doc);
                let iter: SentenceTokenizer<::prelude::Standard> =
                    SentenceTokenizer::new(doc, &data);
let _: Vec<&str> = iter.collect();
})
}
)
);
bench_sentence_tokenizer!(
bench_sentence_tokenizer_train_on_document_short,
include_str!("../test/raw/sigma-wiki.txt")
);
bench_sentence_tokenizer!(
bench_sentence_tokenizer_train_on_document_medium,
include_str!("../test/raw/npr-article-01.txt")
);
bench_sentence_tokenizer!(
bench_sentence_tokenizer_train_on_document_long,
include_str!("../test/raw/pride-and-prejudice.txt")
);