extern crate regex;
extern crate unicode_segmentation;
use crate::errors::EstimatorErr;
#[cfg(feature = "python")]
use dict_derive::{FromPyObject, IntoPyObject};
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::fmt;
use unicode_segmentation::UnicodeSegmentation;
#[cfg(test)]
mod tests;
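/// Common interface for all tokenizers in this module: map a document to an
/// iterator of `&str` tokens borrowed from the input.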
pub trait Tokenizer: fmt::Debug {
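    /// Tokenize `text`, returning an iterator of token slices borrowed from the input.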
fn tokenize<'a>(&'a self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a>;
}
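/// Tokenize a document by extracting every non-overlapping match of a regular expression.
///
/// The default pattern is `\b\w\w+\b`, i.e. runs of two or more word characters.
///
/// # Example
///
/// A minimal usage sketch; the `use` path assumes this module is exposed as `vtext::tokenize`:
///
/// ```rust
/// use vtext::tokenize::{Tokenizer, RegexpTokenizer};
///
/// let tokenizer = RegexpTokenizer::default();
/// let tokens: Vec<&str> = tokenizer.tokenize("The quick brown fox").collect();
/// assert_eq!(tokens, vec!["The", "quick", "brown", "fox"]);
/// ```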
#[derive(Clone)]
pub struct RegexpTokenizer {
pub params: RegexpTokenizerParams,
regexp: Regex,
}
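/// Parameters (builder) for [`RegexpTokenizer`]; the `pattern` regex is compiled by `build()`.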
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))]
pub struct RegexpTokenizerParams {
pattern: String,
}
impl RegexpTokenizerParams {
pub fn pattern(&mut self, value: &str) -> RegexpTokenizerParams {
self.pattern = value.to_string();
self.clone()
}
pub fn build(&mut self) -> Result<RegexpTokenizer, EstimatorErr> {
let pattern = &self.pattern;
let regexp = Regex::new(pattern)?;
Ok(RegexpTokenizer {
params: self.clone(),
regexp,
})
}
}
impl Default for RegexpTokenizerParams {
fn default() -> RegexpTokenizerParams {
RegexpTokenizerParams {
pattern: r"\b\w\w+\b".to_string(),
}
}
}
impl Default for RegexpTokenizer {
fn default() -> RegexpTokenizer {
RegexpTokenizerParams::default().build().unwrap()
}
}
impl Tokenizer for RegexpTokenizer {
fn tokenize<'a>(&'a self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
Box::new(self.regexp.find_iter(text).map(|m| m.as_str()))
}
}
impl fmt::Debug for RegexpTokenizer {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "RegexpTokenizer {{ pattern: {} }}", self.params.pattern)
}
}
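/// Tokenize at Unicode word boundaries (UAX #29), using the `unicode-segmentation` crate.
///
/// With `word_bounds` enabled (the default), punctuation is kept as separate tokens and
/// only plain spaces are dropped; with it disabled, only Unicode words are returned.
///
/// # Example
///
/// A minimal usage sketch; the `use` path assumes this module is exposed as `vtext::tokenize`:
///
/// ```rust
/// use vtext::tokenize::*;
///
/// let tokenizer = UnicodeWordTokenizer::default();
/// let tokens: Vec<&str> = tokenizer.tokenize("Today, tomorrow.").collect();
/// assert_eq!(tokens, vec!["Today", ",", "tomorrow", "."]);
/// ```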
#[derive(Debug, Clone)]
pub struct UnicodeWordTokenizer {
pub params: UnicodeWordTokenizerParams,
}
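/// Parameters (builder) for [`UnicodeWordTokenizer`].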
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))]
pub struct UnicodeWordTokenizerParams {
word_bounds: bool,
}
impl UnicodeWordTokenizerParams {
pub fn word_bounds(&mut self, value: bool) -> UnicodeWordTokenizerParams {
self.word_bounds = value;
self.clone()
}
pub fn build(&mut self) -> Result<UnicodeWordTokenizer, EstimatorErr> {
Ok(UnicodeWordTokenizer {
params: self.clone(),
})
}
}
impl Default for UnicodeWordTokenizerParams {
fn default() -> UnicodeWordTokenizerParams {
UnicodeWordTokenizerParams { word_bounds: true }
}
}
impl Default for UnicodeWordTokenizer {
fn default() -> UnicodeWordTokenizer {
UnicodeWordTokenizerParams::default().build().unwrap()
}
}
impl Tokenizer for UnicodeWordTokenizer {
fn tokenize<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
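        // With word_bounds, split at UAX #29 word boundaries and drop plain spaces
        // (punctuation is kept); otherwise return Unicode words only.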
if self.params.word_bounds {
let res = text.split_word_bounds().filter(|x| x != &" ");
Box::new(res)
} else {
Box::new(text.unicode_words())
}
}
}
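/// A word tokenizer that starts from Unicode word boundaries and applies extra,
/// language-aware rules: runs of repeated punctuation are grouped, English and French
/// apostrophe contractions are split, and sequences such as "2:00" or "a-b" are merged
/// back into single tokens.
///
/// Supported values for `lang` are "en" and "fr"; any other value falls back to the
/// language-agnostic behaviour.
///
/// # Example
///
/// A minimal usage sketch; the `use` path assumes this module is exposed as `vtext::tokenize`:
///
/// ```rust
/// use vtext::tokenize::*;
///
/// let tokenizer = VTextTokenizerParams::default().lang("en").build().unwrap();
/// let tokens: Vec<&str> = tokenizer.tokenize("It can't be done.").collect();
/// assert_eq!(tokens, vec!["It", "ca", "n't", "be", "done", "."]);
/// ```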
#[derive(Debug, Clone)]
pub struct VTextTokenizer {
pub params: VTextTokenizerParams,
}
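/// Parameters (builder) for [`VTextTokenizer`]; `lang` accepts "en" or "fr", and any
/// other value falls back to "any".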
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))]
pub struct VTextTokenizerParams {
lang: String,
}
impl VTextTokenizerParams {
pub fn lang(&mut self, value: &str) -> VTextTokenizerParams {
self.lang = value.to_string();
self.clone()
}
pub fn build(&mut self) -> Result<VTextTokenizer, EstimatorErr> {
        let lang = match &self.lang[..] {
            "en" | "fr" => &self.lang[..],
            // Unsupported languages fall back to the language-agnostic rules.
            _ => "any",
        };
self.lang = lang.to_string();
Ok(VTextTokenizer {
params: self.clone(),
})
}
}
impl Default for VTextTokenizerParams {
fn default() -> VTextTokenizerParams {
VTextTokenizerParams {
lang: "en".to_string(),
}
}
}
impl Default for VTextTokenizer {
fn default() -> VTextTokenizer {
VTextTokenizerParams::default().build().unwrap()
}
}
impl Tokenizer for VTextTokenizer {
fn tokenize<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
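        // Start from Unicode word boundaries, then post-process: group runs of
        // punctuation, split apostrophe contractions, and re-merge tokens such as "2:00".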
let tokens = text.split_word_bounds();
let mut res: Vec<&'a str> = Vec::new();
let mut punct_start_seq: i64 = -1;
let mut punct_last = 'X';
let mut str_idx: usize = 0;
for tok in tokens {
let tok_len = tok.len();
str_idx += tok_len;
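            // Collapse runs of a repeated ASCII punctuation character (e.g. "...") into a
            // single pending token; a different punctuation character flushes the previous
            // run and starts a new one.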
if (tok_len == 1) & (tok != " ") {
let ch = tok.chars().next().unwrap();
if ch.is_ascii_punctuation() {
if ch != punct_last {
if punct_start_seq >= 0 {
res.push(&text[punct_start_seq as usize..str_idx - tok_len]);
}
punct_start_seq = (str_idx as i64) - (tok_len as i64);
}
punct_last = ch;
continue;
}
}
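            // A non-punctuation token ends any pending punctuation run: emit it and reset.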
if punct_start_seq >= 0 {
res.push(&text[punct_start_seq as usize..str_idx - tok_len]);
punct_start_seq = -1;
punct_last = 'X';
}
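            // Language-specific handling of apostrophe contractions:
            // English "can't" -> "ca" + "n't", French "l'avion" -> "l'" + "avion".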
match self.params.lang.as_ref() {
"en" => {
if let Some(apostroph_idx) = tok.find(&"'") {
let mut apostroph_idx = apostroph_idx;
if tok.ends_with(&"n't") {
apostroph_idx -= 1;
}
res.push(&tok[..apostroph_idx]);
res.push(&tok[apostroph_idx..]);
continue;
} else if let Some(apostroph_idx) = tok.find(&"’") {
let mut apostroph_idx = apostroph_idx;
if tok.ends_with(&"n’t") {
apostroph_idx -= 1;
}
res.push(&tok[..apostroph_idx]);
res.push(&tok[apostroph_idx..]);
continue;
}
}
"fr" => {
if let Some(apostroph_idx) = tok.find(&"'") {
if apostroph_idx == 1 {
let apostroph_idx = apostroph_idx + "'".len();
res.push(&tok[..apostroph_idx]);
res.push(&tok[apostroph_idx..]);
continue;
}
}
}
_ => {}
};
res.push(tok);
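            // Re-merge the last three tokens when they form patterns such as "a-b" or
            // "AT&T" (alphanumeric around -, @, &) or "2:00" and "1/2" (digits around :, /).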
if res.len() >= 3 {
let tok0 = res[res.len() - 3];
let tok1 = res[res.len() - 2];
let tok2 = res[res.len() - 1];
if (tok0 != " ") & (tok2 != " ") & !tok0.is_empty() & !tok2.is_empty() {
let char0_last = tok0.chars().last().unwrap();
                    let char2_first = tok2.chars().next().unwrap();
let f1 = ((tok1 == "-") | (tok1 == "@") | (tok1 == "&"))
& char0_last.is_alphanumeric()
& char2_first.is_alphanumeric();
let f2 = ((tok1 == "/") | (tok1 == ":"))
& char0_last.is_numeric()
& char2_first.is_numeric();
if f1 | f2 {
res.truncate(res.len() - 3);
res.push(&text[str_idx - tok0.len() - tok1.len() - tok2.len()..str_idx]);
}
}
}
}
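        // Flush a punctuation run that reaches the end of the input.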
if punct_start_seq >= 0 {
res.push(&text[punct_start_seq as usize..]);
}
let res = res.into_iter().filter(|x| x != &" ");
Box::new(res)
}
}
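/// Tokenize a document into overlapping character n-grams of `window_size` characters
/// (4 by default), including whitespace.
///
/// # Example
///
/// A minimal usage sketch; the `use` path assumes this module is exposed as `vtext::tokenize`:
///
/// ```rust
/// use vtext::tokenize::*;
///
/// let tokenizer = CharacterTokenizer::default();
/// let tokens: Vec<&str> = tokenizer.tokenize("fox can").collect();
/// assert_eq!(tokens, vec!["fox ", "ox c", "x ca", " can"]);
/// ```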
#[derive(Debug, Clone)]
pub struct CharacterTokenizer {
pub params: CharacterTokenizerParams,
}
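/// Parameters (builder) for [`CharacterTokenizer`].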
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "python", derive(FromPyObject, IntoPyObject))]
pub struct CharacterTokenizerParams {
window_size: usize,
}
impl CharacterTokenizerParams {
pub fn window_size(&mut self, value: usize) -> CharacterTokenizerParams {
self.window_size = value;
self.clone()
}
pub fn build(&mut self) -> Result<CharacterTokenizer, EstimatorErr> {
Ok(CharacterTokenizer {
params: self.clone(),
})
}
}
impl Default for CharacterTokenizerParams {
fn default() -> CharacterTokenizerParams {
CharacterTokenizerParams { window_size: 4 }
}
}
impl Default for CharacterTokenizer {
fn default() -> CharacterTokenizer {
CharacterTokenizerParams::default().build().unwrap()
}
}
impl Tokenizer for CharacterTokenizer {
fn tokenize<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
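        // Pair each character's byte offset with the offset `window_size` characters
        // ahead (or the end of the string for the last window), yielding overlapping
        // character n-grams as slices of `text`.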
let res = text
.char_indices()
.zip(
text.char_indices()
.skip(self.params.window_size)
.chain(Some((text.len(), ' '))),
)
.map(move |((i, _), (j, _))| &text[i..j]);
Box::new(res)
}
}