use std::borrow::Cow;

use aho_corasick::{AhoCorasick, MatchKind};
use fst::Set;

use crate::detection::Language;
use crate::normalizer::{NormalizedTokenIter, NormalizerOption};
use crate::segmenter::{Segment, SegmentedStrIter, SegmentedTokenIter, SegmenterOption};
use crate::separators::DEFAULT_SEPARATORS;
use crate::Token;

/// Iterator over tokens paired with the original text slice they cover,
/// as addressed by each token's byte offsets.
pub struct ReconstructedTokenIter<'o, 'aho, 'lang, 'tb> {
    token_iter: NormalizedTokenIter<'o, 'aho, 'lang, 'tb>,
    original: &'o str,
}

impl<'o> Iterator for ReconstructedTokenIter<'o, '_, '_, '_> {
    type Item = (&'o str, Token<'o>);

    fn next(&mut self) -> Option<Self::Item> {
        self.token_iter
            .next()
            .map(|token| (&self.original[token.byte_start..token.byte_end], token))
    }
}
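
/// Trait adding tokenization helpers directly on string slices, using the
/// default segmenter and normalizer options.
///
/// A minimal usage sketch; the doc test is marked `ignore` because the public
/// crate name is not visible from this file and `charabia` is assumed:
///
/// ```ignore
/// use charabia::Tokenize;
///
/// // Tokens carry a normalized lemma plus byte offsets into the input.
/// let first = "Hello world!".tokenize().next().unwrap();
/// assert_eq!(first.lemma(), "hello");
/// ```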
pub trait Tokenize<'o> {
    /// Creates an iterator of segmented, normalized, and classified [`Token`]s.
    fn tokenize(&self) -> NormalizedTokenIter<'_, '_, '_, '_>;

    /// Same as [`Self::tokenize`], but pairs each token with the original text
    /// slice it was produced from.
    fn reconstruct(&self) -> ReconstructedTokenIter<'_, '_, '_, '_>;
}

impl Tokenize<'_> for &str {
    fn tokenize(&self) -> NormalizedTokenIter<'_, '_, '_, '_> {
        self.segment().normalize(&crate::normalizer::DEFAULT_NORMALIZER_OPTION)
    }

    fn reconstruct(&self) -> ReconstructedTokenIter<'_, '_, '_, '_> {
        ReconstructedTokenIter { original: self, token_iter: self.tokenize() }
    }
}
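
/// A configured tokenizer: a segmenter configuration paired with a normalizer
/// configuration. Build one through a [`TokenizerBuilder`].
///
/// Intended flow, sketched as an `ignore`d doc test (crate name assumed):
///
/// ```ignore
/// use charabia::TokenizerBuilder;
///
/// let mut builder = TokenizerBuilder::default();
/// let tokenizer = builder.build();
/// for token in tokenizer.tokenize("Pleased to see you.") {
///     println!("{:?}: {}", token.kind, token.lemma());
/// }
/// ```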
#[derive(Debug)]
pub struct Tokenizer<'tb> {
    segmenter_option: Cow<'tb, SegmenterOption<'tb>>,
    normalizer_option: Cow<'tb, NormalizerOption<'tb>>,
}

impl Tokenizer<'_> {
    /// Creates an iterator of normalized and classified [`Token`]s over the
    /// provided text, using this tokenizer's options.
    pub fn tokenize<'t, 'o>(&'t self, original: &'o str) -> NormalizedTokenIter<'o, 't, 't, 't> {
        original
            .segment_with_option(
                self.segmenter_option.aho.as_ref(),
                self.segmenter_option.allow_list,
            )
            .normalize(&self.normalizer_option)
    }
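
    /// Same as [`Self::tokenize`], but overrides the configured allow list,
    /// restricting language detection to the provided languages.
    ///
    /// Sketch marked `ignore`: the crate name and the `Language::Jpn` variant
    /// are assumptions, not verified against this crate's public API:
    ///
    /// ```ignore
    /// use charabia::{Language, TokenizerBuilder};
    ///
    /// let mut builder = TokenizerBuilder::default();
    /// let tokenizer = builder.build();
    /// // Only consider Japanese during language detection.
    /// let tokens: Vec<_> =
    ///     tokenizer.tokenize_with_allow_list("東京", Some(&[Language::Jpn])).collect();
    /// ```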
    pub fn tokenize_with_allow_list<'t, 'o, 'lang>(
        &'t self,
        original: &'o str,
        allow_list: Option<&'lang [Language]>,
    ) -> NormalizedTokenIter<'o, 't, 'lang, 't> {
        original
            .segment_with_option(self.segmenter_option.aho.as_ref(), allow_list)
            .normalize(&self.normalizer_option)
    }

    /// Same as [`Self::tokenize`], but pairs each token with the original text
    /// slice it was produced from.
    pub fn reconstruct<'t, 'o>(
        &'t self,
        original: &'o str,
    ) -> ReconstructedTokenIter<'o, 't, 't, 't> {
        ReconstructedTokenIter { original, token_iter: self.tokenize(original) }
    }

    /// Segments the provided text into [`Token`]s without normalizing them.
    pub fn segment<'t, 'o>(&'t self, original: &'o str) -> SegmentedTokenIter<'o, 't, 't> {
        original.segment_with_option(
            self.segmenter_option.aho.as_ref(),
            self.segmenter_option.allow_list,
        )
    }

    /// Segments the provided text into `&str` pieces without creating [`Token`]s.
    pub fn segment_str<'t, 'o>(&'t self, original: &'o str) -> SegmentedStrIter<'o, 't, 't> {
        original.segment_str_with_option(
            self.segmenter_option.aho.as_ref(),
            self.segmenter_option.allow_list,
        )
    }
}
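
/// Builder for [`Tokenizer`]s, allowing stop words, separators, a words
/// dictionary, and normalization behavior to be customized.
///
/// Hypothetical end-to-end usage (`ignore`d doc test, crate name assumed):
///
/// ```ignore
/// use charabia::TokenizerBuilder;
/// use fst::Set;
///
/// let stop_words = Set::from_iter(["the"].iter()).unwrap();
/// let mut builder = TokenizerBuilder::new();
/// builder.stop_words(&stop_words);
/// let tokenizer = builder.build();
/// ```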
pub struct TokenizerBuilder<'tb, A> {
    stop_words: Option<&'tb Set<A>>,
    words_dict: Option<&'tb [&'tb str]>,
    normalizer_option: NormalizerOption<'tb>,
    segmenter_option: SegmenterOption<'tb>,
}

impl<'tb, A> TokenizerBuilder<'tb, A> {
    /// Creates a new [`TokenizerBuilder`] with default options.
    pub fn new() -> TokenizerBuilder<'tb, A> {
        Self {
            normalizer_option: crate::normalizer::DEFAULT_NORMALIZER_OPTION,
            segmenter_option: SegmenterOption::default(),
            stop_words: None,
            words_dict: None,
        }
    }
}

impl<'tb, A: AsRef<[u8]>> TokenizerBuilder<'tb, A> {
    /// Configures the words classified as stop words during normalization.
    pub fn stop_words(&mut self, stop_words: &'tb Set<A>) -> &mut Self {
        self.stop_words = Some(stop_words);
        // Re-wrap the fst's raw bytes as a `Set<&[u8]>` so the classifier does
        // not need to be generic over `A`.
        self.normalizer_option.classifier.stop_words = self.stop_words.map(|sw| {
            let sw = sw.as_fst().as_bytes();
            Set::new(sw).unwrap()
        });
        self
    }
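
    /// Configures the strings classified as separators during segmentation
    /// and classification.
    ///
    /// Sketch (`ignore`d doc test, crate name assumed):
    ///
    /// ```ignore
    /// use charabia::TokenizerBuilder;
    ///
    /// let mut builder = TokenizerBuilder::default();
    /// builder.separators(&[" ", ", ", ". "]);
    /// ```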
    pub fn separators(&mut self, separators: &'tb [&'tb str]) -> &mut Self {
        self.normalizer_option.classifier.separators = Some(separators);
        self
    }

    /// Configures a dictionary of words segmented as-is, taking precedence
    /// over language-based segmentation.
    pub fn words_dict(&mut self, words: &'tb [&'tb str]) -> &mut Self {
        self.words_dict = Some(words);
        self
    }

    /// Enables or disables the creation of a char map during normalization.
    pub fn create_char_map(&mut self, create_char_map: bool) -> &mut Self {
        self.normalizer_option.create_char_map = create_char_map;
        self
    }

    /// Enables or disables lossy normalization steps.
    pub fn lossy_normalization(&mut self, lossy: bool) -> &mut Self {
        self.normalizer_option.lossy = lossy;
        self
    }

    /// Restricts language detection to the provided [`Language`]s.
    pub fn allow_list(&mut self, allow_list: &'tb [Language]) -> &mut Self {
        self.segmenter_option.allow_list = Some(allow_list);
        self
    }

    /// Builds a [`Tokenizer`] borrowing this builder's options.
    pub fn build(&mut self) -> Tokenizer<'_> {
        // Compile the segmentation automaton from the configured separators
        // and/or words dictionary.
        match (self.normalizer_option.classifier.separators, self.words_dict) {
            (Some(separators), None) => {
                let pattern = separators.iter().filter(|s| !s.is_empty());
                let aho = AhoCorasick::builder()
                    .match_kind(MatchKind::LeftmostLongest)
                    .build(pattern)
                    .unwrap();
                self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
            }
            (separators, Some(words)) => {
                // A words dictionary is set: fall back to the default
                // separators if none were configured, and let dictionary words
                // compete with separators via leftmost-longest matching.
                let separators = separators.unwrap_or(DEFAULT_SEPARATORS);
                let pattern = words.iter().chain(separators).filter(|s| !s.is_empty());
                let aho = AhoCorasick::builder()
                    .match_kind(MatchKind::LeftmostLongest)
                    .build(pattern)
                    .unwrap();
                self.segmenter_option.aho = Some(aho).filter(|aho| aho.patterns_len() != 0);
            }
            (None, None) => self.segmenter_option.aho = None,
        }

        Tokenizer {
            normalizer_option: Cow::Borrowed(&self.normalizer_option),
            segmenter_option: Cow::Borrowed(&self.segmenter_option),
        }
    }
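
    /// Consumes the builder, producing a [`Tokenizer`] that owns its options
    /// and can therefore outlive the builder.
    ///
    /// Hypothetical sketch (`ignore`d doc test, crate name assumed):
    ///
    /// ```ignore
    /// use charabia::TokenizerBuilder;
    ///
    /// let tokenizer = TokenizerBuilder::default().into_tokenizer();
    /// let tokens: Vec<_> = tokenizer.tokenize("Hello world!").collect();
    /// ```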
    pub fn into_tokenizer(mut self) -> Tokenizer<'tb> {
        // `build` is called for its side effect: it compiles and stores the
        // aho-corasick automaton in `self.segmenter_option` before the options
        // are moved out. The borrowing `Tokenizer` it returns is dropped
        // immediately.
        drop(self.build());

        Tokenizer {
            normalizer_option: Cow::Owned(self.normalizer_option),
            segmenter_option: Cow::Owned(self.segmenter_option),
        }
    }
}

impl Default for TokenizerBuilder<'_, Vec<u8>> {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod test {
    use fst::Set;
    use quickcheck::quickcheck;

    use crate::{Tokenize, TokenizerBuilder};

    #[test]
    fn check_lifetimes() {
        let text = "Hello world! Pleased to see you.";

        // Collected tokens must stay usable after the iterator is gone.
        let tokens: Vec<_> = { text.tokenize().collect() };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));

        // Collected tokens must outlive the tokenizer that produced them.
        let tokens: Vec<_> = {
            let mut builder = TokenizerBuilder::default();
            let tokens = {
                let tokenizer = builder.build();
                tokenizer.tokenize(text).collect()
            };
            tokens
        };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));

        // Same, with stop words configured on the builder.
        let tokens: Vec<_> = {
            let stop_words: Set<Vec<u8>> = Set::from_iter(["to"].iter()).unwrap();
            let mut builder = TokenizerBuilder::new();
            let builder = builder.stop_words(&stop_words);
            let tokens = {
                let tokenizer = builder.build();
                tokenizer.tokenize(text).collect()
            };
            tokens
        };
        assert_eq!(tokens.iter().last().map(|t| t.lemma()), Some("."));
    }
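
    // Sketch of `reconstruct`'s expected contract: every yielded `&str` is the
    // slice addressed by the token's byte offsets, so concatenating the slices
    // in order should rebuild the input. Full coverage of the input by tokens
    // is assumed here rather than enforced by the types.
    #[test]
    fn reconstruct_rebuilds_original() {
        let text = "Hello world! Pleased to see you.";
        let rebuilt: String = text.reconstruct().map(|(slice, _)| slice).collect();
        assert_eq!(rebuilt, text);
    }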

    // `use quickcheck::quickcheck;` imports the `quickcheck!` macro, not the
    // `#[quickcheck]` attribute (which lives in `quickcheck_macros`), so the
    // property is declared through the macro form.
    quickcheck! {
        // Each token covers at least one byte of the input and tokens do not
        // overlap, so the token count is bounded by the input's byte length.
        fn shorten_after_tokenized(text: String) -> bool {
            let text = text.as_str();
            let tokens: Vec<_> = text.tokenize().collect();
            tokens.len() <= text.len()
        }
    }
}