use crate::{
rule::id::{Index, Selector},
rule::MatchSentence,
types::*,
utils::{parallelism::MaybeParallelRefIterator, regex::Regex},
Error,
};
use fs_err::File;
use serde::{Deserialize, Serialize};
use std::{
io::{BufReader, Read, Write},
ops::Range,
path::Path,
sync::Arc,
};
pub mod chunk;
pub mod multiword;
pub mod tag;
use chunk::Chunker;
use multiword::MultiwordTagger;
use tag::Tagger;
use crate::rule::DisambiguationRule;
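/// Splits `text` on characters for which `split_func` returns `true`, keeping
/// each matched separator character as its own entry. E.g. splitting `"a-b"`
/// on `'-'` yields `["a", "-", "b"]`.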
fn split<F>(text: &str, split_func: F) -> Vec<&str>
where
F: Fn(char) -> bool,
{
let mut result = Vec::new();
let mut last = 0;
for (index, matched) in text.match_indices(split_func) {
if last != index {
result.push(&text[last..index]);
}
result.push(matched);
last = index + matched.len();
}
if last < text.len() {
result.push(&text[last..]);
}
result
}
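/// Language-specific options for a tokenizer, set when the binary is built.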
#[derive(Debug, Clone, Serialize, Deserialize)]
pub(crate) struct TokenizerLangOptions {
    /// Whether errors while constructing the tokenizer are allowed.
    pub allow_errors: bool,
    /// Selectors of the disambiguation rules to use.
    #[serde(default)]
    pub ids: Vec<Selector>,
    /// Selectors of disambiguation rules to ignore.
    #[serde(default)]
    pub ignore_ids: Vec<Selector>,
    /// Rule test cases which are known (and allowed) to fail.
    #[serde(default)]
    pub known_failures: Vec<String>,
    /// Extra language-specific characters to split tokens on.
    #[serde(default)]
    pub extra_split_chars: Vec<char>,
    /// Extra language-specific regexes; their matches are never split into
    /// multiple tokens.
    #[serde(default)]
    pub extra_join_regexes: Vec<Regex>,
}
impl Default for TokenizerLangOptions {
fn default() -> Self {
TokenizerLangOptions {
allow_errors: false,
ids: Vec::new(),
ignore_ids: Vec::new(),
known_failures: Vec::new(),
extra_split_chars: Vec::new(),
extra_join_regexes: Vec::new(),
}
}
}
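/// An iterator over `IncompleteSentence`s: sentences which have been
/// tokenized, tagged and chunked, but not yet disambiguated.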
pub struct IncompleteSentenceIter<'t> {
text: &'t str,
splits: Vec<Range<usize>>,
tokenizer: &'t Tokenizer,
index: usize,
position: Position,
}
impl<'t> Iterator for IncompleteSentenceIter<'t> {
type Item = IncompleteSentence<'t>;
fn next(&mut self) -> Option<Self::Item> {
if self.index == self.splits.len() {
return None;
}
let mut range = self.splits[self.index].clone();
self.index += 1;
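        // As long as the current split contains only whitespace, merge in the
        // next one; we can not assume anything about how the SRX rules split.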
while self.text[range.clone()].trim().is_empty() && self.index < self.splits.len() {
range.end = self.splits[self.index].end;
self.index += 1;
}
let sentence = self
.tokenizer
.tokenize(&self.text[range.clone()])
.map(|x| x.rshift(self.position));
self.position += Position {
char: self.text[range.clone()].chars().count(),
byte: range.len(),
};
sentence
}
}
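/// An iterator over `Sentence`s: sentences to which the disambiguation rules
/// have also been applied.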
pub struct SentenceIter<'t> {
inner: IncompleteSentenceIter<'t>,
tokenizer: &'t Tokenizer,
}
impl<'t> Iterator for SentenceIter<'t> {
type Item = Sentence<'t>;
fn next(&mut self) -> Option<Self::Item> {
self.inner
.next()
.map(|sentence| self.tokenizer.disambiguate(sentence).into_sentence())
}
}
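/// The tokenizer splits text into sentences and tokens, looks up their
/// part-of-speech tags, chunks them and applies the disambiguation rules.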
#[derive(Serialize, Deserialize, Default, Clone)]
pub struct Tokenizer {
pub(crate) rules: Vec<DisambiguationRule>,
pub(crate) chunker: Option<Chunker>,
pub(crate) sentencizer: srx::Rules,
pub(crate) multiword_tagger: Option<MultiwordTagger>,
pub(crate) tagger: Arc<Tagger>,
pub(crate) lang_options: TokenizerLangOptions,
}
impl Tokenizer {
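    /// Creates a new tokenizer from a binary file at the given path.
    ///
    /// # Errors
    /// - If the file can not be opened.
    /// - If the file content can not be deserialized to a tokenizer.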
pub fn new<P: AsRef<Path>>(p: P) -> Result<Self, Error> {
let reader = BufReader::new(File::open(p.as_ref())?);
Ok(bincode::deserialize_from(reader)?)
}
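    /// Creates a new tokenizer from a reader over its binary representation.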
pub fn from_reader<R: Read>(reader: R) -> Result<Self, Error> {
Ok(bincode::deserialize_from(reader)?)
}
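    /// Serializes this tokenizer to a writer.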
pub fn to_writer<W: Write>(&self, writer: W) -> Result<(), Error> {
Ok(bincode::serialize_into(writer, &self)?)
}
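    /// Gets all disambiguation rules in the order they are applied.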
pub fn rules(&self) -> &[DisambiguationRule] {
&self.rules
}
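    /// Gets the tagger used to look up the tags of a word.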
pub fn tagger(&self) -> &Arc<Tagger> {
&self.tagger
}
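    /// Gets the chunker, if one exists for this language.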
pub fn chunker(&self) -> &Option<Chunker> {
&self.chunker
}
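    /// Gets the language-specific options of this tokenizer.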
pub(crate) fn lang_options(&self) -> &TokenizerLangOptions {
&self.lang_options
}
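    /// Applies the disambiguation rules to the sentence in order, stopping
    /// before the rule with the given id, or applying all rules if `id` is
    /// `None`.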
pub(crate) fn disambiguate_up_to_id<'t>(
&'t self,
mut sentence: IncompleteSentence<'t>,
id: Option<&Index>,
) -> IncompleteSentence<'t> {
        let n = id.map_or(self.rules.len(), |id| {
            self.rules
                .iter()
                .position(|x| x.id == *id)
                .expect("no disambiguation rule with this id exists")
        });
let mut i = 0;
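        // The rules must be applied in order, but finding the *next* rule
        // which actually changes the sentence can be done in parallel: apply
        // that rule's changes, then continue the search after it.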
while i < n {
let complete_sentence = sentence.clone().into_sentence();
let match_sentence = MatchSentence::new(&complete_sentence);
let result = self.rules[i..n]
.maybe_par_iter()
.enumerate()
.filter_map(|(j, rule)| {
let changes = rule.apply(&match_sentence);
if changes.is_empty() {
None
} else {
Some((j + i, changes))
}
})
.find_first(|_| true);
if let Some((index, changes)) = result {
self.rules[index].change(&mut sentence, changes);
i = index + 1;
} else {
i = n;
}
}
sentence
}
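    /// Applies all disambiguation rules to the sentence.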
pub fn disambiguate<'t>(&'t self, sentence: IncompleteSentence<'t>) -> IncompleteSentence<'t> {
self.disambiguate_up_to_id(sentence, None)
}
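    /// Computes the byte ranges of the tokens in `text`. Whitespace and
    /// splitting characters get their own ranges; together, the ranges cover
    /// all of `text`.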
fn get_token_ranges<'t>(
&self,
text: &'t str,
) -> impl ExactSizeIterator<Item = Range<usize>> + 't {
let mut tokens = Vec::new();
let split_char = |c: char| c.is_whitespace() || crate::utils::splitting_chars().contains(c);
let split_text = |text: &'t str| {
let mut tokens = Vec::new();
for pretoken in split(text, split_char) {
if self.tagger.id_word(pretoken.into()).1.is_some() {
tokens.push(pretoken);
} else {
tokens.extend(split(pretoken, |c| {
split_char(c) || self.lang_options.extra_split_chars.contains(&c)
}));
}
}
tokens
};
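        // Find the matches of the extra join regexes. Overlapping matches are
        // ignored (the first regex wins); each match later becomes one token.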
let mut joined_mask = vec![false; text.len()];
let mut joins = Vec::new();
for regex in self.lang_options.extra_join_regexes.iter() {
for mat in regex.find_iter(text) {
if !joined_mask[mat.start()..mat.end()].iter().any(|x| *x) {
joins.push(mat.start()..mat.end());
joined_mask[mat.start()..mat.end()]
.iter_mut()
.for_each(|x| *x = true);
}
}
}
        joins.sort_by_key(|range| range.start);

        // Split the text between the joins as usual, but push each join as a
        // single token.
        let mut prev = 0;
        for range in joins {
            tokens.extend(split_text(&text[prev..range.start]));
            prev = range.end;
            tokens.push(&text[range]);
        }

        tokens.extend(split_text(&text[prev..]));
tokens.into_iter().map(move |token| {
let byte_start = (token.as_ptr() as usize)
.checked_sub(text.as_ptr() as usize)
.expect("Each token str is a slice of the text str.");
byte_start..byte_start + token.len()
})
}
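    /// Tokenizes the sentence: splits it into tokens, looks up their tags and
    /// applies the chunker and the multiword tagger if they exist. Returns
    /// `None` if the sentence contains only whitespace.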
pub(crate) fn tokenize<'t>(&'t self, sentence: &'t str) -> Option<IncompleteSentence<'t>> {
if sentence.trim().is_empty() {
return None;
}
let token_strs = self.get_token_ranges(sentence);
let n_token_strs = token_strs.len();
let mut tokens: Vec<_> = token_strs
.enumerate()
.filter(|(_, range)| !sentence[range.clone()].trim().is_empty())
.map(|(i, range)| {
let byte_start = range.start;
let char_start = sentence[..byte_start].chars().count();
let token_text = sentence[range].trim();
let is_sentence_start = i == 0;
let is_sentence_end = i == n_token_strs - 1;
IncompleteToken::new(
Word::new(
self.tagger.id_word(token_text.into()),
self.tagger.get_tags_with_options(
token_text,
if is_sentence_start { Some(true) } else { None },
None,
),
),
Span::new(
byte_start..byte_start + token_text.len(),
char_start..char_start + token_text.chars().count(),
),
is_sentence_end,
sentence[..byte_start].ends_with(char::is_whitespace),
Vec::new(),
)
})
.collect();
        // Ranges which contain only whitespace are filtered out above, so if
        // the sentence ends with whitespace, the loop never sets
        // `is_sentence_end`; force it on the actual last token here.
        *tokens
            .last_mut()
            .expect("`sentence` is not only whitespace, so there is at least one token")
            .is_sentence_end_mut() = true;
let mut sentence = IncompleteSentence::new(tokens, sentence, &self.tagger);
if let Some(chunker) = &self.chunker {
chunker.apply(&mut sentence);
}
if let Some(multiword_tagger) = &self.multiword_tagger {
multiword_tagger.apply(&mut sentence);
}
Some(sentence)
}
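    /// Splits the text into sentences with the SRX rules and tokenizes each
    /// sentence, without applying the disambiguation rules.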
pub fn sentencize<'t>(&'t self, text: &'t str) -> IncompleteSentenceIter<'t> {
IncompleteSentenceIter {
text,
splits: self.sentencizer.split_ranges(text),
            tokenizer: self,
index: 0,
position: Position::default(),
}
}
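    /// Splits the text into sentences, then tokenizes and disambiguates each
    /// of them.
    ///
    /// # Example
    ///
    /// A minimal sketch; the binary path is illustrative and assumes a
    /// tokenizer binary built for your language (e.g. `en_tokenizer.bin`),
    /// with `Tokenizer` re-exported at the crate root:
    ///
    /// ```no_run
    /// use nlprule::Tokenizer;
    ///
    /// # fn main() -> Result<(), nlprule::Error> {
    /// let tokenizer = Tokenizer::new("path/to/en_tokenizer.bin")?;
    /// let sentences: Vec<_> = tokenizer
    ///     .pipe("A short text. It even has two sentences.")
    ///     .collect();
    /// assert_eq!(sentences.len(), 2);
    /// # Ok(())
    /// # }
    /// ```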
pub fn pipe<'t>(&'t self, text: &'t str) -> SentenceIter<'t> {
SentenceIter {
inner: self.sentencize(text),
            tokenizer: self,
}
}
}