use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
use crate::{stemmer, Parsitext, ParsitextConfig};
/// A tantivy [`Tokenizer`] for Persian text: splits on whitespace and common
/// Latin/Persian punctuation while keeping ZWNJ (U+200C) inside tokens, with
/// optional stemming and optional pre-tokenization normalization.
#[derive(Clone)]
pub struct PersianTokenizer {
// When true, each token's text is passed through `stemmer::stem`.
stem: bool,
// When set, input is normalized via `Parsitext::normalize_only` before
// tokenization. `Arc` keeps the derived `Clone` cheap.
normalizer: Option<std::sync::Arc<Parsitext>>,
}
impl Default for PersianTokenizer {
fn default() -> Self {
Self::new()
}
}
impl PersianTokenizer {
#[must_use]
pub fn new() -> Self {
Self {
stem: false,
normalizer: None,
}
}
#[must_use]
pub fn with_stem(mut self, stem: bool) -> Self {
self.stem = stem;
self
}
#[must_use]
pub fn with_normalize(mut self, normalize: bool) -> Self {
self.normalizer = if normalize {
let cfg = ParsitextConfig::builder()
.enable_entity_recognition(false)
.build();
Some(std::sync::Arc::new(Parsitext::new(cfg)))
} else {
None
};
self
}
}
/// Token stream produced by [`PersianTokenizer`]: replays eagerly computed
/// spans one at a time through a single reusable `Token`.
pub struct PersianTokenStream {
// All spans for the input, computed up front in `token_stream`.
tokens: Vec<TokenSpan>,
// Index of the next span to emit.
cursor: usize,
// Scratch token reused across `advance` calls; `token()` borrows it.
current: Token,
}
/// One tokenized slice of the input: the token text plus the byte range it
/// occupied in the tokenized string.
#[derive(Debug, Clone)]
struct TokenSpan {
// Token text; may differ from the input slice when stemming is enabled.
text: String,
// Byte offset of the span's first byte.
byte_start: usize,
// Byte offset one past the span's last byte.
byte_end: usize,
}
impl Tokenizer for PersianTokenizer {
    type TokenStream<'a> = PersianTokenStream;

    /// Tokenizes `text`, optionally normalizing the input first and stemming
    /// each token's text afterwards (byte offsets are left untouched by
    /// stemming).
    ///
    /// NOTE(review): with normalization enabled, the emitted byte offsets
    /// refer to the *normalized* string; if normalization can change byte
    /// lengths, they may not line up with the caller's original `text` —
    /// confirm against how offsets are consumed downstream.
    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        // Owned normalized copy, kept alive for the duration of span
        // collection; `working` borrows either it or the raw input.
        let normalized = self.normalizer.as_ref().map(|pt| pt.normalize_only(text));
        let working = normalized.as_deref().unwrap_or(text);

        let mut tokens = collect_spans(working);
        if self.stem {
            // Stem in place; the span's byte range still describes the
            // unstemmed slice it was cut from.
            for span in &mut tokens {
                span.text = stemmer::stem(&span.text);
            }
        }

        PersianTokenStream {
            tokens,
            cursor: 0,
            current: Token::default(),
        }
    }
}
impl TokenStream for PersianTokenStream {
    /// Advances to the next span, refreshing `current`; returns `false` once
    /// every span has been emitted.
    fn advance(&mut self) -> bool {
        match self.tokens.get(self.cursor) {
            None => false,
            Some(span) => {
                self.current.offset_from = span.byte_start;
                self.current.offset_to = span.byte_end;
                // Position is simply the token's ordinal in the stream.
                self.current.position = self.cursor;
                // Reuse the text buffer instead of allocating per token.
                self.current.text.clear();
                self.current.text.push_str(&span.text);
                self.cursor += 1;
                true
            }
        }
    }

    fn token(&self) -> &Token {
        &self.current
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.current
    }
}
/// Splits `text` into spans of consecutive non-break characters, recording
/// each span's byte range. Break characters are discarded and empty spans are
/// never produced.
fn collect_spans(text: &str) -> Vec<TokenSpan> {
    let mut spans = Vec::new();
    // Byte offset where the in-progress token began, if one is open.
    let mut word_start: Option<usize> = None;

    for (idx, ch) in text.char_indices() {
        if is_token_break(ch) {
            // Close the open token, if any; the slice begin..idx is exactly
            // the run of non-break characters accumulated since `begin`.
            if let Some(begin) = word_start.take() {
                spans.push(TokenSpan {
                    text: text[begin..idx].to_string(),
                    byte_start: begin,
                    byte_end: idx,
                });
            }
        } else if word_start.is_none() {
            word_start = Some(idx);
        }
    }

    // Flush a token that runs through the end of the input.
    if let Some(begin) = word_start {
        spans.push(TokenSpan {
            text: text[begin..].to_string(),
            byte_start: begin,
            byte_end: text.len(),
        });
    }

    spans
}
/// Returns `true` when `c` terminates a token: any Unicode whitespace or one
/// of the Latin/Persian punctuation marks below.
///
/// ZWNJ (U+200C) is explicitly treated as part of a word so that Persian
/// compounds such as "می‌روم" remain single tokens.
#[inline]
fn is_token_break(c: char) -> bool {
    // Zero-width non-joiner joins word halves; never a break.
    if c == '\u{200C}' {
        return false;
    }
    // Sentence/phrase punctuation in both Latin and Persian forms,
    // plus brackets and quote styles (including guillemets).
    const PUNCTUATION: &[char] = &[
        '.', '،', ',', '!', '?', '؟', '؛', ';', ':',
        '(', ')', '[', ']', '{', '}',
        '«', '»', '"', '\'', '—', '–',
    ];
    c.is_whitespace() || PUNCTUATION.contains(&c)
}
#[cfg(test)]
mod tests {
use super::*;
// Helper: drain the stream produced by `tk` over `s` and return just the
// token texts, in order.
fn tokens(s: &str, mut tk: PersianTokenizer) -> Vec<String> {
let mut stream = tk.token_stream(s);
let mut out = Vec::new();
while stream.advance() {
out.push(stream.token().text.clone());
}
out
}
// Persian comma (،) and '!' must be dropped as token breaks.
#[test]
fn splits_on_whitespace_and_punct() {
let r = tokens("سلام، دنیا!", PersianTokenizer::new());
assert_eq!(r, vec!["سلام", "دنیا"]);
}
// ZWNJ (U+200C) joins word halves: "می‌روم" stays one token.
#[test]
fn keeps_zwnj_inside_token() {
let r = tokens("می\u{200C}روم به خانه", PersianTokenizer::new());
assert_eq!(r[0], "می\u{200C}روم");
}
// With stemming enabled, the plural "کتابها" should reduce to a form
// containing the stem "کتاب".
#[test]
fn stem_filter() {
let r = tokens("کتابها را خواندم", PersianTokenizer::new().with_stem(true));
assert!(r[0].contains("کتاب"));
}
// Empty and all-whitespace input produce no tokens (no empty spans).
#[test]
fn empty_input_yields_no_tokens() {
assert!(tokens("", PersianTokenizer::new()).is_empty());
assert!(tokens(" ", PersianTokenizer::new()).is_empty());
}
// "سلام" is 4 two-byte UTF-8 chars -> bytes 0..8; the space is byte 8,
// so the second token starts at byte 9.
#[test]
fn byte_offsets_correct() {
let text = "سلام دنیا";
let mut tk = PersianTokenizer::new();
let mut stream = tk.token_stream(text);
assert!(stream.advance());
let t = stream.token();
assert_eq!(t.offset_from, 0);
assert_eq!(t.offset_to, 8);
assert!(stream.advance());
let t = stream.token();
assert_eq!(t.offset_from, 9);
}
}