use std::io::{self, Read};
use unicode_segmentation::UnicodeSegmentation;
use crate::analysis::chunk_reader::Utf8ChunkReader;
use crate::analysis::{Analyzer, AnalyzerFactory, Token};
use crate::document::TermOffset;
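/// Tokenizer that segments text on Unicode (UAX #29) word boundaries.
///
/// Input is read incrementally through a [`Utf8ChunkReader`], lowercased, and
/// split with `split_word_bounds`; only segments containing at least one
/// alphanumeric character are emitted as tokens, with byte offsets computed
/// over the lowercased text.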
#[derive(Default)]
pub struct UnicodeAnalyzer {
    /// Incremental UTF-8 reader over the underlying byte source.
    chunk_reader: Option<Utf8ChunkReader>,
    /// Lowercased text accumulated from the reader but not yet fully tokenized.
    buf: String,
    /// Byte ranges `(start, end)` into `buf` for pending word segments.
    segments: Vec<(usize, usize)>,
    /// Index of the next entry in `segments` to emit.
    seg_idx: usize,
    /// Bytes already trimmed from the front of `buf`; added to segment starts
    /// to report absolute token offsets.
    bytes_consumed: usize,
    /// Set once the reader has been exhausted (or no reader is attached).
    eof: bool,
}
impl UnicodeAnalyzer {
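    /// Creates an analyzer with no reader attached; call
    /// [`Analyzer::set_reader`] before requesting tokens.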
pub fn new() -> Self {
Self::default()
}
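    /// Appends the next lowercased chunk from the reader to `buf` (setting
    /// `eof` when the reader is exhausted) and recomputes word segments over
    /// the whole buffer.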
fn fill_and_segment(&mut self) -> io::Result<()> {
if let Some(reader) = &mut self.chunk_reader {
match reader.next_chunk()? {
Some(chunk) => {
let lowered = chunk.to_lowercase();
self.buf.push_str(&lowered);
}
None => {
self.eof = true;
}
}
} else {
self.eof = true;
}
self.segments.clear();
self.seg_idx = 0;
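        // Walk UAX #29 word boundaries over the buffered text, keeping the
        // byte ranges of segments that contain at least one alphanumeric
        // character; whitespace and punctuation runs are skipped.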
let mut offset = 0;
for segment in self.buf.split_word_bounds() {
let len = segment.len();
if segment.chars().any(|c| c.is_alphanumeric()) {
self.segments.push((offset, offset + len));
}
offset += len;
}
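        // Unless EOF has been reached, the final segment may be cut off by the
        // chunk boundary; drop it so the next fill can re-discover it in full.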
if !self.eof && !self.segments.is_empty() {
self.segments.pop();
}
Ok(())
}
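    /// Test-only constructor that uses a small chunk capacity so boundary
    /// handling can be exercised with short inputs.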
#[cfg(test)]
fn with_capacity(capacity: usize, reader: Box<dyn Read + Send>) -> Self {
Self {
chunk_reader: Some(Utf8ChunkReader::with_capacity(capacity, reader)),
..Self::default()
}
}
}
impl Analyzer for UnicodeAnalyzer {
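    // Attaches a new reader and resets all incremental state so the analyzer
    // can be reused across documents.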
fn set_reader(&mut self, reader: Box<dyn Read + Send>) {
self.chunk_reader = Some(Utf8ChunkReader::new(reader));
self.buf.clear();
self.segments.clear();
self.seg_idx = 0;
self.bytes_consumed = 0;
self.eof = false;
}
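    // Emits pending segments one by one; when they run out, trims the consumed
    // prefix from the buffer, reads and segments the next chunk, and repeats
    // until a token is produced or the input is exhausted.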
fn next_token(&mut self) -> io::Result<Option<Token<'_>>> {
loop {
if self.seg_idx < self.segments.len() {
let (start, end) = self.segments[self.seg_idx];
self.seg_idx += 1;
                let token_start_byte = self.bytes_consumed + start;
                let token_len = end - start;
return Ok(Some(Token {
text: &self.buf[start..end],
offset: TermOffset {
start: token_start_byte as u32,
length: token_len as u16,
},
position_increment: 1,
}));
}
if self.eof {
return Ok(None);
}
            // Everything up to the end of the last recorded segment has been
            // emitted; trim it from the buffer before reading the next chunk.
            let trim_to = self.segments.last().map_or(0, |&(_, end)| end);
self.bytes_consumed += trim_to;
self.buf = self.buf[trim_to..].to_string();
self.segments.clear();
self.seg_idx = 0;
self.fill_and_segment()?;
}
}
}
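/// Factory that produces boxed [`UnicodeAnalyzer`] instances.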
#[derive(Debug, Clone, Copy)]
pub struct UnicodeAnalyzerFactory;
impl AnalyzerFactory for UnicodeAnalyzerFactory {
fn create(&self) -> Box<dyn Analyzer> {
Box::new(UnicodeAnalyzer::new())
}
}
#[cfg(test)]
mod tests {
use super::*;
use assertables::*;
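    // Runs the analyzer over `text` with the default chunk size and collects
    // (text, offset, position_increment) triples for assertion.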
fn collect_tokens(text: &str) -> Vec<(String, TermOffset, i32)> {
let mut analyzer = UnicodeAnalyzer::default();
analyzer.set_reader(Box::new(io::Cursor::new(text.as_bytes().to_vec())));
let mut result = Vec::new();
while let Some(token) = analyzer.next_token().unwrap() {
result.push((
token.text.to_string(),
token.offset,
token.position_increment,
));
}
result
}
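    // Like `collect_tokens`, but forces a small chunk capacity so tokens can
    // span chunk boundaries.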
fn collect_tokens_chunked(text: &str, capacity: usize) -> Vec<(String, TermOffset, i32)> {
let reader: Box<dyn Read + Send> = Box::new(io::Cursor::new(text.as_bytes().to_vec()));
let mut analyzer = UnicodeAnalyzer::with_capacity(capacity, reader);
let mut result = Vec::new();
while let Some(token) = analyzer.next_token().unwrap() {
result.push((
token.text.to_string(),
token.offset,
token.position_increment,
));
}
result
}
fn texts(tokens: &[(String, TermOffset, i32)]) -> Vec<&str> {
tokens.iter().map(|t| t.0.as_str()).collect()
}
#[test]
fn test_simple_english() {
let tokens = collect_tokens("The quick brown fox");
assert_eq!(texts(&tokens), vec!["the", "quick", "brown", "fox"]);
}
#[test]
fn test_lowercases_tokens() {
let tokens = collect_tokens("Hello WORLD");
assert_eq!(tokens[0].0, "hello");
assert_eq!(tokens[1].0, "world");
}
#[test]
fn test_empty_string() {
let tokens = collect_tokens("");
assert_is_empty!(&tokens);
}
#[test]
fn test_position_increments() {
let tokens = collect_tokens("one two three");
for t in &tokens {
assert_eq!(t.2, 1);
}
}
#[test]
fn test_set_reader_reuse() {
let mut analyzer = UnicodeAnalyzer::default();
analyzer.set_reader(Box::new(io::Cursor::new(b"hello".to_vec())));
let token = analyzer.next_token().unwrap();
assert_eq!(token.unwrap().text, "hello");
let none = analyzer.next_token().unwrap();
assert_none!(&none);
analyzer.set_reader(Box::new(io::Cursor::new(b"world".to_vec())));
let token = analyzer.next_token().unwrap();
assert_eq!(token.unwrap().text, "world");
}
#[test]
fn test_numeric_grouping() {
let tokens = collect_tokens("1,200");
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "1,200");
}
#[test]
fn test_decimal_numbers() {
let tokens = collect_tokens("12.1 3.14159");
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "12.1");
assert_eq!(tokens[1].0, "3.14159");
}
#[test]
fn test_urls() {
let tokens = collect_tokens("www.gutenberg.org www.pgdp.net");
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "www.gutenberg.org");
assert_eq!(tokens[1].0, "www.pgdp.net");
}
#[test]
fn test_email_domain() {
let tokens = collect_tokens("user@example.com");
let t = texts(&tokens);
assert_eq!(t, vec!["user", "example.com"]);
}
#[test]
fn test_contractions() {
let tokens = collect_tokens("don't can't it's they're");
assert_eq!(texts(&tokens), vec!["don't", "can't", "it's", "they're"]);
}
#[test]
fn test_possessives() {
let tokens = collect_tokens("Todd's dog's");
assert_eq!(texts(&tokens), vec!["todd's", "dog's"]);
}
#[test]
fn test_cjk_ideographs_split() {
let tokens = collect_tokens("宋史");
assert_len_eq_x!(&tokens, 2);
assert_eq!(tokens[0].0, "宋");
assert_eq!(tokens[1].0, "史");
}
#[test]
fn test_cjk_multiple_sequences() {
let tokens = collect_tokens("中文测试 東京");
assert_eq!(texts(&tokens), vec!["中", "文", "测", "试", "東", "京"]);
}
#[test]
fn test_hiragana_split() {
let tokens = collect_tokens("ひらがな");
assert_eq!(texts(&tokens), vec!["ひ", "ら", "が", "な"]);
}
#[test]
fn test_katakana_kept() {
let tokens = collect_tokens("カタカナ");
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "カタカナ");
}
#[test]
fn test_hangul_kept() {
let tokens = collect_tokens("한국어");
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "한국어");
}
#[test]
fn test_accented_text() {
let tokens = collect_tokens("boïelle société café résumé naïve");
assert_eq!(
texts(&tokens),
vec!["boïelle", "société", "café", "résumé", "naïve"]
);
}
#[test]
fn test_unicode_lowercase() {
let tokens = collect_tokens("Société Café");
assert_eq!(tokens[0].0, "société");
assert_eq!(tokens[1].0, "café");
}
#[test]
fn test_smart_quote_contraction() {
let tokens = collect_tokens("don\u{2019}t");
assert_len_eq_x!(&tokens, 1);
assert_eq!(tokens[0].0, "don\u{2019}t");
}
#[test]
fn test_hyphenated_words_split() {
let tokens = collect_tokens("well-known state-of-the-art");
assert_eq!(
texts(&tokens),
vec!["well", "known", "state", "of", "the", "art"]
);
}
#[test]
fn test_underscore_words() {
let tokens = collect_tokens("foo_bar _private");
assert_eq!(texts(&tokens), vec!["foo_bar", "_private"]);
}
#[test]
fn test_mixed_alphanumeric() {
let tokens = collect_tokens("test123 abc456def");
assert_eq!(texts(&tokens), vec!["test123", "abc456def"]);
}
#[test]
fn test_full_uax29_document() {
let input = "Simple English words here.\n\
Numbers like 1,200 and 12.1 and 3.14159 should stay together.\n\
URLs like www.gutenberg.org and www.pgdp.net are single tokens.\n\
Email test user@example.com should be one token.\n\
Contractions: don't can't it's they're\n\
Possessives: Todd's dog's\n\
CJK ideographs: 宋史 中文测试 東京\n\
Hiragana: ひらがな\n\
Katakana: カタカナ\n\
Hangul: 한국어\n\
Accented: boïelle société d\u{2019}académie café résumé naïve\n\
Smart quote contraction: don\u{2019}t\n\
Mixed: test123 abc456def\n\
Hyphenated: well-known state-of-the-art\n\
Underscore: foo_bar _private\n";
let expected = vec![
"simple",
"english",
"words",
"here",
"numbers",
"like",
"1,200",
"and",
"12.1",
"and",
"3.14159",
"should",
"stay",
"together",
"urls",
"like",
"www.gutenberg.org",
"and",
"www.pgdp.net",
"are",
"single",
"tokens",
"email",
"test",
"user",
"example.com",
"should",
"be",
"one",
"token",
"contractions",
"don't",
"can't",
"it's",
"they're",
"possessives",
"todd's",
"dog's",
"cjk",
"ideographs",
"宋",
"史",
"中",
"文",
"测",
"试",
"東",
"京",
"hiragana",
"ひ",
"ら",
"が",
"な",
"katakana",
"カタカナ",
"hangul",
"한국어",
"accented",
"boïelle",
"société",
"d\u{2019}académie",
"café",
"résumé",
"naïve",
"smart",
"quote",
"contraction",
"don\u{2019}t",
"mixed",
"test123",
"abc456def",
"hyphenated",
"well",
"known",
"state",
"of",
"the",
"art",
"underscore",
"foo_bar",
"_private",
];
let tokens = collect_tokens(input);
let actual = texts(&tokens);
assert_eq!(actual, expected);
}
#[test]
fn test_token_spanning_chunk_boundary() {
let tokens = collect_tokens_chunked("hello world", 4);
assert_eq!(texts(&tokens), vec!["hello", "world"]);
}
#[test]
fn test_contraction_spanning_boundary() {
let tokens = collect_tokens_chunked("don't stop", 4);
assert_eq!(texts(&tokens), vec!["don't", "stop"]);
}
#[test]
fn test_numeric_grouping_spanning_boundary() {
let tokens = collect_tokens_chunked("1,200 test", 3);
assert_eq!(texts(&tokens), vec!["1,200", "test"]);
}
#[test]
fn test_url_spanning_boundary() {
let tokens = collect_tokens_chunked("www.gutenberg.org test", 6);
assert_eq!(texts(&tokens), vec!["www.gutenberg.org", "test"]);
}
#[test]
fn test_cjk_spanning_boundary() {
let tokens = collect_tokens_chunked("宋史", 4);
assert_eq!(texts(&tokens), vec!["宋", "史"]);
}
#[test]
fn test_tiny_chunks_match_default() {
let input = "The quick brown fox don't jump over the lazy dog's bed";
let default_tokens = collect_tokens(input);
let chunked_tokens = collect_tokens_chunked(input, 4);
assert_eq!(texts(&chunked_tokens), texts(&default_tokens));
}
#[test]
fn test_empty_input_chunked() {
let tokens = collect_tokens_chunked("", 4);
assert_is_empty!(&tokens);
}
#[test]
fn test_many_tokens_tiny_chunks() {
let input = "a b c d e f g h i j";
let tokens = collect_tokens_chunked(input, 3);
assert_eq!(
texts(&tokens),
vec!["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
);
}
}