1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
use std::iter::Enumerate;
use tantivy::tokenizer::{Token, TokenStream};

/// A tantivy [`TokenStream`] over the segments returned by `tinysegmenter::tokenize`.
///
/// The whole input is segmented once in [`TinySegmenterTokenStream::new`]; every
/// successful call to `advance` then publishes the next segment as the current token.
pub struct TinySegmenterTokenStream {
    /// Segments not yet emitted, each paired with its zero-based index.
    tinyseg_enum: Enumerate<std::vec::IntoIter<String>>,
    /// The token most recently built by `advance` (a default token before the first call).
    current_token: Token,
    /// Start offset of the current token; equals the previous token's end offset.
    offset_from: usize,
    /// End offset of the current token: the running sum of emitted segment byte lengths.
    offset_to: usize,
}

impl TinySegmenterTokenStream {
    /// Segments `text` with `tinysegmenter::tokenize` and returns a stream positioned
    /// before the first segment, with both offsets at zero.
    pub fn new(text: &str) -> TinySegmenterTokenStream {
        TinySegmenterTokenStream {
            tinyseg_enum: tinysegmenter::tokenize(text).into_iter().enumerate(),
            current_token: Token::default(),
            offset_from: 0,
            offset_to: 0,
        }
    }
}

impl TokenStream for TinySegmenterTokenStream {
    /// Moves to the next segment and rebuilds the current token from it.
    ///
    /// Returns `true` when a segment was available. Returns `false` once the segments
    /// are exhausted; in that case the current token is left as it was.
    fn advance(&mut self) -> bool {
        match self.tinyseg_enum.next() {
            Some((position, text)) => {
                // Offsets are accumulated from segment byte lengths rather than looked
                // up in the source text.
                // NOTE(review): this assumes the segments are contiguous and cover the
                // input without gaps -- confirm against tinysegmenter's behavior.
                self.offset_from = self.offset_to;
                self.offset_to = self.offset_from + text.len();
                self.current_token = Token {
                    offset_from: self.offset_from,
                    offset_to: self.offset_to,
                    position,
                    text,
                    position_length: 1,
                };
                true
            }
            None => false,
        }
    }

    /// Borrows the current token.
    fn token(&self) -> &Token {
        &self.current_token
    }

    /// Mutably borrows the current token.
    fn token_mut(&mut self) -> &mut Token {
        &mut self.current_token
    }
}