use std::str::CharIndices;

use crate::{Token, TokenStream, Tokenizer};
11#[derive(Clone, Default)]
12pub struct SimpleTokenizer {
13 token: Token,
14}
15
16pub struct SimpleTokenStream<'a> {
17 text: &'a str,
18 chars: CharIndices<'a>,
19 token: &'a mut Token,
20}
21
22impl Tokenizer for SimpleTokenizer {
23 type TokenStream<'a> = SimpleTokenStream<'a>;
24
25 fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
26 self.token.reset();
27 SimpleTokenStream {
28 text,
29 chars: text.char_indices(),
30 token: &mut self.token,
31 }
32 }
33}
34
35impl SimpleTokenStream<'_> {
36 fn search_token_end(&mut self) -> usize {
37 (&mut self.chars)
38 .filter(|(_, c)| !c.is_alphanumeric())
39 .map(|(offset, _)| offset)
40 .next()
41 .unwrap_or(self.text.len())
42 }
43}
44
45impl TokenStream for SimpleTokenStream<'_> {
46 fn advance(&mut self) -> bool {
47 self.token.text.clear();
48 self.token.position = self.token.position.wrapping_add(1);
49 while let Some((offset_from, c)) = self.chars.next() {
50 if c.is_alphanumeric() {
51 let offset_to = self.search_token_end();
52 self.token.offset_from = offset_from;
53 self.token.offset_to = offset_to;
54 self.token.text.push_str(&self.text[offset_from..offset_to]);
55 return true;
56 }
57 }
58 false
59 }
60
61 fn token(&self) -> &Token {
62 self.token
63 }
64
65 fn token_mut(&mut self) -> &mut Token {
66 self.token
67 }
68}