pub struct Tokenizer { /* private fields */ }
Expand description
Text tokenizer for splitting text into tokens
Implementations§
Source§impl Tokenizer
impl Tokenizer
Source pub fn new() -> LangExtractResult<Self>
pub fn new() -> LangExtractResult<Self>
Create a new tokenizer
Source pub fn tokenize(&self, text: &str) -> LangExtractResult<TokenizedText>
pub fn tokenize(&self, text: &str) -> LangExtractResult<TokenizedText>
Tokenize text into tokens
Source pub fn tokens_text(
&self,
tokenized_text: &TokenizedText,
token_interval: &TokenInterval,
) -> LangExtractResult<String>
pub fn tokens_text(&self, tokenized_text: &TokenizedText, token_interval: &TokenInterval) -> LangExtractResult<String>
Reconstruct text from a token interval
Source pub fn is_end_of_sentence_token(
&self,
text: &str,
tokens: &[Token],
current_idx: usize,
) -> bool
pub fn is_end_of_sentence_token(&self, text: &str, tokens: &[Token], current_idx: usize) -> bool
Check if a punctuation token ends a sentence
Source pub fn is_sentence_break_after_newline(
&self,
text: &str,
tokens: &[Token],
current_idx: usize,
) -> bool
pub fn is_sentence_break_after_newline(&self, text: &str, tokens: &[Token], current_idx: usize) -> bool
Check if there’s a sentence break after a newline
Source pub fn find_sentence_range(
&self,
text: &str,
tokens: &[Token],
start_token_index: usize,
) -> LangExtractResult<TokenInterval>
pub fn find_sentence_range(&self, text: &str, tokens: &[Token], start_token_index: usize) -> LangExtractResult<TokenInterval>
Find sentence range starting from a given token index
Trait Implementations§
Auto Trait Implementations§
impl Freeze for Tokenizer
impl RefUnwindSafe for Tokenizer
impl Send for Tokenizer
impl Sync for Tokenizer
impl Unpin for Tokenizer
impl UnwindSafe for Tokenizer
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more