pub struct DocumentPreprocessor {
pub clean_whitespace: bool,
pub normalize_unicode: bool,
pub detect_language: bool,
pub chunk_size: Option<usize>,
}
Expand description
Document preprocessor for cleaning and normalizing text.
Fields§
clean_whitespace: bool — Normalize whitespace (collapse multiple spaces, normalize line breaks)
normalize_unicode: bool — Normalize Unicode (NFC normalization)
detect_language: bool — Detect and record language
chunk_size: Option<usize> — Maximum chunk size (None = no chunking)
Implementations§
Source§impl DocumentPreprocessor
impl DocumentPreprocessor
Source§pub fn with_all_cleaning() -> Self
pub fn with_all_cleaning() -> Self
Create a preprocessor with all cleaning enabled.
Source§pub fn prepare(&self, text: &str) -> PreparedDocument
pub fn prepare(&self, text: &str) -> PreparedDocument
Prepare text for entity extraction.
Trait Implementations§
Source§impl Clone for DocumentPreprocessor
impl Clone for DocumentPreprocessor
Source§fn clone(&self) -> DocumentPreprocessor
fn clone(&self) -> DocumentPreprocessor
Returns a duplicate of the value. Read more
1.0.0 · Source§fn clone_from(&mut self, source: &Self)
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from source. Read more
Source§impl Debug for DocumentPreprocessor
impl Debug for DocumentPreprocessor
Auto Trait Implementations§
impl Freeze for DocumentPreprocessor
impl RefUnwindSafe for DocumentPreprocessor
impl Send for DocumentPreprocessor
impl Sync for DocumentPreprocessor
impl Unpin for DocumentPreprocessor
impl UnsafeUnpin for DocumentPreprocessor
impl UnwindSafe for DocumentPreprocessor
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
    T: ?Sized,
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
Source§impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> CloneToUninit for T
where
    T: Clone,
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise. Read more