pub struct TextProcessor { /* private fields */ }
Expand description
Text processing utilities for chunking and preprocessing
Implementations§
Source§impl TextProcessor
impl TextProcessor
Sourcepub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self>
pub fn new(chunk_size: usize, chunk_overlap: usize) -> Result<Self>
Create a new text processor
Sourcepub fn with_parallel_processing(
chunk_size: usize,
chunk_overlap: usize,
parallel_processor: ParallelProcessor,
) -> Result<Self>
pub fn with_parallel_processing( chunk_size: usize, chunk_overlap: usize, parallel_processor: ParallelProcessor, ) -> Result<Self>
Create a new text processor with parallel processing support
Sourcepub fn chunk_text_hierarchical(
&self,
document: &Document,
) -> Result<Vec<TextChunk>>
pub fn chunk_text_hierarchical( &self, document: &Document, ) -> Result<Vec<TextChunk>>
Split text into chunks with overlap using hierarchical boundary preservation
Sourcepub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>>
pub fn chunk_text(&self, document: &Document) -> Result<Vec<TextChunk>>
Split text into chunks with overlap (legacy method)
Sourcepub fn chunk_text_with_enrichment(
&self,
document: &Document,
enricher: &mut ChunkEnricher,
) -> Result<Vec<TextChunk>>
pub fn chunk_text_with_enrichment( &self, document: &Document, enricher: &mut ChunkEnricher, ) -> Result<Vec<TextChunk>>
Chunk text and enrich with semantic metadata
Sourcepub fn chunk_text_hierarchical_with_enrichment(
&self,
document: &Document,
enricher: &mut ChunkEnricher,
) -> Result<Vec<TextChunk>>
pub fn chunk_text_hierarchical_with_enrichment( &self, document: &Document, enricher: &mut ChunkEnricher, ) -> Result<Vec<TextChunk>>
Chunk text hierarchically and enrich with semantic metadata
Sourcepub fn create_default_enricher(document: &Document) -> ChunkEnricher
pub fn create_default_enricher(document: &Document) -> ChunkEnricher
Create a default enricher for document processing
Sourcepub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>>
pub fn chunk_and_enrich(&self, document: &Document) -> Result<Vec<TextChunk>>
Convenience method: chunk and enrich with auto-detected format
Sourcepub fn chunk_hierarchical_and_enrich(
&self,
document: &Document,
) -> Result<Vec<TextChunk>>
pub fn chunk_hierarchical_and_enrich( &self, document: &Document, ) -> Result<Vec<TextChunk>>
Convenience method: chunk hierarchically and enrich with auto-detected format
Sourcepub fn chunk_with_strategy(
&self,
document: &Document,
strategy: &dyn ChunkingStrategy,
) -> Result<Vec<TextChunk>>
pub fn chunk_with_strategy( &self, document: &Document, strategy: &dyn ChunkingStrategy, ) -> Result<Vec<TextChunk>>
Chunk text using any strategy that implements ChunkingStrategy trait
This method provides a flexible way to use different chunking approaches while maintaining the same interface.
§Arguments
- document - The document to chunk
- strategy - Any type implementing ChunkingStrategy
§Returns
A vector of TextChunk objects
§Examples
use graphrag_core::text::{TextProcessor, HierarchicalChunkingStrategy};
let processor = TextProcessor::new(1000, 100)?;
let strategy = HierarchicalChunkingStrategy::new(500, 50, document.id.clone());
let chunks = processor.chunk_with_strategy(&document, &strategy)?;
Sourcepub fn clean_text(&self, text: &str) -> String
pub fn clean_text(&self, text: &str) -> String
Clean and normalize text
Sourcepub fn extract_sentences(&self, text: &str) -> Vec<String>
pub fn extract_sentences(&self, text: &str) -> Vec<String>
Extract sentences from text
Sourcepub fn word_count(&self, text: &str) -> usize
pub fn word_count(&self, text: &str) -> usize
Count words in text
Sourcepub fn batch_chunk_documents(
&self,
documents: Vec<Document>,
) -> Result<Vec<Vec<TextChunk>>>
pub fn batch_chunk_documents( &self, documents: Vec<Document>, ) -> Result<Vec<Vec<TextChunk>>>
Process multiple documents in parallel
Sourcepub fn batch_extract_keywords(
&self,
texts: &[&str],
max_keywords: usize,
) -> Vec<Vec<String>>
pub fn batch_extract_keywords( &self, texts: &[&str], max_keywords: usize, ) -> Vec<Vec<String>>
Parallel extraction of keywords from multiple texts
Sourcepub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>>
pub fn batch_extract_sentences(&self, texts: &[&str]) -> Vec<Vec<String>>
Parallel sentence extraction from multiple texts
Sourcepub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String>
pub fn batch_clean_text(&self, texts: &[&str]) -> Vec<String>
Parallel text cleaning for multiple texts
Sourcepub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String>
pub fn extract_keywords(&self, text: &str, max_keywords: usize) -> Vec<String>
Extract keywords using simple frequency analysis
Sourcepub fn get_performance_stats(&self) -> (usize, Duration)
pub fn get_performance_stats(&self) -> (usize, Duration)
Get performance statistics
Sourcepub fn average_processing_time(&self) -> Duration
pub fn average_processing_time(&self) -> Duration
Get average processing time per operation
Sourcepub fn reset_performance_stats(&mut self)
pub fn reset_performance_stats(&mut self)
Reset performance monitoring statistics
Sourcepub fn get_parallel_stats(&self) -> Option<ParallelStatistics>
pub fn get_parallel_stats(&self) -> Option<ParallelStatistics>
Get parallel processing statistics if available
Trait Implementations§
Auto Trait Implementations§
impl Freeze for TextProcessor
impl RefUnwindSafe for TextProcessor
impl Send for TextProcessor
impl Sync for TextProcessor
impl Unpin for TextProcessor
impl UnsafeUnpin for TextProcessor
impl UnwindSafe for TextProcessor
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T where
T: ?Sized,
impl<T> BorrowMut<T> for T where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> Instrument for T
impl<T> Instrument for T
Source§fn instrument(self, span: Span) -> Instrumented<Self>
fn instrument(self, span: Span) -> Instrumented<Self>
Source§fn in_current_span(self) -> Instrumented<Self>
fn in_current_span(self) -> Instrumented<Self>
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more