triplets_core/preprocessor/mod.rs
//! Pluggable text preprocessor infrastructure.
//!
//! Preprocessors run as a sequential pipeline inside
//! [`crate::chunking::SlidingWindowChunker`] before tokenization. Each
//! preprocessor receives the text of a section and returns either
//! `Some(transformed)` or `None`. A `None` return from any stage
//! short-circuits the remaining stages and causes the entire section to be
//! dropped — no chunks are produced from it.
//!
//! ## Registration
//!
//! Preprocessors are registered on a [`crate::config::ChunkingStrategy`] via
//! [`crate::config::ChunkingStrategy::register_preprocessor`]:
//!
//! ```rust
//! use triplets_core::{ChunkingStrategy, DenoiserConfig, DenoiserPreprocessor};
//!
//! let mut strategy = ChunkingStrategy::default();
//! strategy.register_preprocessor(DenoiserPreprocessor::new(DenoiserConfig {
//!     enabled: true,
//!     max_digit_ratio: 0.35,
//!     strip_markdown: true,
//! }));
//! ```
//!
//! Multiple preprocessors run in registration order; the output of one feeds
//! the next.
28
29/// Built-in preprocessor implementations.
30pub mod backends;
31
/// A single stage in the text-preprocessing pipeline.
///
/// Implementors transform or filter section text before it is tokenized and
/// chunked. Stages run sequentially: the output of each stage is the input of
/// the next one.
///
/// The `Send + Sync` supertrait bounds mean every implementor must be safe to
/// share and move across threads.
///
/// # Examples
///
/// ```rust
/// use triplets_core::TextPreprocessor;
///
/// struct UppercasePreprocessor;
///
/// impl TextPreprocessor for UppercasePreprocessor {
///     fn process(&self, text: &str) -> Option<String> {
///         Some(text.to_uppercase())
///     }
/// }
/// ```
pub trait TextPreprocessor: Send + Sync {
    /// Processes one block of section text.
    ///
    /// Returns `Some(transformed)` carrying the (possibly modified) text, or
    /// `None` to signal that the section should be discarded entirely, in
    /// which case no chunks are produced from it.
    fn process(&self, text: &str) -> Option<String>;
}