Skip to main content

triplets_core/
config.rs

1use crate::data::SectionRole;
2use crate::splits::{SplitLabel, SplitRatios};
3use std::borrow::Cow;
4use std::sync::Arc;
5
/// Settings for the OCR denoiser, which filters digit-heavy text.
///
/// When enabled, text sections that are predominantly numerical (e.g. mangled OCR tables)
/// are reduced, line by line, to their alphabetical content, and dropped entirely when no
/// alphabetical content remains.
///
/// The type implements `PartialEq` so two configurations can be compared directly, for
/// example in tests. `Eq` is intentionally absent because `max_digit_ratio` is an `f32`.
#[derive(Clone, Debug, PartialEq)]
pub struct DenoiserConfig {
    /// Whether denoising is active. Defaults to `false` so existing behavior is unchanged.
    pub enabled: bool,
    /// Maximum ratio of digit characters to (digit + alphabetical) characters before a
    /// line is considered mangled OCR output. Range: `0.0`–`1.0`.
    ///
    /// For example, `0.35` means that a line whose alphanumeric characters are more than
    /// 35% digits is treated as a mangled table row and stripped down to its alphabetical
    /// tokens.
    ///
    /// Defaults to `0.35`.
    pub max_digit_ratio: f32,
    /// Whether to strip common markdown formatting boundaries (e.g. pipe `|` table
    /// boundaries, dropping layout-only separator rows like `|---|---|`).
    ///
    /// Currently covers GFM tables; may expand to other structural markers in the future.
    /// Semantic text is preserved.
    ///
    /// Defaults to `true`.
    pub strip_markdown: bool,
}
33
34impl Default for DenoiserConfig {
35    fn default() -> Self {
36        Self {
37            enabled: false,
38            max_digit_ratio: 0.35,
39            strip_markdown: true,
40        }
41    }
42}
43
44/// Controls how long text sections are chunked and weighted.
45pub struct ChunkingStrategy {
46    /// Max tokens per window when slicing a section into chunks.
47    pub max_window_tokens: usize,
48    /// Overlap sizes (in tokens) used when sliding windows across a section.
49    pub overlap_tokens: Vec<usize>,
50    /// Weight assigned to summary-fallback chunks (when generated).
51    pub summary_fallback_weight: f32,
52    /// Max tokens for summary-fallback chunks (0 disables fallback chunks).
53    pub summary_fallback_tokens: usize,
54    /// Floor applied to per-chunk weight after offset or summary fallback weighting.
55    pub chunk_weight_floor: f32,
56    /// Pluggable text preprocessors applied in order before chunking.
57    pub(crate) preprocessors: Vec<Arc<dyn crate::preprocessor::TextPreprocessor>>,
58}
59
60impl Default for ChunkingStrategy {
61    fn default() -> Self {
62        Self {
63            max_window_tokens: 1024,
64            overlap_tokens: vec![64],
65            summary_fallback_weight: 0.35,
66            summary_fallback_tokens: 512,
67            chunk_weight_floor: 0.1,
68            preprocessors: Vec::new(),
69        }
70    }
71}
72
73impl Clone for ChunkingStrategy {
74    fn clone(&self) -> Self {
75        Self {
76            max_window_tokens: self.max_window_tokens,
77            overlap_tokens: self.overlap_tokens.clone(),
78            summary_fallback_weight: self.summary_fallback_weight,
79            summary_fallback_tokens: self.summary_fallback_tokens,
80            chunk_weight_floor: self.chunk_weight_floor,
81            preprocessors: self.preprocessors.clone(),
82        }
83    }
84}
85
86impl std::fmt::Debug for ChunkingStrategy {
87    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
88        f.debug_struct("ChunkingStrategy")
89            .field("max_window_tokens", &self.max_window_tokens)
90            .field("overlap_tokens", &self.overlap_tokens)
91            .field("summary_fallback_weight", &self.summary_fallback_weight)
92            .field("summary_fallback_tokens", &self.summary_fallback_tokens)
93            .field("chunk_weight_floor", &self.chunk_weight_floor)
94            .field(
95                "preprocessors",
96                &format_args!("{} registered", self.preprocessors.len()),
97            )
98            .finish()
99    }
100}
101
102impl ChunkingStrategy {
103    /// Register a [`crate::preprocessor::TextPreprocessor`] to run before chunking.
104    ///
105    /// Preprocessors are applied in registration order; if any returns `None`
106    /// the section is dropped and produces no chunks.
107    pub fn register_preprocessor(
108        &mut self,
109        p: impl crate::preprocessor::TextPreprocessor + 'static,
110    ) -> &mut Self {
111        self.preprocessors.push(Arc::new(p));
112        self
113    }
114
115    /// Return the slice of registered preprocessors.
116    pub fn preprocessors(&self) -> &[Arc<dyn crate::preprocessor::TextPreprocessor>] {
117        &self.preprocessors
118    }
119}
120
121/// Defines a triplet recipe (anchor/positive/negative selection + weighting).
122///
123/// ## Split-isolation contract
124///
125/// All three chunk slots (anchor, positive, negative) must resolve to records
126/// whose IDs hash to the same split as the request split. The sampler enforces
127/// this automatically for `Selector::Role`, `Selector::Paragraph`, and
128/// `Selector::Random` — those selectors always read from the record that was
129/// already confirmed to be in the correct split.
130///
131/// `Selector::TemporalOffset` crosses a record boundary (it picks a *different*
132/// record by proximity in time) and the split check is re-applied inside
133/// `select_temporal_neighbor`. No additional care is required on your side,
134/// but you should be aware that in pools with few same-split neighbors the
135/// selector will return `None` and fall back to skipping a slot rather than
136/// contaminating splits.
137///
138/// ## Stable IDs
139///
140/// Record IDs must be stable across runs. Split assignment is derived
141/// deterministically from the record ID and the sampler seed; changing an ID
142/// changes its split assignment, which invalidates any persisted split state.
143/// IDs should also be globally unique — if two records from different sources
144/// share the same ID, only one will be kept in the sampler, and the discarded
145/// record's split assignment silently goes with it.
146#[derive(Clone, Debug)]
147pub struct TripletRecipe {
148    /// Unique name for this recipe.
149    pub name: Cow<'static, str>,
150    /// Selector used for anchor chunks.
151    pub anchor: Selector,
152    /// Selector used for positive chunks (same record).
153    pub positive_selector: Selector,
154    /// Selector used for negative chunks (different record).
155    pub negative_selector: Selector,
156    /// Strategy used to pick negatives.
157    pub negative_strategy: NegativeStrategy,
158    /// Relative weight controlling how often this recipe is selected versus other recipes.
159    ///
160    /// Each recipe with a positive weight receives a number of slots in the shuffled selection
161    /// order proportional to `weight / min_positive_weight` across all active recipes, so a
162    /// recipe with `weight = 2.0` is drawn approximately twice as often as one with `weight =
163    /// 1.0`.  The weight also scales the `weight` field on every [`crate::SampleTriplet`]
164    /// returned by this recipe, which the caller's training loop can use for loss weighting.
165    ///
166    /// Recipes with `weight <= 0.0` are excluded from selection entirely and no samples
167    /// are produced for them.
168    pub weight: f32,
169    /// Optional instruction text attached to samples from this recipe.
170    pub instruction: Option<Cow<'static, str>>,
171    /// Allow anchor and positive to carry identical text (SimCSE / dropout-trick mode).
172    ///
173    /// When `true`, the sampler will emit triplets even when the anchor and positive
174    /// sections resolve to the same text.  This enables the unsupervised SimCSE
175    /// training pattern: the same text string feeds both slots, and the model's
176    /// dropout layers produce two slightly different embeddings at training time.
177    ///
178    /// Negatives are still required to differ from both anchor and positive.
179    ///
180    /// Defaults to `false`; set `true` only for recipes whose anchor and positive
181    /// selectors intentionally resolve to the same content (e.g. text-only sources).
182    pub allow_same_anchor_positive: bool,
183}
184
185impl Default for TripletRecipe {
186    fn default() -> Self {
187        Self {
188            name: "".into(),
189            anchor: Selector::Random,
190            positive_selector: Selector::Random,
191            negative_selector: Selector::Random,
192            negative_strategy: NegativeStrategy::WrongArticle,
193            weight: 1.0,
194            instruction: None,
195            allow_same_anchor_positive: false,
196        }
197    }
198}
199
200/// Selector for choosing a section or neighboring record.
201#[derive(Clone, Debug, PartialEq, Eq)]
202pub enum Selector {
203    /// Select a section by role.
204    Role(SectionRole),
205    /// Select a specific section by index.
206    Paragraph(usize),
207    /// Select a temporal neighbor record by offset days.
208    ///
209    /// Candidates are restricted to the same split as the requesting record;
210    /// if no same-split neighbor exists the selector returns `None` for that
211    /// slot rather than crossing split boundaries.
212    TemporalOffset(i32),
213    /// Select a random section.
214    Random,
215}
216
217/// Defines how to build a text sample from a record.
218#[derive(Clone, Debug)]
219pub struct TextRecipe {
220    /// Unique name for this recipe.
221    pub name: Cow<'static, str>,
222    /// Selector used for text chunks.
223    pub selector: Selector,
224    /// Relative weight controlling how often this recipe is selected versus other recipes.
225    ///
226    /// Each recipe with a positive weight receives a number of slots in the shuffled selection
227    /// order proportional to `weight / min_positive_weight` across all active recipes, so a
228    /// recipe with `weight = 2.0` is drawn approximately twice as often as one with `weight =
229    /// 1.0`.  The weight also scales the `weight` field on every [`crate::TextSample`]
230    /// returned by this recipe, which the caller's training loop can use for loss weighting.
231    ///
232    /// Recipes with `weight <= 0.0` are excluded from selection entirely and no samples
233    /// are produced for them.
234    pub weight: f32,
235    /// Optional instruction text attached to samples from this recipe.
236    pub instruction: Option<Cow<'static, str>>,
237}
238
/// Strategy for picking the negative *record* in a triplet.
///
/// Each variant defines the candidate pool from which the negative record is drawn.
/// By default all variants scope candidates to the same source as the anchor, so
/// negatives are hard relative to the source domain rather than trivially
/// cross-domain. A same-split fallback engages automatically when the in-source
/// pool is too small (for example a source with only one record in the split).
///
/// When the `bm25-mining` feature is enabled, BM25 lexical re-ranking is applied
/// on top of the strategy-filtered pool — BM25 re-orders candidates by keyword
/// overlap with the anchor, but does not widen or replace the strategy pool.
/// BM25 is a first-pass lexical ranker, not a semantic one; it is well-suited for
/// lifting average negative quality without an encoder at data-generation time.
/// Semantic or embedding-based re-ranking (iterative hard-negative mining with the
/// trained encoder, cross-encoder scoring, dense retrieval) is out of scope for
/// the data pipeline and can be integrated by pre-ranking negatives before
/// ingestion or by reweighting source batches in the training loop.
///
/// The enum derives `PartialEq`/`Eq` (matching `Selector`) so strategies can be
/// compared with `==` and asserted on with `assert_eq!`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum NegativeStrategy {
    /// Choose a record with a different publication date from record metadata.
    ///
    /// This refers to metadata/taxonomy publication-date values (for example
    /// `META_FIELD_DATE`), not filesystem timestamps like mtime/ctime/atime.
    WrongPublicationDate,
    /// Choose a different record from the same source.
    ///
    /// Negatives are drawn from within the same source, making them hard relative
    /// to the source domain. This is appropriate when each source represents a
    /// coherent domain (a collection of finance articles, a physics paper set,
    /// etc.) where same-source records are already confusable. If your sources are
    /// not meaningful domain boundaries, the fallback path (same split, any source)
    /// is the relevant escape hatch.
    WrongArticle,
    /// Choose a mismatched Q/A pair.
    QuestionAnswerMismatch,
}
275
276/// Top-level sampler configuration.
277#[derive(Clone, Debug)]
278pub struct SamplerConfig {
279    /// RNG seed that controls deterministic sampling order.
280    pub seed: u64,
281    /// Target number of samples per batch.
282    pub batch_size: usize,
283    /// Max number of records kept in the ingestion cache for candidate sampling.
284    ///
285    /// This is intentionally decoupled from `batch_size` so anchors/negatives can
286    /// be drawn from a broader rolling pool.
287    ///
288    /// Practical tuning: values above `batch_size` usually improve diversity and
289    /// reduce short-horizon repetition; gains taper off as source/recipe/split
290    /// constraints become the limiting factor. Higher values also increase memory.
291    ///
292    /// For remote shard-backed sources (for example Hugging Face), larger initial
293    /// targets may require fetching more shards before the first batch, so startup
294    /// latency can increase based on shard sizes and network throughput.
295    pub ingestion_max_records: usize,
296    /// Chunking behavior for long sections.
297    pub chunking: ChunkingStrategy,
298    /// Triplet recipes to use; empty means sources may provide defaults.
299    pub recipes: Vec<TripletRecipe>,
300    /// Text recipes to use; empty means derived from triplet recipes if available.
301    pub text_recipes: Vec<TextRecipe>,
302    /// Split ratios used when assigning records to train/val/test.
303    pub split: SplitRatios,
304    /// Splits allowed for sampling requests.
305    pub allowed_splits: Vec<SplitLabel>,
306}
307
308impl Default for SamplerConfig {
309    fn default() -> Self {
310        Self {
311            seed: 42,
312            batch_size: 128,
313            ingestion_max_records: 2048,
314            chunking: ChunkingStrategy::default(),
315            recipes: Vec::new(),
316            text_recipes: Vec::new(),
317            split: SplitRatios::default(),
318            allowed_splits: vec![SplitLabel::Train],
319        }
320    }
321}
322
323impl SamplerConfig {
324    /// Consuming builder to enable the built-in OCR/markdown denoiser on
325    /// the sampler's chunking strategy.
326    ///
327    /// Chains denoiser setup during `SamplerConfig` construction. Works with
328    /// struct update syntax to customize other fields at the same time:
329    ///
330    /// ```rust,no_run
331    /// use triplets_core::{SamplerConfig, config::DenoiserConfig};
332    ///
333    /// // Enable denoiser with all other fields at their defaults:
334    /// let config = SamplerConfig::default()
335    ///     .with_denoiser(DenoiserConfig { enabled: true, ..DenoiserConfig::default() });
336    ///
337    /// // Or customize other fields first, then add the denoiser:
338    /// let config = SamplerConfig { batch_size: 32, ..SamplerConfig::default() }
339    ///     .with_denoiser(DenoiserConfig { enabled: true, ..DenoiserConfig::default() });
340    /// ```
341    pub fn with_denoiser(mut self, config: DenoiserConfig) -> Self {
342        use crate::preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor;
343        self.chunking
344            .preprocessors
345            .push(Arc::new(DenoiserPreprocessor::new(config)));
346        self
347    }
348}
349
#[cfg(test)]
mod tests {
    //! Unit tests pinning the default values and the hand-written trait impls of the
    //! configuration types in this module.

    use super::*;

    /// The denoiser must stay opt-in with the documented threshold and markdown setting.
    #[test]
    fn denoiser_config_defaults_are_expected() {
        let cfg = DenoiserConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.max_digit_ratio, 0.35);
        assert!(cfg.strip_markdown);
    }

    #[test]
    fn chunking_strategy_defaults_are_stable() {
        let cfg = ChunkingStrategy::default();
        assert_eq!(cfg.max_window_tokens, 1024);
        assert_eq!(cfg.overlap_tokens, vec![64]);
        assert_eq!(cfg.summary_fallback_weight, 0.35);
        assert_eq!(cfg.summary_fallback_tokens, 512);
        assert_eq!(cfg.chunk_weight_floor, 0.1);
        // No preprocessor is registered unless the caller asks for one.
        assert!(cfg.preprocessors().is_empty());
    }

    /// The manual `Clone` impl must carry every field over unchanged.
    #[test]
    fn chunking_strategy_clone_preserves_fields() {
        let original = ChunkingStrategy {
            max_window_tokens: 256,
            overlap_tokens: vec![8, 16],
            summary_fallback_weight: 0.5,
            summary_fallback_tokens: 0,
            chunk_weight_floor: 0.25,
            ..ChunkingStrategy::default()
        };
        let copy = original.clone();

        assert_eq!(copy.max_window_tokens, original.max_window_tokens);
        assert_eq!(copy.overlap_tokens, original.overlap_tokens);
        assert_eq!(copy.summary_fallback_weight, original.summary_fallback_weight);
        assert_eq!(copy.summary_fallback_tokens, original.summary_fallback_tokens);
        assert_eq!(copy.chunk_weight_floor, original.chunk_weight_floor);
        assert_eq!(copy.preprocessors().len(), original.preprocessors().len());
    }

    /// The manual `Debug` impl summarizes preprocessors as a count.
    #[test]
    fn chunking_strategy_debug_summarizes_preprocessors() {
        let rendered = format!("{:?}", ChunkingStrategy::default());
        assert!(rendered.starts_with("ChunkingStrategy"));
        assert!(rendered.contains("max_window_tokens: 1024"));
        assert!(rendered.contains("preprocessors: 0 registered"));
    }

    #[test]
    fn sampler_config_defaults_are_expected() {
        let cfg = SamplerConfig::default();
        assert_eq!(cfg.seed, 42);
        assert_eq!(cfg.batch_size, 128);
        assert_eq!(cfg.ingestion_max_records, 2048);
        assert!(cfg.recipes.is_empty());
        assert!(cfg.text_recipes.is_empty());
        assert_eq!(cfg.allowed_splits, vec![SplitLabel::Train]);
        assert_eq!(cfg.chunking.max_window_tokens, 1024);
    }

    /// `with_denoiser` appends exactly one preprocessor and leaves other fields alone.
    #[test]
    fn with_denoiser_registers_one_preprocessor() {
        let cfg = SamplerConfig {
            batch_size: 32,
            ..SamplerConfig::default()
        }
        .with_denoiser(DenoiserConfig {
            enabled: true,
            ..DenoiserConfig::default()
        });

        assert_eq!(cfg.batch_size, 32);
        assert_eq!(cfg.chunking.preprocessors().len(), 1);
    }

    #[test]
    fn selector_variants_can_be_constructed() {
        let role = Selector::Role(SectionRole::Anchor);
        let paragraph = Selector::Paragraph(3);
        let temporal = Selector::TemporalOffset(-2);
        let random = Selector::Random;

        assert!(matches!(role, Selector::Role(SectionRole::Anchor)));
        assert!(matches!(paragraph, Selector::Paragraph(3)));
        assert!(matches!(temporal, Selector::TemporalOffset(-2)));
        assert!(matches!(random, Selector::Random));
    }

    #[test]
    fn triplet_recipe_default_is_expected() {
        let recipe = TripletRecipe::default();
        assert_eq!(recipe.name.as_ref(), "");
        assert!(matches!(recipe.anchor, Selector::Random));
        assert!(matches!(recipe.positive_selector, Selector::Random));
        assert!(matches!(recipe.negative_selector, Selector::Random));
        assert!(matches!(
            recipe.negative_strategy,
            NegativeStrategy::WrongArticle
        ));
        assert_eq!(recipe.weight, 1.0);
        assert!(recipe.instruction.is_none());
        assert!(!recipe.allow_same_anchor_positive);
    }
}