triplets_core/config.rs
1use crate::data::SectionRole;
2use crate::splits::{SplitLabel, SplitRatios};
3use std::borrow::Cow;
4use std::sync::Arc;
5
/// Settings for the OCR denoiser, which filters out digit-heavy text.
///
/// With the denoiser switched on, sections that are mostly numeric (mangled OCR
/// tables, for instance) are reduced line by line to their alphabetical content;
/// a section is dropped altogether when nothing alphabetical is left.
#[derive(Debug, Clone)]
pub struct DenoiserConfig {
    /// Turns denoising on or off.
    ///
    /// Defaults to `false`, which leaves existing behavior untouched.
    pub enabled: bool,
    /// Upper bound on `digits / (digits + alphabetical)` for a single line before
    /// that line is classified as mangled OCR output. Range: `0.0`–`1.0`.
    ///
    /// Example: with `0.35`, a line on which more than 35% of the alphanumeric
    /// characters are digits is handled as a mangled table row and reduced to its
    /// alphabetical tokens.
    ///
    /// Defaults to `0.35`.
    pub max_digit_ratio: f32,
    /// Controls removal of common markdown formatting boundaries, such as the pipe
    /// (`|`) delimiters of tables and layout-only separator rows like `|---|---|`.
    ///
    /// GFM tables are covered today; other structural markers may be added later.
    /// Semantic text is kept.
    ///
    /// Defaults to `true`.
    pub strip_markdown: bool,
}
33
34impl Default for DenoiserConfig {
35 fn default() -> Self {
36 Self {
37 enabled: false,
38 max_digit_ratio: 0.35,
39 strip_markdown: true,
40 }
41 }
42}
43
44/// Controls how long text sections are chunked and weighted.
45pub struct ChunkingStrategy {
46 /// Max tokens per window when slicing a section into chunks.
47 pub max_window_tokens: usize,
48 /// Overlap sizes (in tokens) used when sliding windows across a section.
49 pub overlap_tokens: Vec<usize>,
50 /// Weight assigned to summary-fallback chunks (when generated).
51 pub summary_fallback_weight: f32,
52 /// Max tokens for summary-fallback chunks (0 disables fallback chunks).
53 pub summary_fallback_tokens: usize,
54 /// Floor applied to per-chunk weight after offset or summary fallback weighting.
55 pub chunk_weight_floor: f32,
56 /// Pluggable text preprocessors applied in order before chunking.
57 pub(crate) preprocessors: Vec<Arc<dyn crate::preprocessor::TextPreprocessor>>,
58}
59
60impl Default for ChunkingStrategy {
61 fn default() -> Self {
62 Self {
63 max_window_tokens: 1024,
64 overlap_tokens: vec![64],
65 summary_fallback_weight: 0.35,
66 summary_fallback_tokens: 512,
67 chunk_weight_floor: 0.1,
68 preprocessors: Vec::new(),
69 }
70 }
71}
72
73impl Clone for ChunkingStrategy {
74 fn clone(&self) -> Self {
75 Self {
76 max_window_tokens: self.max_window_tokens,
77 overlap_tokens: self.overlap_tokens.clone(),
78 summary_fallback_weight: self.summary_fallback_weight,
79 summary_fallback_tokens: self.summary_fallback_tokens,
80 chunk_weight_floor: self.chunk_weight_floor,
81 preprocessors: self.preprocessors.clone(),
82 }
83 }
84}
85
86impl std::fmt::Debug for ChunkingStrategy {
87 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
88 f.debug_struct("ChunkingStrategy")
89 .field("max_window_tokens", &self.max_window_tokens)
90 .field("overlap_tokens", &self.overlap_tokens)
91 .field("summary_fallback_weight", &self.summary_fallback_weight)
92 .field("summary_fallback_tokens", &self.summary_fallback_tokens)
93 .field("chunk_weight_floor", &self.chunk_weight_floor)
94 .field(
95 "preprocessors",
96 &format_args!("{} registered", self.preprocessors.len()),
97 )
98 .finish()
99 }
100}
101
102impl ChunkingStrategy {
103 /// Register a [`crate::preprocessor::TextPreprocessor`] to run before chunking.
104 ///
105 /// Preprocessors are applied in registration order; if any returns `None`
106 /// the section is dropped and produces no chunks.
107 pub fn register_preprocessor(
108 &mut self,
109 p: impl crate::preprocessor::TextPreprocessor + 'static,
110 ) -> &mut Self {
111 self.preprocessors.push(Arc::new(p));
112 self
113 }
114
115 /// Return the slice of registered preprocessors.
116 pub fn preprocessors(&self) -> &[Arc<dyn crate::preprocessor::TextPreprocessor>] {
117 &self.preprocessors
118 }
119}
120
121/// Defines a triplet recipe (anchor/positive/negative selection + weighting).
122///
123/// ## Split-isolation contract
124///
125/// All three chunk slots (anchor, positive, negative) must resolve to records
126/// whose IDs hash to the same split as the request split. The sampler enforces
127/// this automatically for `Selector::Role`, `Selector::Paragraph`, and
128/// `Selector::Random` — those selectors always read from the record that was
129/// already confirmed to be in the correct split.
130///
131/// `Selector::TemporalOffset` crosses a record boundary (it picks a *different*
132/// record by proximity in time) and the split check is re-applied inside
133/// `select_temporal_neighbor`. No additional care is required on your side,
134/// but you should be aware that in pools with few same-split neighbors the
135/// selector will return `None` and fall back to skipping a slot rather than
136/// contaminating splits.
137///
138/// ## Stable IDs
139///
140/// Record IDs must be stable across runs. Split assignment is derived
141/// deterministically from the record ID and the sampler seed; changing an ID
142/// changes its split assignment, which invalidates any persisted split state.
143/// IDs should also be globally unique — if two records from different sources
144/// share the same ID, only one will be kept in the sampler, and the discarded
145/// record's split assignment silently goes with it.
146#[derive(Clone, Debug)]
147pub struct TripletRecipe {
148 /// Unique name for this recipe.
149 pub name: Cow<'static, str>,
150 /// Selector used for anchor chunks.
151 pub anchor: Selector,
152 /// Selector used for positive chunks (same record).
153 pub positive_selector: Selector,
154 /// Selector used for negative chunks (different record).
155 pub negative_selector: Selector,
156 /// Strategy used to pick negatives.
157 pub negative_strategy: NegativeStrategy,
158 /// Relative weight controlling how often this recipe is selected versus other recipes.
159 ///
160 /// Each recipe with a positive weight receives a number of slots in the shuffled selection
161 /// order proportional to `weight / min_positive_weight` across all active recipes, so a
162 /// recipe with `weight = 2.0` is drawn approximately twice as often as one with `weight =
163 /// 1.0`. The weight also scales the `weight` field on every [`crate::SampleTriplet`]
164 /// returned by this recipe, which the caller's training loop can use for loss weighting.
165 ///
166 /// Recipes with `weight <= 0.0` are excluded from selection entirely and no samples
167 /// are produced for them.
168 pub weight: f32,
169 /// Optional instruction text attached to samples from this recipe.
170 pub instruction: Option<Cow<'static, str>>,
171 /// Allow anchor and positive to carry identical text (SimCSE / dropout-trick mode).
172 ///
173 /// When `true`, the sampler will emit triplets even when the anchor and positive
174 /// sections resolve to the same text. This enables the unsupervised SimCSE
175 /// training pattern: the same text string feeds both slots, and the model's
176 /// dropout layers produce two slightly different embeddings at training time.
177 ///
178 /// Negatives are still required to differ from both anchor and positive.
179 ///
180 /// Defaults to `false`; set `true` only for recipes whose anchor and positive
181 /// selectors intentionally resolve to the same content (e.g. text-only sources).
182 pub allow_same_anchor_positive: bool,
183}
184
185impl Default for TripletRecipe {
186 fn default() -> Self {
187 Self {
188 name: "".into(),
189 anchor: Selector::Random,
190 positive_selector: Selector::Random,
191 negative_selector: Selector::Random,
192 negative_strategy: NegativeStrategy::WrongArticle,
193 weight: 1.0,
194 instruction: None,
195 allow_same_anchor_positive: false,
196 }
197 }
198}
199
200/// Selector for choosing a section or neighboring record.
201#[derive(Clone, Debug, PartialEq, Eq)]
202pub enum Selector {
203 /// Select a section by role.
204 Role(SectionRole),
205 /// Select a specific section by index.
206 Paragraph(usize),
207 /// Select a temporal neighbor record by offset days.
208 ///
209 /// Candidates are restricted to the same split as the requesting record;
210 /// if no same-split neighbor exists the selector returns `None` for that
211 /// slot rather than crossing split boundaries.
212 TemporalOffset(i32),
213 /// Select a random section.
214 Random,
215}
216
217/// Defines how to build a text sample from a record.
218#[derive(Clone, Debug)]
219pub struct TextRecipe {
220 /// Unique name for this recipe.
221 pub name: Cow<'static, str>,
222 /// Selector used for text chunks.
223 pub selector: Selector,
224 /// Relative weight controlling how often this recipe is selected versus other recipes.
225 ///
226 /// Each recipe with a positive weight receives a number of slots in the shuffled selection
227 /// order proportional to `weight / min_positive_weight` across all active recipes, so a
228 /// recipe with `weight = 2.0` is drawn approximately twice as often as one with `weight =
229 /// 1.0`. The weight also scales the `weight` field on every [`crate::TextSample`]
230 /// returned by this recipe, which the caller's training loop can use for loss weighting.
231 ///
232 /// Recipes with `weight <= 0.0` are excluded from selection entirely and no samples
233 /// are produced for them.
234 pub weight: f32,
235 /// Optional instruction text attached to samples from this recipe.
236 pub instruction: Option<Cow<'static, str>>,
237}
238
/// Strategy for picking the negative *record* in a triplet.
///
/// Each variant defines the candidate pool from which the negative record is drawn.
/// By default all variants scope candidates to the same source as the anchor, so
/// negatives are hard relative to the source domain rather than trivially
/// cross-domain. A same-split fallback engages automatically when the in-source
/// pool is too small (for example a source with only one record in the split).
///
/// When the `bm25-mining` feature is enabled, BM25 lexical re-ranking is applied
/// on top of the strategy-filtered pool — BM25 re-orders candidates by keyword
/// overlap with the anchor, but does not widen or replace the strategy pool.
/// BM25 is a first-pass lexical ranker, not a semantic one; it is well-suited for
/// lifting average negative quality without an encoder at data-generation time.
/// Semantic or embedding-based re-ranking (iterative hard-negative mining with the
/// trained encoder, cross-encoder scoring, dense retrieval) is out of scope for
/// the data pipeline and can be integrated by pre-ranking negatives before
/// ingestion or by reweighting source batches in the training loop.
///
/// The enum derives `PartialEq` and `Eq`, matching [`Selector`], so strategies can
/// be compared with `==` / `assert_eq!` instead of `matches!`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum NegativeStrategy {
    /// Choose a record with a different publication date from record metadata.
    ///
    /// This refers to metadata/taxonomy publication-date values (for example
    /// `META_FIELD_DATE`), not filesystem timestamps like mtime/ctime/atime.
    WrongPublicationDate,
    /// Choose a different record from the same source.
    ///
    /// Negatives are drawn from within the same source, making them hard relative
    /// to the source domain. This is appropriate when each source represents a
    /// coherent domain (a collection of finance articles, a physics paper set,
    /// etc.) where same-source records are already confusable. If your sources are
    /// not meaningful domain boundaries, the fallback path (same split, any source)
    /// is the relevant escape hatch.
    WrongArticle,
    /// Choose a mismatched Q/A pair.
    QuestionAnswerMismatch,
}
275
276/// Top-level sampler configuration.
277#[derive(Clone, Debug)]
278pub struct SamplerConfig {
279 /// RNG seed that controls deterministic sampling order.
280 pub seed: u64,
281 /// Target number of samples per batch.
282 pub batch_size: usize,
283 /// Max number of records kept in the ingestion cache for candidate sampling.
284 ///
285 /// This is intentionally decoupled from `batch_size` so anchors/negatives can
286 /// be drawn from a broader rolling pool.
287 ///
288 /// Practical tuning: values above `batch_size` usually improve diversity and
289 /// reduce short-horizon repetition; gains taper off as source/recipe/split
290 /// constraints become the limiting factor. Higher values also increase memory.
291 ///
292 /// For remote shard-backed sources (for example Hugging Face), larger initial
293 /// targets may require fetching more shards before the first batch, so startup
294 /// latency can increase based on shard sizes and network throughput.
295 pub ingestion_max_records: usize,
296 /// Chunking behavior for long sections.
297 pub chunking: ChunkingStrategy,
298 /// Triplet recipes to use; empty means sources may provide defaults.
299 pub recipes: Vec<TripletRecipe>,
300 /// Text recipes to use; empty means derived from triplet recipes if available.
301 pub text_recipes: Vec<TextRecipe>,
302 /// Split ratios used when assigning records to train/val/test.
303 pub split: SplitRatios,
304 /// Splits allowed for sampling requests.
305 pub allowed_splits: Vec<SplitLabel>,
306}
307
308impl Default for SamplerConfig {
309 fn default() -> Self {
310 Self {
311 seed: 42,
312 batch_size: 128,
313 ingestion_max_records: 2048,
314 chunking: ChunkingStrategy::default(),
315 recipes: Vec::new(),
316 text_recipes: Vec::new(),
317 split: SplitRatios::default(),
318 allowed_splits: vec![SplitLabel::Train],
319 }
320 }
321}
322
323impl SamplerConfig {
324 /// Consuming builder to enable the built-in OCR/markdown denoiser on
325 /// the sampler's chunking strategy.
326 ///
327 /// Chains denoiser setup during `SamplerConfig` construction. Works with
328 /// struct update syntax to customize other fields at the same time:
329 ///
330 /// ```rust,no_run
331 /// use triplets_core::{SamplerConfig, config::DenoiserConfig};
332 ///
333 /// // Enable denoiser with all other fields at their defaults:
334 /// let config = SamplerConfig::default()
335 /// .with_denoiser(DenoiserConfig { enabled: true, ..DenoiserConfig::default() });
336 ///
337 /// // Or customize other fields first, then add the denoiser:
338 /// let config = SamplerConfig { batch_size: 32, ..SamplerConfig::default() }
339 /// .with_denoiser(DenoiserConfig { enabled: true, ..DenoiserConfig::default() });
340 /// ```
341 pub fn with_denoiser(mut self, config: DenoiserConfig) -> Self {
342 use crate::preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor;
343 self.chunking
344 .preprocessors
345 .push(Arc::new(DenoiserPreprocessor::new(config)));
346 self
347 }
348}
349
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the default values of the public `ChunkingStrategy` fields.
    #[test]
    fn chunking_strategy_defaults_are_stable() {
        let ChunkingStrategy {
            max_window_tokens,
            overlap_tokens,
            summary_fallback_weight,
            summary_fallback_tokens,
            chunk_weight_floor,
            ..
        } = ChunkingStrategy::default();

        assert_eq!(max_window_tokens, 1024);
        assert_eq!(overlap_tokens, vec![64]);
        assert_eq!(summary_fallback_weight, 0.35);
        assert_eq!(summary_fallback_tokens, 512);
        assert_eq!(chunk_weight_floor, 0.1);
    }

    /// Pins the default values of `SamplerConfig`, including the nested chunking
    /// window size.
    #[test]
    fn sampler_config_defaults_are_expected() {
        let defaults = SamplerConfig::default();

        assert_eq!(defaults.seed, 42);
        assert_eq!(defaults.batch_size, 128);
        assert_eq!(defaults.ingestion_max_records, 2048);
        assert!(defaults.recipes.is_empty());
        assert!(defaults.text_recipes.is_empty());
        assert_eq!(defaults.allowed_splits, vec![SplitLabel::Train]);
        assert_eq!(defaults.chunking.max_window_tokens, 1024);
    }

    /// Builds each `Selector` variant and checks that it holds the payload it was
    /// constructed with.
    #[test]
    fn selector_variants_can_be_constructed() {
        let by_role = Selector::Role(SectionRole::Anchor);
        let by_index = Selector::Paragraph(3);
        let by_offset = Selector::TemporalOffset(-2);
        let at_random = Selector::Random;

        assert_eq!(by_role, Selector::Role(SectionRole::Anchor));
        assert_eq!(by_index, Selector::Paragraph(3));
        assert_eq!(by_offset, Selector::TemporalOffset(-2));
        assert_eq!(at_random, Selector::Random);
    }

    /// Pins every field of the default `TripletRecipe`.
    #[test]
    fn triplet_recipe_default_is_expected() {
        let TripletRecipe {
            name,
            anchor,
            positive_selector,
            negative_selector,
            negative_strategy,
            weight,
            instruction,
            allow_same_anchor_positive,
        } = TripletRecipe::default();

        assert_eq!(name.as_ref(), "");
        assert_eq!(anchor, Selector::Random);
        assert_eq!(positive_selector, Selector::Random);
        assert_eq!(negative_selector, Selector::Random);
        assert!(matches!(negative_strategy, NegativeStrategy::WrongArticle));
        assert_eq!(weight, 1.0);
        assert!(instruction.is_none());
        assert!(!allow_same_anchor_positive);
    }
}