// triplets-core 0.20.0-alpha

use crate::data::SectionRole;
use crate::splits::{SplitLabel, SplitRatios};
use std::borrow::Cow;
use std::sync::Arc;

/// Configuration for the OCR denoiser that filters digit-heavy text.
///
/// When enabled, text sections that are predominantly numerical (e.g. mangled OCR tables)
/// are stripped down to their alphabetical content on a line-by-line basis, or dropped
/// entirely when no alphabetical content remains.
///
/// This is plain data with public fields; the struct itself performs no validation.
/// It derives `PartialEq` so two configurations can be compared directly (for example
/// in tests). `Eq` is intentionally not derived because `max_digit_ratio` is a float.
#[derive(Clone, Debug, PartialEq)]
pub struct DenoiserConfig {
    /// Whether denoising is active. Defaults to `false` so existing behavior is unchanged.
    pub enabled: bool,
    /// Maximum ratio of digit characters to (digit + alphabetical) characters before a
    /// line is considered mangled OCR output. Range: `0.0`–`1.0`.
    ///
    /// A value of `0.35` means that if more than 35% of the alphanumeric characters on a
    /// line are digits, the line is treated as a mangled table row and stripped down to
    /// its alphabetical tokens.
    ///
    /// Defaults to `0.35`.
    pub max_digit_ratio: f32,
    /// Whether to strip common markdown formatting boundaries (e.g. pipe `|` table
    /// boundaries) and to drop layout-only separator rows such as `|---|---|`.
    ///
    /// Currently covers GFM tables; may expand to other structural markers in the future.
    /// Semantic text is preserved.
    ///
    /// Defaults to `true`.
    pub strip_markdown: bool,
}

impl Default for DenoiserConfig {
    /// Returns the opt-in baseline: denoising switched off, a `0.35` digit-ratio
    /// threshold, and markdown stripping switched on.
    fn default() -> Self {
        // Off by default, matching the field documentation ("existing behavior is
        // unchanged" until a caller opts in).
        let enabled = false;
        let max_digit_ratio = 0.35;
        let strip_markdown = true;

        Self {
            enabled,
            max_digit_ratio,
            strip_markdown,
        }
    }
}

/// Controls how long text sections are chunked and weighted.
///
/// `Default`, `Clone`, and `Debug` are implemented by hand in this module rather than
/// derived. Preprocessors are stored as `Arc`-wrapped trait objects, so cloning a
/// strategy shares the registered preprocessors with the original instead of
/// duplicating them, and the `Debug` output only reports how many are registered.
pub struct ChunkingStrategy {
    /// Max tokens per window when slicing a section into chunks.
    ///
    /// Defaults to `1024`.
    pub max_window_tokens: usize,
    /// Overlap sizes (in tokens) used when sliding windows across a section.
    ///
    /// Defaults to a single overlap of `64` tokens.
    pub overlap_tokens: Vec<usize>,
    /// Weight assigned to summary-fallback chunks (when generated).
    ///
    /// Defaults to `0.35`.
    pub summary_fallback_weight: f32,
    /// Max tokens for summary-fallback chunks (0 disables fallback chunks).
    ///
    /// Defaults to `512`.
    pub summary_fallback_tokens: usize,
    /// Floor applied to per-chunk weight after offset or summary fallback weighting.
    ///
    /// Defaults to `0.1`.
    pub chunk_weight_floor: f32,
    /// Pluggable text preprocessors applied in order before chunking.
    ///
    /// Crate-private: external callers append through `register_preprocessor` and read
    /// through `preprocessors()`. Empty by default.
    pub(crate) preprocessors: Vec<Arc<dyn crate::preprocessor::TextPreprocessor>>,
}

impl Default for ChunkingStrategy {
    /// Builds the baseline chunking setup: 1024-token windows with a single 64-token
    /// overlap, 512-token summary fallbacks weighted at `0.35`, a `0.1` weight floor,
    /// and no preprocessors registered.
    fn default() -> Self {
        // Window geometry.
        let max_window_tokens = 1024;
        let overlap_tokens = Vec::from([64]);

        // Summary-fallback sizing and weighting.
        let summary_fallback_weight = 0.35;
        let summary_fallback_tokens = 512;
        let chunk_weight_floor = 0.1;

        Self {
            max_window_tokens,
            overlap_tokens,
            summary_fallback_weight,
            summary_fallback_tokens,
            chunk_weight_floor,
            preprocessors: Vec::new(),
        }
    }
}

impl Clone for ChunkingStrategy {
    /// Produces an independent copy of the scalar settings and the overlap list.
    ///
    /// The preprocessor list is cloned element-wise, which clones each `Arc` handle:
    /// the copy refers to the same preprocessor objects as `self`.
    fn clone(&self) -> Self {
        // Exhaustive destructuring: adding a field to the struct without handling it
        // here becomes a compile error instead of a silently dropped value.
        let Self {
            max_window_tokens,
            overlap_tokens,
            summary_fallback_weight,
            summary_fallback_tokens,
            chunk_weight_floor,
            preprocessors,
        } = self;

        Self {
            max_window_tokens: *max_window_tokens,
            overlap_tokens: overlap_tokens.clone(),
            summary_fallback_weight: *summary_fallback_weight,
            summary_fallback_tokens: *summary_fallback_tokens,
            chunk_weight_floor: *chunk_weight_floor,
            preprocessors: preprocessors.clone(),
        }
    }
}

impl std::fmt::Debug for ChunkingStrategy {
    /// Formats the strategy like a derived `Debug` would, except that the
    /// `preprocessors` field is rendered as `"<count> registered"` instead of
    /// printing the trait objects themselves.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("ChunkingStrategy");

        out.field("max_window_tokens", &self.max_window_tokens);
        out.field("overlap_tokens", &self.overlap_tokens);
        out.field("summary_fallback_weight", &self.summary_fallback_weight);
        out.field("summary_fallback_tokens", &self.summary_fallback_tokens);
        out.field("chunk_weight_floor", &self.chunk_weight_floor);
        // Only the count is shown for the registered preprocessors.
        out.field(
            "preprocessors",
            &format_args!("{} registered", self.preprocessors.len()),
        );

        out.finish()
    }
}

impl ChunkingStrategy {
    /// Append a [`crate::preprocessor::TextPreprocessor`] to the pre-chunking pipeline.
    ///
    /// Preprocessors run in the order they were registered; if any returns `None`
    /// the section is dropped and produces no chunks.
    ///
    /// Returns `self` so several registrations can be chained.
    pub fn register_preprocessor(
        &mut self,
        p: impl crate::preprocessor::TextPreprocessor + 'static,
    ) -> &mut Self {
        // Erase the concrete type behind a shared trait object before storing it.
        let shared: Arc<dyn crate::preprocessor::TextPreprocessor> = Arc::new(p);
        self.preprocessors.push(shared);
        self
    }

    /// Borrow the registered preprocessors, in registration order.
    pub fn preprocessors(&self) -> &[Arc<dyn crate::preprocessor::TextPreprocessor>] {
        self.preprocessors.as_slice()
    }
}

/// Defines a triplet recipe (anchor/positive/negative selection + weighting).
///
/// ## Split-isolation contract
///
/// All three chunk slots (anchor, positive, negative) must resolve to records
/// whose IDs hash to the same split as the request split. The sampler enforces
/// this automatically for `Selector::Role`, `Selector::Paragraph`, and
/// `Selector::Random` — those selectors always read from the record that was
/// already confirmed to be in the correct split.
///
/// `Selector::TemporalOffset` crosses a record boundary (it picks a *different*
/// record by proximity in time) and the split check is re-applied inside
/// `select_temporal_neighbor`. No additional care is required on your side,
/// but you should be aware that in pools with few same-split neighbors the
/// selector will return `None` and fall back to skipping a slot rather than
/// contaminating splits.
///
/// ## Stable IDs
///
/// Record IDs must be stable across runs. Split assignment is derived
/// deterministically from the record ID and the sampler seed; changing an ID
/// changes its split assignment, which invalidates any persisted split state.
/// IDs should also be globally unique — if two records from different sources
/// share the same ID, only one will be kept in the sampler, and the discarded
/// record's split assignment silently goes with it.
///
/// ## Defaults
///
/// The `Default` implementation yields an unnamed recipe (`name` is the empty
/// string) that uses `Selector::Random` for all three slots,
/// `NegativeStrategy::WrongArticle`, a weight of `1.0`, no instruction, and
/// `allow_same_anchor_positive = false`. Use struct update syntax to override
/// individual fields.
#[derive(Clone, Debug)]
pub struct TripletRecipe {
    /// Unique name for this recipe.
    ///
    /// Defaults to the empty string, so set it explicitly when several recipes coexist.
    pub name: Cow<'static, str>,
    /// Selector used for anchor chunks.
    ///
    /// Note the field is named `anchor`, not `anchor_selector`, unlike its two siblings.
    pub anchor: Selector,
    /// Selector used for positive chunks (same record).
    pub positive_selector: Selector,
    /// Selector used for negative chunks (different record).
    pub negative_selector: Selector,
    /// Strategy used to pick negatives.
    pub negative_strategy: NegativeStrategy,
    /// Relative weight controlling how often this recipe is selected versus other recipes.
    ///
    /// Each recipe with a positive weight receives a number of slots in the shuffled selection
    /// order proportional to `weight / min_positive_weight` across all active recipes, so a
    /// recipe with `weight = 2.0` is drawn approximately twice as often as one with `weight =
    /// 1.0`.  The weight also scales the `weight` field on every [`crate::SampleTriplet`]
    /// returned by this recipe, which the caller's training loop can use for loss weighting.
    ///
    /// Recipes with `weight <= 0.0` are excluded from selection entirely and no samples
    /// are produced for them.
    ///
    /// Defaults to `1.0`.
    pub weight: f32,
    /// Optional instruction text attached to samples from this recipe.
    ///
    /// Defaults to `None`.
    pub instruction: Option<Cow<'static, str>>,
    /// Allow anchor and positive to carry identical text (SimCSE / dropout-trick mode).
    ///
    /// When `true`, the sampler will emit triplets even when the anchor and positive
    /// sections resolve to the same text.  This enables the unsupervised SimCSE
    /// training pattern: the same text string feeds both slots, and the model's
    /// dropout layers produce two slightly different embeddings at training time.
    ///
    /// Negatives are still required to differ from both anchor and positive.
    ///
    /// Defaults to `false`; set `true` only for recipes whose anchor and positive
    /// selectors intentionally resolve to the same content (e.g. text-only sources).
    pub allow_same_anchor_positive: bool,
}

impl Default for TripletRecipe {
    fn default() -> Self {
        Self {
            name: "".into(),
            anchor: Selector::Random,
            positive_selector: Selector::Random,
            negative_selector: Selector::Random,
            negative_strategy: NegativeStrategy::WrongArticle,
            weight: 1.0,
            instruction: None,
            allow_same_anchor_positive: false,
        }
    }
}

/// Selector for choosing a section or neighboring record.
///
/// Used by [`TripletRecipe`] for its anchor/positive/negative slots and by
/// [`TextRecipe`] for its single text slot. Selectors are comparable with `==`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Selector {
    /// Select a section by role.
    Role(SectionRole),
    /// Select a specific section by index.
    ///
    /// NOTE(review): whether the index is zero-based, and what happens when it is out
    /// of range, is decided by the sampler rather than by this type — confirm there.
    Paragraph(usize),
    /// Select a temporal neighbor record by offset days.
    ///
    /// The offset is signed, so both negative and positive values can be expressed.
    /// NOTE(review): which sign means "earlier" is not visible in this file — confirm
    /// against the sampler.
    ///
    /// Candidates are restricted to the same split as the requesting record;
    /// if no same-split neighbor exists the selector returns `None` for that
    /// slot rather than crossing split boundaries.
    TemporalOffset(i32),
    /// Select a random section.
    Random,
}

/// Defines how to build a text sample from a record.
///
/// This is the single-slot counterpart of [`TripletRecipe`]: one selector instead of
/// three, and no negative strategy. Unlike `TripletRecipe`, no `Default`
/// implementation is provided in this module, so every field has to be given when
/// constructing a value here.
#[derive(Clone, Debug)]
pub struct TextRecipe {
    /// Unique name for this recipe.
    pub name: Cow<'static, str>,
    /// Selector used for text chunks.
    pub selector: Selector,
    /// Relative weight controlling how often this recipe is selected versus other recipes.
    ///
    /// Each recipe with a positive weight receives a number of slots in the shuffled selection
    /// order proportional to `weight / min_positive_weight` across all active recipes, so a
    /// recipe with `weight = 2.0` is drawn approximately twice as often as one with `weight =
    /// 1.0`.  The weight also scales the `weight` field on every [`crate::TextSample`]
    /// returned by this recipe, which the caller's training loop can use for loss weighting.
    ///
    /// Recipes with `weight <= 0.0` are excluded from selection entirely and no samples
    /// are produced for them.
    pub weight: f32,
    /// Optional instruction text attached to samples from this recipe.
    ///
    /// `None` means no instruction is attached.
    pub instruction: Option<Cow<'static, str>>,
}

/// Strategy for picking the negative *record* in a triplet.
///
/// Each variant defines the candidate pool from which the negative record is drawn.
/// By default all variants scope candidates to the same source as the anchor, so
/// negatives are hard relative to the source domain rather than trivially
/// cross-domain. A same-split fallback engages automatically when the in-source
/// pool is too small (for example a source with only one record in the split).
///
/// When the `bm25-mining` feature is enabled, BM25 lexical re-ranking is applied
/// on top of the strategy-filtered pool — BM25 re-orders candidates by keyword
/// overlap with the anchor, but does not widen or replace the strategy pool.
/// BM25 is a first-pass lexical ranker, not a semantic one; it is well-suited for
/// lifting average negative quality without an encoder at data-generation time.
/// Semantic or embedding-based re-ranking (iterative hard-negative mining with the
/// trained encoder, cross-encoder scoring, dense retrieval) is out of scope for
/// the data pipeline and can be integrated by pre-ranking negatives before
/// ingestion or by reweighting source batches in the training loop.
///
/// The enum carries no payload and derives `PartialEq`/`Eq` (matching [`Selector`]),
/// so strategies can be compared with `==` and used in `assert_eq!`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum NegativeStrategy {
    /// Choose a record with a different publication date from record metadata.
    ///
    /// This refers to metadata/taxonomy publication-date values (for example
    /// `META_FIELD_DATE`), not filesystem timestamps like mtime/ctime/atime.
    WrongPublicationDate,
    /// Choose a different record from the same source.
    ///
    /// Negatives are drawn from within the same source, making them hard relative
    /// to the source domain. This is appropriate when each source represents a
    /// coherent domain (a collection of finance articles, a physics paper set,
    /// etc.) where same-source records are already confusable. If your sources are
    /// not meaningful domain boundaries, the fallback path (same split, any source)
    /// is the relevant escape hatch.
    WrongArticle,
    /// Choose a mismatched Q/A pair.
    QuestionAnswerMismatch,
}

/// Top-level sampler configuration.
///
/// Start from `SamplerConfig::default()` and override individual fields with struct
/// update syntax; `with_denoiser` can then be chained to add the built-in denoiser to
/// the chunking strategy.
#[derive(Clone, Debug)]
pub struct SamplerConfig {
    /// RNG seed that controls deterministic sampling order.
    ///
    /// Defaults to `42`.
    pub seed: u64,
    /// Target number of samples per batch.
    ///
    /// Defaults to `128`.
    pub batch_size: usize,
    /// Max number of records kept in the ingestion cache for candidate sampling.
    ///
    /// This is intentionally decoupled from `batch_size` so anchors/negatives can
    /// be drawn from a broader rolling pool.
    ///
    /// Practical tuning: values above `batch_size` usually improve diversity and
    /// reduce short-horizon repetition; gains taper off as source/recipe/split
    /// constraints become the limiting factor. Higher values also increase memory.
    ///
    /// For remote shard-backed sources (for example Hugging Face), larger initial
    /// targets may require fetching more shards before the first batch, so startup
    /// latency can increase based on shard sizes and network throughput.
    ///
    /// Defaults to `2048`.
    pub ingestion_max_records: usize,
    /// Chunking behavior for long sections.
    ///
    /// Defaults to `ChunkingStrategy::default()`.
    pub chunking: ChunkingStrategy,
    /// Triplet recipes to use; empty means sources may provide defaults.
    ///
    /// Empty by default.
    pub recipes: Vec<TripletRecipe>,
    /// Text recipes to use; empty means derived from triplet recipes if available.
    ///
    /// Empty by default.
    pub text_recipes: Vec<TextRecipe>,
    /// Split ratios used when assigning records to train/val/test.
    ///
    /// Defaults to `SplitRatios::default()`.
    pub split: SplitRatios,
    /// Splits allowed for sampling requests.
    ///
    /// Defaults to `[SplitLabel::Train]` only.
    pub allowed_splits: Vec<SplitLabel>,
}

impl Default for SamplerConfig {
    /// Returns the baseline sampler setup: seed `42`, batches of `128`, an ingestion
    /// cache of `2048` records, default chunking and split ratios, no explicit recipes,
    /// and sampling restricted to the train split.
    fn default() -> Self {
        // Scalar knobs.
        let seed = 42;
        let batch_size = 128;
        let ingestion_max_records = 2048;

        Self {
            seed,
            batch_size,
            ingestion_max_records,
            chunking: ChunkingStrategy::default(),
            // No recipes configured up front.
            recipes: Vec::new(),
            text_recipes: Vec::new(),
            split: SplitRatios::default(),
            allowed_splits: Vec::from([SplitLabel::Train]),
        }
    }
}

impl SamplerConfig {
    /// Consuming builder to enable the built-in OCR/markdown denoiser on
    /// the sampler's chunking strategy.
    ///
    /// Chains denoiser setup during `SamplerConfig` construction. Works with
    /// struct update syntax to customize other fields at the same time:
    ///
    /// ```rust,no_run
    /// use triplets_core::{SamplerConfig, config::DenoiserConfig};
    ///
    /// // Enable denoiser with all other fields at their defaults:
    /// let config = SamplerConfig::default()
    ///     .with_denoiser(DenoiserConfig { enabled: true, ..DenoiserConfig::default() });
    ///
    /// // Or customize other fields first, then add the denoiser:
    /// let config = SamplerConfig { batch_size: 32, ..SamplerConfig::default() }
    ///     .with_denoiser(DenoiserConfig { enabled: true, ..DenoiserConfig::default() });
    /// ```
    pub fn with_denoiser(mut self, config: DenoiserConfig) -> Self {
        use crate::preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor;
        self.chunking
            .preprocessors
            .push(Arc::new(DenoiserPreprocessor::new(config)));
        self
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the documented `DenoiserConfig` defaults (disabled, 0.35 ratio, markdown
    /// stripping on).
    #[test]
    fn denoiser_config_defaults_are_stable() {
        let cfg = DenoiserConfig::default();
        assert!(!cfg.enabled);
        assert_eq!(cfg.max_digit_ratio, 0.35);
        assert!(cfg.strip_markdown);
    }

    /// Pins every `ChunkingStrategy` default, including the empty preprocessor list.
    #[test]
    fn chunking_strategy_defaults_are_stable() {
        let cfg = ChunkingStrategy::default();
        assert_eq!(cfg.max_window_tokens, 1024);
        assert_eq!(cfg.overlap_tokens, vec![64]);
        assert_eq!(cfg.summary_fallback_weight, 0.35);
        assert_eq!(cfg.summary_fallback_tokens, 512);
        assert_eq!(cfg.chunk_weight_floor, 0.1);
        assert!(cfg.preprocessors().is_empty());
    }

    /// The `Clone` impl is hand-written, so guard against a field being forgotten.
    #[test]
    fn chunking_strategy_clone_preserves_every_field() {
        let cfg = SamplerConfig::default().with_denoiser(DenoiserConfig::default());
        let original = &cfg.chunking;
        let copy = original.clone();

        assert_eq!(copy.max_window_tokens, original.max_window_tokens);
        assert_eq!(copy.overlap_tokens, original.overlap_tokens);
        assert_eq!(copy.summary_fallback_weight, original.summary_fallback_weight);
        assert_eq!(copy.summary_fallback_tokens, original.summary_fallback_tokens);
        assert_eq!(copy.chunk_weight_floor, original.chunk_weight_floor);
        assert_eq!(copy.preprocessors().len(), original.preprocessors().len());
        assert_eq!(copy.preprocessors().len(), 1);
    }

    /// The `Debug` impl is hand-written and summarizes preprocessors by count.
    #[test]
    fn chunking_strategy_debug_reports_preprocessor_count() {
        let empty = format!("{:?}", ChunkingStrategy::default());
        assert!(empty.starts_with("ChunkingStrategy {"));
        assert!(empty.contains("max_window_tokens: 1024"));
        assert!(empty.contains("preprocessors: 0 registered"));

        let cfg = SamplerConfig::default().with_denoiser(DenoiserConfig::default());
        let one = format!("{:?}", cfg.chunking);
        assert!(one.contains("preprocessors: 1 registered"));
    }

    #[test]
    fn sampler_config_defaults_are_expected() {
        let cfg = SamplerConfig::default();
        assert_eq!(cfg.seed, 42);
        assert_eq!(cfg.batch_size, 128);
        assert_eq!(cfg.ingestion_max_records, 2048);
        assert!(cfg.recipes.is_empty());
        assert!(cfg.text_recipes.is_empty());
        assert_eq!(cfg.allowed_splits, vec![SplitLabel::Train]);
        assert_eq!(cfg.chunking.max_window_tokens, 1024);
    }

    /// `with_denoiser` must append exactly one preprocessor per call and leave the
    /// remaining configuration untouched.
    #[test]
    fn with_denoiser_registers_one_preprocessor_and_keeps_other_fields() {
        let base = SamplerConfig {
            batch_size: 32,
            ..SamplerConfig::default()
        };
        assert!(base.chunking.preprocessors().is_empty());

        let cfg = base.with_denoiser(DenoiserConfig {
            enabled: true,
            ..DenoiserConfig::default()
        });
        assert_eq!(cfg.batch_size, 32);
        assert_eq!(cfg.seed, 42);
        assert_eq!(cfg.chunking.preprocessors().len(), 1);

        let cfg = cfg.with_denoiser(DenoiserConfig::default());
        assert_eq!(cfg.chunking.preprocessors().len(), 2);
    }

    #[test]
    fn selector_variants_can_be_constructed() {
        let role = Selector::Role(SectionRole::Anchor);
        let paragraph = Selector::Paragraph(3);
        let temporal = Selector::TemporalOffset(-2);
        let random = Selector::Random;

        assert!(matches!(role, Selector::Role(SectionRole::Anchor)));
        assert!(matches!(paragraph, Selector::Paragraph(3)));
        assert!(matches!(temporal, Selector::TemporalOffset(-2)));
        assert!(matches!(random, Selector::Random));
    }

    #[test]
    fn triplet_recipe_default_is_expected() {
        let recipe = TripletRecipe::default();
        assert_eq!(recipe.name.as_ref(), "");
        assert!(matches!(recipe.anchor, Selector::Random));
        assert!(matches!(recipe.positive_selector, Selector::Random));
        assert!(matches!(recipe.negative_selector, Selector::Random));
        assert!(matches!(
            recipe.negative_strategy,
            NegativeStrategy::WrongArticle
        ));
        assert_eq!(recipe.weight, 1.0);
        assert!(recipe.instruction.is_none());
        assert!(!recipe.allow_same_anchor_positive);
    }
}