// triplets_core/constants.rs

1use crate::metadata::MetadataKey;
2use crate::splits::SplitLabel;
3
/// Environment variable names read at runtime to override default behavior.
///
/// Keeping the strings here ensures every call site references the same name
/// and makes the full set of supported overrides easy to discover.
pub mod env_vars {
    /// Name of the environment variable that turns off live network tests.
    ///
    /// When set to any non-empty value, live network tests that require HF
    /// credentials will skip silently rather than panicking. Intended for
    /// CI jobs that run without secrets (e.g. fork pull requests).
    pub const ENV_TRIPLETS_SKIP_LIVE_TESTS: &str = "TRIPLETS_SKIP_LIVE_TESTS";
}
14
/// Constants used by capacity estimation heuristics.
pub mod heuristics {
    /// Effective number of positive examples sampled per anchor during
    /// bounded estimates.
    pub const EFFECTIVE_POSITIVES_PER_ANCHOR: u128 = 1;

    /// Effective number of negative examples sampled per anchor during
    /// bounded estimates.
    pub const EFFECTIVE_NEGATIVES_PER_ANCHOR: u128 = 4;
}
22
23/// Constants used by metadata key encoding and canonical fields.
24pub mod metadata {
25    use super::MetadataKey;
26
27    /// Separator used for serialized metadata entries (for example `date=2025-01-01`).
28    pub const METADATA_DELIMITER: &str = "=";
29    /// Canonical metadata field key used for publication dates.
30    pub const META_FIELD_DATE: MetadataKey = MetadataKey::new("date");
31}
32
/// Constants used by sampler runtime behavior and labeling.
pub mod sampler {
    /// Maximum number of forced refresh retries after an exhausted sampling pass.
    pub const EXHAUSTION_RETRY_LIMIT: usize = 2;

    /// Retry cap when trying to produce a valid anchor/positive pair from the
    /// same selector.
    pub const SAME_SELECTOR_PAIR_RETRY_LIMIT: usize = 8;

    /// Offset mixed into epoch RNG seed derivation for deterministic variation.
    pub const EPOCH_SEED_OFFSET: u64 = 0xB4C3_5EED;

    /// Label used for triplet recipe weight maps.
    pub const RECIPE_LABEL_TRIPLETS: &str = "triplet_recipes";

    /// Label used for text recipe weight maps.
    pub const RECIPE_LABEL_TEXT: &str = "text_recipes";

    /// Label identifying anchor-role sections in role-weight maps.
    pub const ROLE_LABEL_ANCHOR: &str = "anchor";

    /// Label identifying context-role sections in role-weight maps.
    pub const ROLE_LABEL_CONTEXT: &str = "context";

    /// Synthetic source id used in prefetcher failure reporting.
    pub const PREFETCHER_SOURCE_ID: &str = "prefetcher";

    /// Failure reason emitted when a prefetcher worker stops unexpectedly.
    pub const PREFETCHER_STOPPED_REASON: &str = "prefetcher stopped";

    /// Negative-pair reason tag for mismatched publication dates.
    pub const NEG_REASON_WRONG_DATE: &str = "wrong_publication_date";

    /// Negative-pair reason tag for mismatched article associations.
    pub const NEG_REASON_WRONG_ARTICLE: &str = "wrong_article";

    /// Negative-pair reason tag for mismatched question/answer pairings.
    pub const NEG_REASON_WRONG_QA: &str = "wrong_qa_pairing";

    /// Recipe name auto-injected when long sections require chunk-window pairing.
    ///
    /// This recipe is appended for eligible sources during normal ingest sync,
    /// regardless of whether custom triplet recipes are configured.
    pub const AUTO_INJECTED_LONG_SECTION_CHUNK_PAIR_RECIPE_NAME: &str =
        "auto_injected_long_section_chunk_pair_wrong_article";

    /// Maximum slot multiplier applied to the highest-weighted recipe when
    /// building the shuffled selection order.
    ///
    /// When recipe weights differ, each recipe receives a number of slots in the
    /// shuffled order proportional to `w / w_min`, where `w_min` is the smallest
    /// positive weight. This value caps that multiplier to keep the order list
    /// bounded regardless of how extreme the weight ratios are. A recipe 17×
    /// heavier than the lightest one is treated as 16× — close enough for
    /// practical use and prevents pathological list sizes.
    pub const RECIPE_ORDER_MAX_WEIGHT_MULTIPLIER: usize = 16;

    /// Bit mask applied to an RNG draw for the anchor/positive swap coin-flip
    /// (swap when `rng & mask == 0`).
    ///
    /// A value of `1` means the least-significant bit is tested, giving a
    /// uniform 50% swap rate. This eliminates positional shortcuts — e.g. a
    /// model cannot learn to always treat the first slot as the "short" anchor —
    /// which is especially important for InfoNCE and similar contrastive
    /// objectives.
    pub const ANCHOR_POSITIVE_SWAP_MASK: u64 = 1;

    /// Number of highest-ranked BM25 hard negatives rotated per anchor before
    /// repeating.
    ///
    /// Effective selection window per draw:
    /// - `top_k = min(BM25_HARD_NEGATIVE_ROTATION_TOP_K, ranked_pool.len())`
    /// - draws cycle over indices `0..top_k` in order.
    ///
    /// Rotating through the top-K preserves lexical hardness while preventing
    /// collapse to the single top-ranked document on repeated draws for the
    /// same anchor.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_HARD_NEGATIVE_ROTATION_TOP_K: usize = 3;

    /// Number of top-ranked BM25 results to retrieve per anchor query.
    ///
    /// Must be large enough that, after split filtering, at least
    /// `BM25_HARD_NEGATIVE_ROTATION_TOP_K` same-split candidates remain.
    /// Raise this value if you use a heavily imbalanced split or a very small
    /// per-source pool.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_SEARCH_TOP_K: usize = 32;

    /// Maximum number of whitespace-delimited tokens taken from the anchor
    /// window text when used as a BM25 query.
    ///
    /// BM25 search cost scales with the number of unique query tokens — each
    /// token triggers an inverted-index lookup and score accumulation over its
    /// posting list. Capping the query token count keeps per-search latency
    /// predictable without meaningful quality loss: the leading tokens of a
    /// domain-specific window are typically the most distinctive.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_QUERY_TOKEN_LIMIT: usize = 64;
}
111
112/// Constants used by split-store persistence and wire encoding.
113pub mod splits {
114    use super::SplitLabel;
115
116    /// Version tag for persisted epoch metadata payloads.
117    pub const EPOCH_STATE_VERSION: u8 = 1;
118    /// Version tag for persisted sampler-state payloads.
119    pub const SAMPLER_STATE_RECORD_VERSION: u8 = 1;
120    /// Key used for storing sampler-state payloads.
121    pub const SAMPLER_STATE_KEY: &[u8] = b"sampler_state";
122
123    /// Key used for split-store global metadata.
124    pub const META_KEY: &[u8] = b"__meta__";
125    /// Key prefix for split label assignments.
126    pub const SPLIT_PREFIX: &[u8] = b"split:";
127    /// Key prefix for per-split epoch metadata records.
128    pub const EPOCH_META_PREFIX: &[u8] = b"epoch_meta:";
129    /// Key prefix for per-split epoch hash-list records.
130    pub const EPOCH_HASHES_PREFIX: &[u8] = b"epoch_hashes:";
131    /// Tombstone marker byte for clearing persisted epoch hashes.
132    pub const EPOCH_RECORD_TOMBSTONE: u8 = b'-';
133
134    /// Check whether a source id matches the `__*__` reserved pattern.
135    ///
136    /// Source identifiers starting and ending with double underscores are
137    /// reserved for internal synthetic/metadata use.
138    /// Users may not register sources whose `id()` returns such a value.
139    pub fn is_reserved_source_id(id: &str) -> bool {
140        id.len() >= 4 && id.starts_with("__") && id.ends_with("__")
141    }
142    /// Version tag for persisted epoch-meta records.
143    pub const EPOCH_META_RECORD_VERSION: u8 = 1;
144    /// Version tag for persisted epoch-hash records.
145    pub const EPOCH_HASH_RECORD_VERSION: u8 = 1;
146    /// Prefix marker for bitcode-encoded payloads.
147    pub const BITCODE_PREFIX: u8 = b'B';
148    /// Version tag for split-store metadata compatibility checks.
149    pub const STORE_VERSION: u8 = 1;
150    /// Canonical split iteration order used when storing/loading all splits.
151    pub const ALL_SPLITS: [SplitLabel; 3] =
152        [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test];
153}
154
/// Constants used by file-corpus indexing and persisted index layout.
pub mod file_corpus {
    /// Metadata key for serialized file-index settings and entry count.
    pub const FILE_INDEX_META_KEY: &[u8] = b"meta";

    /// Prefix for serialized file-index path records.
    pub const FILE_INDEX_PATH_KEY_PREFIX: &[u8] = b"idx:";

    /// Default directory name for persisted file-index datastore.
    pub const FILE_INDEX_STORE_DIR: &str = "sampler_file_index";

    /// Internal datastore read chunk size for file-index lookups.
    ///
    /// This only controls how many index keys are fetched per `batch_read`
    /// call when scanning index metadata; it does **not** cap
    /// sampler/training `batch_size` values.
    pub const FILE_INDEX_READ_BATCH: usize = 256;

    /// Log message used when unreadable records are skipped.
    pub const SKIP_UNREADABLE_MSG: &str = "skipping unreadable file record";
}
172
/// Constants used for managed cache-root groups.
pub mod cache {
    /// Managed cache group for file-corpus index stores.
    pub const FILE_CORPUS_GROUP: &str = "triplets/file-corpus";

    /// Managed cache group for multi-source demo split-store persistence.
    pub const MULTI_SOURCE_DEMO_GROUP: &str = "triplets/multi-source-demo";

    /// Filename used by the demo app split-store persistence.
    pub const MULTI_SOURCE_DEMO_STORE_FILENAME: &str = "split_store.bin";
}
// End of triplets_core/constants.rs