triplets_core/constants.rs
1use crate::metadata::MetadataKey;
2use crate::splits::SplitLabel;
3
/// Names of the environment variables consulted at runtime to override defaults.
///
/// Centralizing the strings here guarantees that every call site refers to the
/// same name, and puts the complete list of supported overrides in one place.
pub mod env_vars {
    /// Setting this variable to any non-empty value makes live network tests
    /// that need HF credentials skip silently instead of panicking. It is meant
    /// for CI jobs that run without secrets, such as fork pull requests.
    pub const ENV_TRIPLETS_SKIP_LIVE_TESTS: &str = "TRIPLETS_SKIP_LIVE_TESTS";
}
14
/// Tuning values consumed by the capacity estimation heuristics.
pub mod heuristics {
    /// How many positive examples are effectively sampled for each anchor when
    /// computing bounded estimates.
    pub const EFFECTIVE_POSITIVES_PER_ANCHOR: u128 = 1;
    /// How many negative examples are effectively sampled for each anchor when
    /// computing bounded estimates.
    pub const EFFECTIVE_NEGATIVES_PER_ANCHOR: u128 = 4;
}
22
23/// Constants used by metadata key encoding and canonical fields.
24pub mod metadata {
25 use super::MetadataKey;
26
27 /// Separator used for serialized metadata entries (for example `date=2025-01-01`).
28 pub const METADATA_DELIMITER: &str = "=";
29 /// Canonical metadata field key used for publication dates.
30 pub const META_FIELD_DATE: MetadataKey = MetadataKey::new("date");
31}
32
/// Values that govern sampler runtime behavior and the labels it emits.
pub mod sampler {
    /// Upper bound on forced refresh retries once a sampling pass has been exhausted.
    pub const EXHAUSTION_RETRY_LIMIT: usize = 2;
    /// Upper bound on attempts to build a valid anchor/positive pair out of the
    /// same selector.
    pub const SAME_SELECTOR_PAIR_RETRY_LIMIT: usize = 8;
    /// Offset folded into the derivation of the per-epoch RNG seed so that
    /// variation stays deterministic.
    pub const EPOCH_SEED_OFFSET: u64 = 0xB4C3_5EED;
    /// Label attached to weight maps for triplet recipes.
    pub const RECIPE_LABEL_TRIPLETS: &str = "triplet_recipes";
    /// Label attached to weight maps for text recipes.
    pub const RECIPE_LABEL_TEXT: &str = "text_recipes";
    /// Label marking the anchor-role sections of a role-weight map.
    pub const ROLE_LABEL_ANCHOR: &str = "anchor";
    /// Label marking the context-role sections of a role-weight map.
    pub const ROLE_LABEL_CONTEXT: &str = "context";
    /// Synthetic source id reported with prefetcher failures.
    pub const PREFETCHER_SOURCE_ID: &str = "prefetcher";
    /// Failure reason reported when a prefetcher worker stops unexpectedly.
    pub const PREFETCHER_STOPPED_REASON: &str = "prefetcher stopped";
    /// Reason tag for negative pairs whose publication dates do not match.
    pub const NEG_REASON_WRONG_DATE: &str = "wrong_publication_date";
    /// Reason tag for negative pairs whose article associations do not match.
    pub const NEG_REASON_WRONG_ARTICLE: &str = "wrong_article";
    /// Reason tag for negative pairs whose question/answer pairing does not match.
    pub const NEG_REASON_WRONG_QA: &str = "wrong_qa_pairing";
    /// Name of the recipe that is injected automatically when long sections
    /// need chunk-window pairing.
    ///
    /// During normal ingest sync this recipe is appended for eligible sources
    /// whether or not custom triplet recipes have been configured.
    pub const AUTO_INJECTED_LONG_SECTION_CHUNK_PAIR_RECIPE_NAME: &str =
        "auto_injected_long_section_chunk_pair_wrong_article";
    /// Cap on the slot multiplier given to the highest-weighted recipe while the
    /// shuffled selection order is built.
    ///
    /// When recipe weights differ, a recipe is allotted slots in the shuffled
    /// order in proportion to `w / w_min`, with `w_min` being the smallest
    /// positive weight. Capping the multiplier keeps the order list bounded no
    /// matter how extreme the weight ratios get: a recipe 17× heavier than the
    /// lightest one is treated as 16×, which is close enough in practice and
    /// avoids pathological list sizes.
    pub const RECIPE_ORDER_MAX_WEIGHT_MULTIPLIER: usize = 16;
    /// Bit mask for the anchor/positive swap coin-flip; the pair is swapped when
    /// `rng & mask == 0`.
    ///
    /// With a value of `1` only the least-significant bit is tested, which
    /// yields a uniform 50% swap rate. That removes positional shortcuts (for
    /// example, a model learning to always treat the first slot as the "short"
    /// anchor), which matters especially for InfoNCE and similar contrastive
    /// objectives.
    pub const ANCHOR_POSITIVE_SWAP_MASK: u64 = 1;
    /// How many of the highest-ranked BM25 hard negatives are rotated through
    /// per anchor before the rotation repeats.
    ///
    /// Effective selection window for each draw:
    /// - `top_k = min(BM25_HARD_NEGATIVE_ROTATION_TOP_K, ranked_pool.len())`
    /// - draws walk the indices `0..top_k` in order, cycling.
    ///
    /// Rotating over the top-K keeps lexical hardness while avoiding collapse
    /// onto the single top-ranked document when the same anchor is drawn
    /// repeatedly.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_HARD_NEGATIVE_ROTATION_TOP_K: usize = 3;

    /// How many top-ranked BM25 results are retrieved for each anchor query.
    ///
    /// This has to be large enough that at least
    /// `BM25_HARD_NEGATIVE_ROTATION_TOP_K` same-split candidates survive split
    /// filtering. Increase it when working with a heavily imbalanced split or a
    /// very small per-source pool.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_SEARCH_TOP_K: usize = 32;

    /// Cap on the number of whitespace-delimited tokens taken from the anchor
    /// window text when that text serves as a BM25 query.
    ///
    /// BM25 search cost grows with the number of unique query tokens, since
    /// every token costs an inverted-index lookup plus score accumulation over
    /// its posting list. Limiting the token count keeps per-search latency
    /// predictable without a meaningful loss in quality, because the leading
    /// tokens of a domain-specific window are typically the most distinctive.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_QUERY_TOKEN_LIMIT: usize = 64;
}
111
112/// Constants used by split-store persistence and wire encoding.
113pub mod splits {
114 use super::SplitLabel;
115
116 /// Version tag for persisted epoch metadata payloads.
117 pub const EPOCH_STATE_VERSION: u8 = 1;
118 /// Version tag for persisted sampler-state payloads.
119 pub const SAMPLER_STATE_RECORD_VERSION: u8 = 1;
120 /// Key used for storing sampler-state payloads.
121 pub const SAMPLER_STATE_KEY: &[u8] = b"sampler_state";
122
123 /// Key used for split-store global metadata.
124 pub const META_KEY: &[u8] = b"__meta__";
125 /// Key prefix for split label assignments.
126 pub const SPLIT_PREFIX: &[u8] = b"split:";
127 /// Key prefix for per-split epoch metadata records.
128 pub const EPOCH_META_PREFIX: &[u8] = b"epoch_meta:";
129 /// Key prefix for per-split epoch hash-list records.
130 pub const EPOCH_HASHES_PREFIX: &[u8] = b"epoch_hashes:";
131 /// Tombstone marker byte for clearing persisted epoch hashes.
132 pub const EPOCH_RECORD_TOMBSTONE: u8 = b'-';
133
134 /// Check whether a source id matches the `__*__` reserved pattern.
135 ///
136 /// Source identifiers starting and ending with double underscores are
137 /// reserved for internal synthetic/metadata use.
138 /// Users may not register sources whose `id()` returns such a value.
139 pub fn is_reserved_source_id(id: &str) -> bool {
140 id.len() >= 4 && id.starts_with("__") && id.ends_with("__")
141 }
142 /// Version tag for persisted epoch-meta records.
143 pub const EPOCH_META_RECORD_VERSION: u8 = 1;
144 /// Version tag for persisted epoch-hash records.
145 pub const EPOCH_HASH_RECORD_VERSION: u8 = 1;
146 /// Prefix marker for bitcode-encoded payloads.
147 pub const BITCODE_PREFIX: u8 = b'B';
148 /// Version tag for split-store metadata compatibility checks.
149 pub const STORE_VERSION: u8 = 1;
150 /// Canonical split iteration order used when storing/loading all splits.
151 pub const ALL_SPLITS: [SplitLabel; 3] =
152 [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test];
153}
154
/// Values used by file-corpus indexing and the layout of its persisted index.
pub mod file_corpus {
    /// Default name of the directory that holds the persisted file-index datastore.
    pub const FILE_INDEX_STORE_DIR: &str = "sampler_file_index";
    /// Key under which the serialized file-index settings and entry count are stored.
    pub const FILE_INDEX_META_KEY: &[u8] = b"meta";
    /// Prefix of the keys that hold serialized file-index path records.
    pub const FILE_INDEX_PATH_KEY_PREFIX: &[u8] = b"idx:";
    /// Chunk size for internal datastore reads during file-index lookups.
    ///
    /// It only sets how many index keys a single `batch_read` call fetches
    /// while index metadata is scanned; it does **not** limit sampler/training
    /// `batch_size` values.
    pub const FILE_INDEX_READ_BATCH: usize = 256;
    /// Message logged when an unreadable record is skipped.
    pub const SKIP_UNREADABLE_MSG: &str = "skipping unreadable file record";
}
172
/// Names of the managed cache-root groups and the files stored in them.
pub mod cache {
    /// Managed cache group that holds file-corpus index stores.
    pub const FILE_CORPUS_GROUP: &str = "triplets/file-corpus";
    /// Managed cache group that holds the multi-source demo's split-store persistence.
    pub const MULTI_SOURCE_DEMO_GROUP: &str = "triplets/multi-source-demo";
    /// Name of the file the demo app uses for split-store persistence.
    pub const MULTI_SOURCE_DEMO_STORE_FILENAME: &str = "split_store.bin";
}