1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
use crate::metadata::MetadataKey;
use crate::splits::SplitLabel;
/// Names of the environment variables consulted at runtime to override defaults.
///
/// Centralizing the strings in one module gives every call site a single
/// spelling to reference and lets a reader discover the complete set of
/// supported overrides in one place.
pub mod env_vars {
    /// Opt-out switch for live network tests that require HF credentials.
    ///
    /// Setting it to any non-empty value makes those tests skip silently
    /// instead of panicking. Intended for CI jobs that run without secrets
    /// (for example pull requests opened from forks).
    pub const ENV_TRIPLETS_SKIP_LIVE_TESTS: &str = "TRIPLETS_SKIP_LIVE_TESTS";
}
/// Tunables consumed by the capacity-estimation heuristics.
pub mod heuristics {
    /// Number of positive examples assumed to be sampled per anchor when
    /// computing bounded estimates.
    pub const EFFECTIVE_POSITIVES_PER_ANCHOR: u128 = 1;

    /// Number of negative examples assumed to be sampled per anchor when
    /// computing bounded estimates.
    pub const EFFECTIVE_NEGATIVES_PER_ANCHOR: u128 = 4;
}
/// Shared constants for metadata key encoding and the canonical field names.
pub mod metadata {
    use super::MetadataKey;

    /// Delimiter between key and value in a serialized metadata entry,
    /// as in `date=2025-01-01`.
    pub const METADATA_DELIMITER: &str = "=";

    /// Canonical metadata key under which publication dates are stored.
    pub const META_FIELD_DATE: MetadataKey = MetadataKey::new("date");
}
/// Runtime tunables and string labels used by the sampler.
pub mod sampler {
    /// Upper bound on forced refresh retries once a sampling pass is exhausted.
    pub const EXHAUSTION_RETRY_LIMIT: usize = 2;

    /// Upper bound on attempts to build a valid anchor/positive pair from a
    /// single selector.
    pub const SAME_SELECTOR_PAIR_RETRY_LIMIT: usize = 8;

    /// Offset folded into the per-epoch RNG seed derivation so that epochs
    /// vary deterministically.
    pub const EPOCH_SEED_OFFSET: u64 = 0xB4C3_5EED;

    /// Label attached to weight maps of triplet recipes.
    pub const RECIPE_LABEL_TRIPLETS: &str = "triplet_recipes";

    /// Label attached to weight maps of text recipes.
    pub const RECIPE_LABEL_TEXT: &str = "text_recipes";

    /// Label marking the anchor-role section of a role-weight map.
    pub const ROLE_LABEL_ANCHOR: &str = "anchor";

    /// Label marking the context-role section of a role-weight map.
    pub const ROLE_LABEL_CONTEXT: &str = "context";

    /// Synthetic source id reported with prefetcher failures.
    pub const PREFETCHER_SOURCE_ID: &str = "prefetcher";

    /// Failure reason reported when a prefetcher worker stops unexpectedly.
    pub const PREFETCHER_STOPPED_REASON: &str = "prefetcher stopped";

    /// Negative-pair reason tag: the publication dates do not match.
    pub const NEG_REASON_WRONG_DATE: &str = "wrong_publication_date";

    /// Negative-pair reason tag: the article associations do not match.
    pub const NEG_REASON_WRONG_ARTICLE: &str = "wrong_article";

    /// Negative-pair reason tag: the question/answer pairing does not match.
    pub const NEG_REASON_WRONG_QA: &str = "wrong_qa_pairing";

    /// Name of the recipe that is injected automatically when long sections
    /// call for chunk-window pairing.
    ///
    /// Eligible sources receive this recipe during normal ingest sync whether
    /// or not custom triplet recipes have been configured.
    pub const AUTO_INJECTED_LONG_SECTION_CHUNK_PAIR_RECIPE_NAME: &str =
        "auto_injected_long_section_chunk_pair_wrong_article";

    /// Cap on the slot multiplier granted to the heaviest recipe when the
    /// shuffled selection order is built.
    ///
    /// With unequal recipe weights, each recipe is given slots in the shuffled
    /// order in proportion to `w / w_min`, `w_min` being the smallest positive
    /// weight. Capping that ratio keeps the order list bounded however extreme
    /// the weights are: a recipe 17× heavier than the lightest is treated as
    /// 16×, which is close enough in practice and rules out pathological list
    /// sizes.
    pub const RECIPE_ORDER_MAX_WEIGHT_MULTIPLIER: usize = 16;

    /// Mask for the anchor/positive swap coin-flip; the pair is swapped when
    /// `rng & mask == 0`.
    ///
    /// With a value of `1` only the least-significant bit is tested, which
    /// yields a uniform 50% swap rate. Swapping removes positional shortcuts —
    /// a model cannot learn that the first slot is always the "short" anchor —
    /// and that matters most for InfoNCE and similar contrastive objectives.
    pub const ANCHOR_POSITIVE_SWAP_MASK: u64 = 1;

    /// How many of the top-ranked BM25 hard negatives are rotated through per
    /// anchor before the rotation repeats.
    ///
    /// Selection window for each draw:
    /// - `top_k = min(BM25_HARD_NEGATIVE_ROTATION_TOP_K, ranked_pool.len())`
    /// - successive draws walk indices `0..top_k` in order.
    ///
    /// Cycling over the top-K keeps the negatives lexically hard while
    /// stopping repeated draws for one anchor from collapsing onto the single
    /// best-ranked document.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_HARD_NEGATIVE_ROTATION_TOP_K: usize = 3;

    /// How many top-ranked BM25 results are fetched for each anchor query.
    ///
    /// The value has to be large enough that split filtering still leaves at
    /// least `BM25_HARD_NEGATIVE_ROTATION_TOP_K` candidates from the same
    /// split. Increase it for heavily imbalanced splits or very small
    /// per-source pools.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_SEARCH_TOP_K: usize = 32;

    /// Cap on the number of whitespace-delimited tokens taken from the anchor
    /// window text when it serves as a BM25 query.
    ///
    /// BM25 search cost grows with the count of unique query tokens, because
    /// every token costs an inverted-index lookup plus score accumulation over
    /// its posting list. Limiting the token count keeps per-search latency
    /// predictable without a meaningful loss of quality: the leading tokens of
    /// a domain-specific window are typically the most distinctive.
    #[cfg(feature = "bm25-mining")]
    pub const BM25_QUERY_TOKEN_LIMIT: usize = 64;
}
/// Keys, version tags and markers used by split-store persistence and its
/// wire encoding.
pub mod splits {
    use super::SplitLabel;

    /// Version tag written with persisted epoch metadata payloads.
    pub const EPOCH_STATE_VERSION: u8 = 1;

    /// Version tag written with persisted sampler-state payloads.
    pub const SAMPLER_STATE_RECORD_VERSION: u8 = 1;

    /// Key under which sampler-state payloads are stored.
    pub const SAMPLER_STATE_KEY: &[u8] = b"sampler_state";

    /// Key under which the split store keeps its global metadata.
    pub const META_KEY: &[u8] = b"__meta__";

    /// Prefix of keys that hold split label assignments.
    pub const SPLIT_PREFIX: &[u8] = b"split:";

    /// Prefix of keys that hold per-split epoch metadata records.
    pub const EPOCH_META_PREFIX: &[u8] = b"epoch_meta:";

    /// Prefix of keys that hold per-split epoch hash-list records.
    pub const EPOCH_HASHES_PREFIX: &[u8] = b"epoch_hashes:";

    /// Marker byte written as a tombstone to clear persisted epoch hashes.
    pub const EPOCH_RECORD_TOMBSTONE: u8 = b'-';

    /// Report whether `id` follows the reserved `__*__` source-id pattern.
    ///
    /// Identifiers that both begin and end with a double underscore are set
    /// aside for internal synthetic/metadata use, so users may not register a
    /// source whose `id()` returns one.
    ///
    /// The leading and trailing pairs may not overlap: `"____"` is the
    /// shortest id that matches, while `"__"` and `"___"` do not.
    pub fn is_reserved_source_id(id: &str) -> bool {
        // Checking the suffix on what remains after the prefix is removed
        // enforces the four-byte minimum without a separate length test.
        matches!(id.strip_prefix("__"), Some(rest) if rest.ends_with("__"))
    }

    /// Version tag written with persisted epoch-meta records.
    pub const EPOCH_META_RECORD_VERSION: u8 = 1;

    /// Version tag written with persisted epoch-hash records.
    pub const EPOCH_HASH_RECORD_VERSION: u8 = 1;

    /// Leading marker byte of bitcode-encoded payloads.
    pub const BITCODE_PREFIX: u8 = b'B';

    /// Version tag checked for split-store metadata compatibility.
    pub const STORE_VERSION: u8 = 1;

    /// Canonical order in which splits are visited when all of them are
    /// stored or loaded.
    pub const ALL_SPLITS: [SplitLabel; 3] =
        [SplitLabel::Train, SplitLabel::Validation, SplitLabel::Test];
}
/// Keys and defaults for file-corpus indexing and the persisted index layout.
pub mod file_corpus {
    /// Key of the record holding the serialized file-index settings and entry
    /// count.
    pub const FILE_INDEX_META_KEY: &[u8] = b"meta";

    /// Prefix of keys that hold serialized file-index path records.
    pub const FILE_INDEX_PATH_KEY_PREFIX: &[u8] = b"idx:";

    /// Default name of the directory that holds the persisted file-index
    /// datastore.
    pub const FILE_INDEX_STORE_DIR: &str = "sampler_file_index";

    /// Chunk size of internal datastore reads during file-index lookups.
    ///
    /// It governs only the number of index keys fetched by one `batch_read`
    /// call while index metadata is scanned; it places **no** limit on
    /// sampler/training `batch_size` values.
    pub const FILE_INDEX_READ_BATCH: usize = 256;

    /// Message logged whenever an unreadable record is skipped.
    pub const SKIP_UNREADABLE_MSG: &str = "skipping unreadable file record";
}
/// Group names and filenames for the managed cache roots.
pub mod cache {
    /// Managed cache group that holds file-corpus index stores.
    pub const FILE_CORPUS_GROUP: &str = "triplets/file-corpus";

    /// Managed cache group that holds the multi-source demo's split-store
    /// persistence.
    pub const MULTI_SOURCE_DEMO_GROUP: &str = "triplets/multi-source-demo";

    /// Filename the demo app uses for its persisted split store.
    pub const MULTI_SOURCE_DEMO_STORE_FILENAME: &str = "split_store.bin";
}