triplets_core/data.rs
1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4
5use crate::kvp::KvpPrefixSampler;
6
7pub use crate::types::{RecordId, Sentence, SourceId, TaxonomyValue};
8
9/// Trust/quality metadata for a record.
10#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
11pub struct QualityScore {
12 /// Normalized 0-1 trust measure combining provenance, recency, and manual reviews.
13 pub trust: f32,
14}
15
16impl Default for QualityScore {
17 fn default() -> Self {
18 Self {
19 // Assume medium trust by default, allowing recipes to upweight or downweight based on other signals.
20 trust: 0.5,
21 }
22 }
23}
24
25/// Canonical record payload produced by a DataSource.
26#[derive(Clone, Debug, Serialize, Deserialize)]
27pub struct DataRecord {
28 /// Stable record identifier (used for splits and determinism).
29 pub id: RecordId,
30 /// Source identifier that produced this record.
31 pub source: SourceId,
32 /// Canonical creation time for the record (used for ordering/metadata).
33 pub created_at: DateTime<Utc>,
34 /// Last update time for the record (used for refresh decisions).
35 pub updated_at: DateTime<Utc>,
36 /// Trust/quality score used to weight sampling.
37 pub quality: QualityScore,
38 /// Free-form tags (e.g., source id, year, date) used for filtering/recipes.
39 pub taxonomy: Vec<TaxonomyValue>,
40 /// Structured content sections used by sampling recipes.
41 pub sections: Vec<RecordSection>,
42 /// Optional metadata prefix policy for KVP sampling (key-value headers injected into text).
43 #[serde(default, skip_serializing_if = "Option::is_none")]
44 pub meta_prefix: Option<KvpPrefixSampler>,
45}
46
47impl DataRecord {
48 /// Create a record with a single [`SectionRole::Context`] section from a plain text string.
49 ///
50 /// The `id` and `source` are set to the same value. Use [`DataRecord::from_text_with_role`]
51 /// to assign a different role, or construct the struct directly for full control.
52 ///
53 /// # Example
54 ///
55 /// ```
56 /// use triplets_core::DataRecord;
57 ///
58 /// let record = DataRecord::from_text("doc-0", "my_corpus", "The quick brown fox.");
59 /// assert_eq!(record.id.as_str(), "doc-0");
60 /// assert_eq!(record.sections[0].text, "The quick brown fox.");
61 /// ```
62 pub fn from_text(
63 id: impl Into<crate::types::RecordId>,
64 source: impl Into<crate::types::SourceId>,
65 text: impl Into<String>,
66 ) -> Self {
67 Self::from_text_with_role(id, source, text, SectionRole::Context)
68 }
69
70 /// Create a record with a single section of the given role from a plain text string.
71 ///
72 /// # Example
73 ///
74 /// ```
75 /// use triplets_core::{DataRecord, SectionRole};
76 ///
77 /// let record = DataRecord::from_text_with_role(
78 /// "doc-0", "my_corpus", "What is the capital of France?", SectionRole::Anchor,
79 /// );
80 /// assert_eq!(record.sections[0].role, SectionRole::Anchor);
81 /// ```
82 pub fn from_text_with_role(
83 id: impl Into<crate::types::RecordId>,
84 source: impl Into<crate::types::SourceId>,
85 text: impl Into<String>,
86 role: SectionRole,
87 ) -> Self {
88 let now = chrono::Utc::now();
89 Self {
90 id: id.into(),
91 source: source.into(),
92 created_at: now,
93 updated_at: now,
94 quality: QualityScore::default(),
95 taxonomy: vec![],
96 sections: vec![RecordSection {
97 role,
98 heading: None,
99 text: text.into(),
100 sentences: vec![],
101 }],
102 meta_prefix: None,
103 }
104 }
105}
106
107/// A structured section within a record.
108#[derive(Clone, Debug, Serialize, Deserialize)]
109pub struct RecordSection {
110 /// Semantic role used by selectors (for example, anchor vs context text).
111 pub role: SectionRole,
112 /// Optional short heading/title for this section.
113 pub heading: Option<String>,
114 /// Full section text.
115 pub text: String,
116 /// Sentence-level segmentation of `text` used by chunking strategies.
117 pub sentences: Vec<Sentence>,
118}
119
120/// Role label for a section.
121#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
122pub enum SectionRole {
123 /// Primary section typically used as an anchor candidate.
124 Anchor,
125 /// Supporting/context section used for positives, negatives, or text samples.
126 Context,
127}
128
129/// A chunked view over a section.
130#[derive(Clone, Debug, Serialize, Deserialize)]
131pub struct RecordChunk {
132 /// Parent record id this chunk belongs to.
133 pub record_id: RecordId,
134 /// Index of the source section in `DataRecord.sections`.
135 pub section_idx: usize,
136 /// Chunk view metadata (window position or summary fallback).
137 pub view: ChunkView,
138 /// Rendered chunk text (possibly with metadata prefix decoration).
139 pub text: String,
140 /// Approximate token count for scheduling/weighting heuristics.
141 pub tokens_estimate: usize,
142 /// Trust/quality inherited from the parent record.
143 pub quality: QualityScore,
144 /// All KVP metadata defined on the source record's `meta_prefix`, exposed for
145 /// downstream inspection and debugging. Contains every key with all its possible
146 /// values across all variants — unaffected by presence probability, dropout, or
147 /// which variant was sampled into this chunk's text.
148 ///
149 /// Populated unconditionally by the sampler during chunk decoration. Empty when the
150 /// record has no `meta_prefix` configured.
151 #[serde(default, skip_serializing_if = "HashMap::is_empty")]
152 pub kvp_meta: HashMap<String, Vec<String>>,
153}
154
155/// Chunk view metadata (window or summary).
156#[derive(Clone, Debug, Serialize, Deserialize)]
157pub enum ChunkView {
158 /// Sliding-window chunk extracted directly from section text.
159 Window {
160 /// Zero-based window index within the section.
161 index: usize,
162 /// Overlap (in tokens) with the previous window.
163 overlap: usize,
164 /// Nominal window span in tokens.
165 span: usize,
166 },
167 /// Summary fallback chunk used when window extraction is unavailable.
168 SummaryFallback {
169 /// Name of summary strategy that produced this fallback chunk.
170 strategy: String,
171 /// Precomputed base weight for summary-fallback chunks before trust/floor are applied.
172 weight: f32,
173 },
174}
175
176/// Sample pair (positive/negative) derived from a triplet.
177#[derive(Clone, Debug, Serialize, Deserialize)]
178pub struct SamplePair {
179 /// Recipe name used to generate this pair.
180 pub recipe: String,
181 /// Anchor chunk used to build this supervised pair.
182 pub anchor: RecordChunk,
183 /// Candidate chunk paired with the anchor.
184 pub positive: RecordChunk,
185 /// Training weight for this pair.
186 pub weight: f32,
187 /// Optional instruction/prompt hint for this sample.
188 pub instruction: Option<String>,
189 /// Supervision label (positive or negative).
190 pub label: PairLabel,
191 /// Optional reason/annotation describing the label.
192 pub reason: Option<String>,
193}
194
195/// Sample triplet (anchor/positive/negative).
196#[derive(Clone, Debug, Serialize, Deserialize)]
197pub struct SampleTriplet {
198 /// Recipe name used to generate this triplet.
199 pub recipe: String,
200 /// Anchor chunk.
201 pub anchor: RecordChunk,
202 /// Positive chunk.
203 pub positive: RecordChunk,
204 /// Negative chunk.
205 pub negative: RecordChunk,
206 /// Training weight for this triplet.
207 pub weight: f32,
208 /// Optional instruction/prompt hint for this sample.
209 pub instruction: Option<String>,
210}
211
212/// Pair label for supervised pair batches.
213#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
214pub enum PairLabel {
215 /// Anchor and candidate are semantically aligned.
216 Positive,
217 /// Anchor and candidate are semantically mismatched.
218 Negative,
219}
220
221/// Batch of pairs.
222#[derive(Clone, Debug, Serialize, Deserialize)]
223pub struct SampleBatch {
224 /// Pair samples contained in this batch.
225 pub pairs: Vec<SamplePair>,
226}
227
228impl SampleBatch {
229 /// Returns `true` when the batch has no pairs.
230 pub fn is_empty(&self) -> bool {
231 self.pairs.is_empty()
232 }
233}
234
235/// Batch of triplets.
236#[derive(Clone, Debug, Serialize, Deserialize)]
237pub struct TripletBatch {
238 /// Triplet samples contained in this batch.
239 pub triplets: Vec<SampleTriplet>,
240}
241
242impl TripletBatch {
243 /// Returns `true` when the batch has no triplets.
244 pub fn is_empty(&self) -> bool {
245 self.triplets.is_empty()
246 }
247}
248
249/// A single text sample (chunk + weight).
250#[derive(Clone, Debug, Serialize, Deserialize)]
251pub struct TextSample {
252 /// Recipe name used to generate this sample.
253 pub recipe: String,
254 /// Chunk payload used for this text sample.
255 pub chunk: RecordChunk,
256 /// Training weight for this sample.
257 pub weight: f32,
258 /// Optional instruction/prompt hint for this sample.
259 pub instruction: Option<String>,
260}
261
262/// Batch of text samples.
263#[derive(Clone, Debug, Serialize, Deserialize)]
264pub struct TextBatch {
265 /// Text samples contained in this batch.
266 pub samples: Vec<TextSample>,
267}
268
269impl TextBatch {
270 /// Returns `true` when the batch has no text samples.
271 pub fn is_empty(&self) -> bool {
272 self.samples.is_empty()
273 }
274}
275
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{TimeZone, Utc};

    /// Builds a minimal summary-fallback chunk whose `record_id` is `id`.
    fn sample_chunk(id: &str) -> RecordChunk {
        RecordChunk {
            record_id: id.to_string(),
            section_idx: 0,
            view: ChunkView::SummaryFallback {
                strategy: "test".to_string(),
                weight: 1.0,
            },
            text: "text".to_string(),
            tokens_estimate: 4,
            quality: QualityScore::default(),
            kvp_meta: Default::default(),
        }
    }

    #[test]
    fn quality_score_defaults_to_medium_trust() {
        let quality = QualityScore::default();
        assert!((quality.trust - 0.5).abs() < f32::EPSILON);
    }

    #[test]
    fn batch_is_empty_helpers_match_contents() {
        let empty_pairs = SampleBatch { pairs: Vec::new() };
        assert!(empty_pairs.is_empty());

        let non_empty_pairs = SampleBatch {
            pairs: vec![SamplePair {
                recipe: "r".to_string(),
                anchor: sample_chunk("a"),
                positive: sample_chunk("b"),
                weight: 1.0,
                instruction: None,
                label: PairLabel::Positive,
                reason: Some("test".to_string()),
            }],
        };
        assert!(!non_empty_pairs.is_empty());

        let empty_triplets = TripletBatch {
            triplets: Vec::new(),
        };
        assert!(empty_triplets.is_empty());

        let non_empty_triplets = TripletBatch {
            triplets: vec![SampleTriplet {
                recipe: "r".to_string(),
                anchor: sample_chunk("a"),
                positive: sample_chunk("b"),
                negative: sample_chunk("c"),
                weight: 1.0,
                instruction: Some("hint".to_string()),
            }],
        };
        assert!(!non_empty_triplets.is_empty());

        let empty_text = TextBatch {
            samples: Vec::new(),
        };
        assert!(empty_text.is_empty());

        let non_empty_text = TextBatch {
            samples: vec![TextSample {
                recipe: "r".to_string(),
                chunk: sample_chunk("t"),
                weight: 1.0,
                instruction: None,
            }],
        };
        assert!(!non_empty_text.is_empty());
    }

    #[test]
    fn data_record_roundtrip_basics_are_constructible() {
        let now = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
        let record = DataRecord {
            id: "source_a::1".to_string(),
            source: "source_a".to_string(),
            created_at: now,
            updated_at: now,
            quality: QualityScore { trust: 0.9 },
            taxonomy: vec!["topic:news".to_string()],
            sections: vec![RecordSection {
                role: SectionRole::Anchor,
                heading: Some("headline".to_string()),
                text: "body".to_string(),
                sentences: vec!["body".to_string()],
            }],
            meta_prefix: None,
        };

        assert_eq!(record.source, "source_a");
        assert_eq!(record.sections.len(), 1);
        // `assert_eq!` prints both roles on failure, unlike `assert!(matches!(..))`.
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
    }

    #[test]
    fn from_text_builds_single_context_section_with_defaults() {
        let record = DataRecord::from_text("doc-0", "my_corpus", "The quick brown fox.");

        // `id` and `source` are independent inputs and must not be conflated.
        assert_eq!(record.id, "doc-0");
        assert_eq!(record.source, "my_corpus");

        // Both timestamps come from a single clock reading.
        assert_eq!(record.created_at, record.updated_at);

        assert!((record.quality.trust - QualityScore::default().trust).abs() < f32::EPSILON);
        assert!(record.taxonomy.is_empty());
        assert!(record.meta_prefix.is_none());

        assert_eq!(record.sections.len(), 1);
        let section = &record.sections[0];
        assert_eq!(section.role, SectionRole::Context);
        assert!(section.heading.is_none());
        assert_eq!(section.text, "The quick brown fox.");
        assert!(section.sentences.is_empty());
    }

    #[test]
    fn from_text_with_role_uses_requested_role() {
        let record = DataRecord::from_text_with_role(
            "doc-1",
            "my_corpus",
            "What is the capital of France?",
            SectionRole::Anchor,
        );

        assert_eq!(record.id, "doc-1");
        assert_eq!(record.source, "my_corpus");
        assert_eq!(record.sections.len(), 1);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
        assert_eq!(record.sections[0].text, "What is the capital of France?");
    }
}
376}