Skip to main content

triplets_core/
data.rs

1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use std::collections::HashMap;
4
5use crate::kvp::KvpPrefixSampler;
6
7pub use crate::types::{RecordId, Sentence, SourceId, TaxonomyValue};
8
9/// Trust/quality metadata for a record.
10#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
11pub struct QualityScore {
12    /// Normalized 0-1 trust measure combining provenance, recency, and manual reviews.
13    pub trust: f32,
14}
15
16impl Default for QualityScore {
17    fn default() -> Self {
18        Self {
19            // Assume medium trust by default, allowing recipes to upweight or downweight based on other signals.
20            trust: 0.5,
21        }
22    }
23}
24
25/// Canonical record payload produced by a DataSource.
26#[derive(Clone, Debug, Serialize, Deserialize)]
27pub struct DataRecord {
28    /// Stable record identifier (used for splits and determinism).
29    pub id: RecordId,
30    /// Source identifier that produced this record.
31    pub source: SourceId,
32    /// Canonical creation time for the record (used for ordering/metadata).
33    pub created_at: DateTime<Utc>,
34    /// Last update time for the record (used for refresh decisions).
35    pub updated_at: DateTime<Utc>,
36    /// Trust/quality score used to weight sampling.
37    pub quality: QualityScore,
38    /// Free-form tags (e.g., source id, year, date) used for filtering/recipes.
39    pub taxonomy: Vec<TaxonomyValue>,
40    /// Structured content sections used by sampling recipes.
41    pub sections: Vec<RecordSection>,
42    /// Optional metadata prefix policy for KVP sampling (key-value headers injected into text).
43    #[serde(default, skip_serializing_if = "Option::is_none")]
44    pub meta_prefix: Option<KvpPrefixSampler>,
45}
46
47impl DataRecord {
48    /// Create a record with a single [`SectionRole::Context`] section from a plain text string.
49    ///
50    /// The `id` and `source` are set to the same value. Use [`DataRecord::from_text_with_role`]
51    /// to assign a different role, or construct the struct directly for full control.
52    ///
53    /// # Example
54    ///
55    /// ```
56    /// use triplets_core::DataRecord;
57    ///
58    /// let record = DataRecord::from_text("doc-0", "my_corpus", "The quick brown fox.");
59    /// assert_eq!(record.id.as_str(), "doc-0");
60    /// assert_eq!(record.sections[0].text, "The quick brown fox.");
61    /// ```
62    pub fn from_text(
63        id: impl Into<crate::types::RecordId>,
64        source: impl Into<crate::types::SourceId>,
65        text: impl Into<String>,
66    ) -> Self {
67        Self::from_text_with_role(id, source, text, SectionRole::Context)
68    }
69
70    /// Create a record with a single section of the given role from a plain text string.
71    ///
72    /// # Example
73    ///
74    /// ```
75    /// use triplets_core::{DataRecord, SectionRole};
76    ///
77    /// let record = DataRecord::from_text_with_role(
78    ///     "doc-0", "my_corpus", "What is the capital of France?", SectionRole::Anchor,
79    /// );
80    /// assert_eq!(record.sections[0].role, SectionRole::Anchor);
81    /// ```
82    pub fn from_text_with_role(
83        id: impl Into<crate::types::RecordId>,
84        source: impl Into<crate::types::SourceId>,
85        text: impl Into<String>,
86        role: SectionRole,
87    ) -> Self {
88        let now = chrono::Utc::now();
89        Self {
90            id: id.into(),
91            source: source.into(),
92            created_at: now,
93            updated_at: now,
94            quality: QualityScore::default(),
95            taxonomy: vec![],
96            sections: vec![RecordSection {
97                role,
98                heading: None,
99                text: text.into(),
100                sentences: vec![],
101            }],
102            meta_prefix: None,
103        }
104    }
105}
106
107/// A structured section within a record.
108#[derive(Clone, Debug, Serialize, Deserialize)]
109pub struct RecordSection {
110    /// Semantic role used by selectors (for example, anchor vs context text).
111    pub role: SectionRole,
112    /// Optional short heading/title for this section.
113    pub heading: Option<String>,
114    /// Full section text.
115    pub text: String,
116    /// Sentence-level segmentation of `text` used by chunking strategies.
117    pub sentences: Vec<Sentence>,
118}
119
120/// Role label for a section.
121#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
122pub enum SectionRole {
123    /// Primary section typically used as an anchor candidate.
124    Anchor,
125    /// Supporting/context section used for positives, negatives, or text samples.
126    Context,
127}
128
129/// A chunked view over a section.
130#[derive(Clone, Debug, Serialize, Deserialize)]
131pub struct RecordChunk {
132    /// Parent record id this chunk belongs to.
133    pub record_id: RecordId,
134    /// Index of the source section in `DataRecord.sections`.
135    pub section_idx: usize,
136    /// Chunk view metadata (window position or summary fallback).
137    pub view: ChunkView,
138    /// Rendered chunk text (possibly with metadata prefix decoration).
139    pub text: String,
140    /// Approximate token count for scheduling/weighting heuristics.
141    pub tokens_estimate: usize,
142    /// Trust/quality inherited from the parent record.
143    pub quality: QualityScore,
144    /// All KVP metadata defined on the source record's `meta_prefix`, exposed for
145    /// downstream inspection and debugging. Contains every key with all its possible
146    /// values across all variants — unaffected by presence probability, dropout, or
147    /// which variant was sampled into this chunk's text.
148    ///
149    /// Populated unconditionally by the sampler during chunk decoration. Empty when the
150    /// record has no `meta_prefix` configured.
151    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
152    pub kvp_meta: HashMap<String, Vec<String>>,
153}
154
155/// Chunk view metadata (window or summary).
156#[derive(Clone, Debug, Serialize, Deserialize)]
157pub enum ChunkView {
158    /// Sliding-window chunk extracted directly from section text.
159    Window {
160        /// Zero-based window index within the section.
161        index: usize,
162        /// Overlap (in tokens) with the previous window.
163        overlap: usize,
164        /// Nominal window span in tokens.
165        span: usize,
166    },
167    /// Summary fallback chunk used when window extraction is unavailable.
168    SummaryFallback {
169        /// Name of summary strategy that produced this fallback chunk.
170        strategy: String,
171        /// Precomputed base weight for summary-fallback chunks before trust/floor are applied.
172        weight: f32,
173    },
174}
175
176/// Sample pair (positive/negative) derived from a triplet.
177#[derive(Clone, Debug, Serialize, Deserialize)]
178pub struct SamplePair {
179    /// Recipe name used to generate this pair.
180    pub recipe: String,
181    /// Anchor chunk used to build this supervised pair.
182    pub anchor: RecordChunk,
183    /// Candidate chunk paired with the anchor.
184    pub positive: RecordChunk,
185    /// Training weight for this pair.
186    pub weight: f32,
187    /// Optional instruction/prompt hint for this sample.
188    pub instruction: Option<String>,
189    /// Supervision label (positive or negative).
190    pub label: PairLabel,
191    /// Optional reason/annotation describing the label.
192    pub reason: Option<String>,
193}
194
195/// Sample triplet (anchor/positive/negative).
196#[derive(Clone, Debug, Serialize, Deserialize)]
197pub struct SampleTriplet {
198    /// Recipe name used to generate this triplet.
199    pub recipe: String,
200    /// Anchor chunk.
201    pub anchor: RecordChunk,
202    /// Positive chunk.
203    pub positive: RecordChunk,
204    /// Negative chunk.
205    pub negative: RecordChunk,
206    /// Training weight for this triplet.
207    pub weight: f32,
208    /// Optional instruction/prompt hint for this sample.
209    pub instruction: Option<String>,
210}
211
212/// Pair label for supervised pair batches.
213#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
214pub enum PairLabel {
215    /// Anchor and candidate are semantically aligned.
216    Positive,
217    /// Anchor and candidate are semantically mismatched.
218    Negative,
219}
220
221/// Batch of pairs.
222#[derive(Clone, Debug, Serialize, Deserialize)]
223pub struct SampleBatch {
224    /// Pair samples contained in this batch.
225    pub pairs: Vec<SamplePair>,
226}
227
228impl SampleBatch {
229    /// Returns `true` when the batch has no pairs.
230    pub fn is_empty(&self) -> bool {
231        self.pairs.is_empty()
232    }
233}
234
235/// Batch of triplets.
236#[derive(Clone, Debug, Serialize, Deserialize)]
237pub struct TripletBatch {
238    /// Triplet samples contained in this batch.
239    pub triplets: Vec<SampleTriplet>,
240}
241
242impl TripletBatch {
243    /// Returns `true` when the batch has no triplets.
244    pub fn is_empty(&self) -> bool {
245        self.triplets.is_empty()
246    }
247}
248
249/// A single text sample (chunk + weight).
250#[derive(Clone, Debug, Serialize, Deserialize)]
251pub struct TextSample {
252    /// Recipe name used to generate this sample.
253    pub recipe: String,
254    /// Chunk payload used for this text sample.
255    pub chunk: RecordChunk,
256    /// Training weight for this sample.
257    pub weight: f32,
258    /// Optional instruction/prompt hint for this sample.
259    pub instruction: Option<String>,
260}
261
262/// Batch of text samples.
263#[derive(Clone, Debug, Serialize, Deserialize)]
264pub struct TextBatch {
265    /// Text samples contained in this batch.
266    pub samples: Vec<TextSample>,
267}
268
269impl TextBatch {
270    /// Returns `true` when the batch has no text samples.
271    pub fn is_empty(&self) -> bool {
272        self.samples.is_empty()
273    }
274}
275
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{TimeZone, Utc};

    /// Builds a minimal summary-fallback chunk that belongs to record `id`.
    fn sample_chunk(id: &str) -> RecordChunk {
        let view = ChunkView::SummaryFallback {
            strategy: String::from("test"),
            weight: 1.0,
        };
        RecordChunk {
            record_id: id.to_owned(),
            section_idx: 0,
            view,
            text: String::from("text"),
            tokens_estimate: 4,
            quality: QualityScore::default(),
            kvp_meta: Default::default(),
        }
    }

    /// The default quality score must sit at medium trust (0.5).
    #[test]
    fn quality_score_defaults_to_medium_trust() {
        let QualityScore { trust } = QualityScore::default();
        let deviation = (trust - 0.5).abs();
        assert!(deviation < f32::EPSILON);
    }

    /// Each batch type reports empty exactly when its inner vector is empty.
    #[test]
    fn batch_is_empty_helpers_match_contents() {
        // Pair batches.
        let empty_pairs = SampleBatch { pairs: vec![] };
        assert!(empty_pairs.is_empty());

        let pair = SamplePair {
            recipe: String::from("r"),
            anchor: sample_chunk("a"),
            positive: sample_chunk("b"),
            weight: 1.0,
            instruction: None,
            label: PairLabel::Positive,
            reason: Some(String::from("test")),
        };
        let non_empty_pairs = SampleBatch { pairs: vec![pair] };
        assert!(!non_empty_pairs.is_empty());

        // Triplet batches.
        let empty_triplets = TripletBatch { triplets: vec![] };
        assert!(empty_triplets.is_empty());

        let triplet = SampleTriplet {
            recipe: String::from("r"),
            anchor: sample_chunk("a"),
            positive: sample_chunk("b"),
            negative: sample_chunk("c"),
            weight: 1.0,
            instruction: Some(String::from("hint")),
        };
        let non_empty_triplets = TripletBatch {
            triplets: vec![triplet],
        };
        assert!(!non_empty_triplets.is_empty());

        // Text batches.
        let empty_text = TextBatch { samples: vec![] };
        assert!(empty_text.is_empty());

        let sample = TextSample {
            recipe: String::from("r"),
            chunk: sample_chunk("t"),
            weight: 1.0,
            instruction: None,
        };
        let non_empty_text = TextBatch {
            samples: vec![sample],
        };
        assert!(!non_empty_text.is_empty());
    }

    /// A fully spelled-out `DataRecord` can be built and its fields read back.
    #[test]
    fn data_record_roundtrip_basics_are_constructible() {
        let now = Utc.with_ymd_and_hms(2025, 1, 1, 0, 0, 0).unwrap();
        let headline_section = RecordSection {
            role: SectionRole::Anchor,
            heading: Some(String::from("headline")),
            text: String::from("body"),
            sentences: vec![String::from("body")],
        };
        let record = DataRecord {
            id: String::from("source_a::1"),
            source: String::from("source_a"),
            created_at: now,
            updated_at: now,
            quality: QualityScore { trust: 0.9 },
            taxonomy: vec![String::from("topic:news")],
            sections: vec![headline_section],
            meta_prefix: None,
        };

        assert_eq!(record.source, "source_a");
        assert_eq!(record.sections.len(), 1);
        assert_eq!(record.sections[0].role, SectionRole::Anchor);
    }
}