use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use crate::kvp::KvpPrefixSampler;
pub use crate::types::{RecordId, Sentence, SourceId, TaxonomyValue};
/// Quality metadata carried by records and chunks.
///
/// Holds a single `trust` value.
#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
pub struct QualityScore {
    /// Trust value for the scored item.
    // NOTE(review): no range is enforced here — presumably 0.0..=1.0; confirm with producers.
    pub trust: f32,
}
impl Default for QualityScore {
    /// Returns a score whose `trust` is `1.0`.
    fn default() -> Self {
        Self { trust: 1.0 }
    }
}
/// A record with its identity, source, timestamps, quality score,
/// taxonomy values, and sections.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct DataRecord {
    /// Identifier of this record.
    pub id: RecordId,
    /// Identifier of the source associated with this record.
    pub source: SourceId,
    /// Creation timestamp, expressed in UTC.
    pub created_at: DateTime<Utc>,
    /// Last-update timestamp, expressed in UTC.
    pub updated_at: DateTime<Utc>,
    /// Quality score for the record.
    pub quality: QualityScore,
    /// Taxonomy values attached to the record.
    pub taxonomy: Vec<TaxonomyValue>,
    /// Sections that make up the record.
    pub sections: Vec<RecordSection>,
    /// Optional [`KvpPrefixSampler`] for this record.
    ///
    /// When the field is missing from the input it deserializes to `None`;
    /// when it is `None` it is omitted from the serialized output.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub meta_prefix: Option<KvpPrefixSampler>,
}
/// One section of a record: a role, an optional heading, the section text,
/// and a list of sentences.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RecordSection {
    /// Role this section plays.
    pub role: SectionRole,
    /// Heading of the section, if it has one.
    pub heading: Option<String>,
    /// Text of the section.
    pub text: String,
    /// Sentences belonging to this section.
    // NOTE(review): presumably a sentence-level split of `text` — confirm with the producer.
    pub sentences: Vec<Sentence>,
}
/// Role assigned to a [`RecordSection`] through its `role` field.
// NOTE(review): the exact meaning of each role is defined by the code that
// consumes it, which is not visible from this definition — confirm there.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum SectionRole {
    /// The section is marked as an anchor.
    Anchor,
    /// The section is marked as context.
    Context,
}
/// A chunk of text tied to a record, together with a description of how the
/// chunk was viewed, a token estimate, and a quality score.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct RecordChunk {
    /// Identifier of the record this chunk refers to.
    pub record_id: RecordId,
    /// Index of the section this chunk refers to.
    // NOTE(review): presumably an index into `DataRecord::sections` — confirm.
    pub section_idx: usize,
    /// Which kind of view this chunk represents.
    pub view: ChunkView,
    /// Text content of the chunk.
    pub text: String,
    /// Estimated token count for `text`.
    // NOTE(review): the estimation method is not defined here — check the chunker.
    pub tokens_estimate: usize,
    /// Quality score for the chunk.
    pub quality: QualityScore,
}
/// Describes the kind of view a [`RecordChunk`] carries in its `view` field.
// NOTE(review): units and exact semantics of the numeric fields below are not
// established by this definition — confirm against the code that builds chunks.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub enum ChunkView {
    /// A windowed view.
    Window {
        /// Index of the window.
        index: usize,
        /// Overlap amount for the window.
        overlap: usize,
        /// Span of the window.
        span: usize,
        /// Start position expressed as a ratio.
        start_ratio: f32,
    },
    /// A summary-based fallback view.
    SummaryFallback {
        /// Name of the strategy used.
        strategy: String,
        /// Weight associated with this fallback.
        weight: f32,
    },
}
/// A labelled pair of chunks, with a recipe name, a weight, and optional
/// instruction and reason strings.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SamplePair {
    /// Name of the recipe associated with this pair.
    pub recipe: String,
    /// The anchor chunk.
    pub anchor: RecordChunk,
    /// The chunk paired with the anchor.
    // NOTE(review): the field is named `positive`, yet `label` may be
    // `PairLabel::Negative` — confirm how producers fill it in that case.
    pub positive: RecordChunk,
    /// Weight of the pair.
    pub weight: f32,
    /// Optional instruction text.
    pub instruction: Option<String>,
    /// Label of the pair.
    pub label: PairLabel,
    /// Optional reason text.
    pub reason: Option<String>,
}
/// An anchor / positive / negative triple of chunks, with a recipe name,
/// a weight, and an optional instruction string.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SampleTriplet {
    /// Name of the recipe associated with this triplet.
    pub recipe: String,
    /// The anchor chunk.
    pub anchor: RecordChunk,
    /// The positive chunk.
    pub positive: RecordChunk,
    /// The negative chunk.
    pub negative: RecordChunk,
    /// Weight of the triplet.
    pub weight: f32,
    /// Optional instruction text.
    pub instruction: Option<String>,
}
/// Label carried by a [`SamplePair`] in its `label` field.
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
pub enum PairLabel {
    /// The pair is labelled positive.
    Positive,
    /// The pair is labelled negative.
    Negative,
}
/// A batch of [`SamplePair`] values.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SampleBatch {
    /// Pairs contained in the batch.
    pub pairs: Vec<SamplePair>,
}
impl SampleBatch {
    /// Returns `true` when the batch contains no pairs.
    pub fn is_empty(&self) -> bool {
        let Self { pairs } = self;
        pairs.is_empty()
    }
}
/// A batch of [`SampleTriplet`] values.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TripletBatch {
    /// Triplets contained in the batch.
    pub triplets: Vec<SampleTriplet>,
}
impl TripletBatch {
    /// Returns `true` when the batch contains no triplets.
    pub fn is_empty(&self) -> bool {
        let Self { triplets } = self;
        triplets.is_empty()
    }
}
/// A single-chunk sample, with a recipe name, a weight, and an optional
/// instruction string.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TextSample {
    /// Name of the recipe associated with this sample.
    pub recipe: String,
    /// The chunk this sample carries.
    pub chunk: RecordChunk,
    /// Weight of the sample.
    pub weight: f32,
    /// Optional instruction text.
    pub instruction: Option<String>,
}
/// A batch of [`TextSample`] values.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TextBatch {
    /// Samples contained in the batch.
    pub samples: Vec<TextSample>,
}
impl TextBatch {
    /// Returns `true` when the batch contains no samples.
    pub fn is_empty(&self) -> bool {
        let Self { samples } = self;
        samples.is_empty()
    }
}