Skip to main content

phago_runtime/
training_format.rs

//! Training data format generators.
//!
//! Converts curriculum-ordered triples into JSONL and Alpaca
//! instruction format for language model fine-tuning.
5
6use crate::curriculum::Curriculum;
7use crate::export::WeightedTriple;
8use serde::Serialize;
9
/// A single training example in JSONL format.
///
/// Each value is serialized by `to_jsonl` / `to_jsonl_random` as one JSON
/// object per output line. Fields are serialized in declaration order.
#[derive(Debug, Clone, Serialize)]
pub struct TrainingExample {
    /// Question posed to the model about the triple's subject and object.
    pub instruction: String,
    /// Extra context for the instruction; `triple_to_example` leaves it empty.
    pub input: String,
    /// Expected answer text describing the relationship.
    pub output: String,
    /// Weight copied from the source triple.
    pub weight: f64,
    /// Label of the curriculum section the triple came from
    /// (`"foundation"`, `"bridge"`, or `"periphery"` for the generators in this file).
    pub section: String,
}
19
20/// Generate JSONL training data from a curriculum.
21pub fn to_jsonl(curriculum: &Curriculum) -> String {
22    let mut lines = Vec::new();
23
24    for triple in &curriculum.foundation {
25        lines.push(triple_to_example(triple, "foundation"));
26    }
27    for triple in &curriculum.bridges {
28        lines.push(triple_to_example(triple, "bridge"));
29    }
30    for triple in &curriculum.periphery {
31        lines.push(triple_to_example(triple, "periphery"));
32    }
33
34    lines.iter()
35        .filter_map(|ex| serde_json::to_string(ex).ok())
36        .collect::<Vec<_>>()
37        .join("\n")
38}
39
40/// Generate randomly-ordered JSONL from the same triples (baseline).
41pub fn to_jsonl_random(curriculum: &Curriculum, seed: u64) -> String {
42    let mut all_triples: Vec<(&WeightedTriple, &str)> = Vec::new();
43    for t in &curriculum.foundation { all_triples.push((t, "foundation")); }
44    for t in &curriculum.bridges { all_triples.push((t, "bridge")); }
45    for t in &curriculum.periphery { all_triples.push((t, "periphery")); }
46
47    // Deterministic shuffle
48    let mut indices: Vec<usize> = (0..all_triples.len()).collect();
49    let mut rng = seed;
50    for i in (1..indices.len()).rev() {
51        rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
52        let j = (rng >> 33) as usize % (i + 1);
53        indices.swap(i, j);
54    }
55
56    let lines: Vec<String> = indices.iter()
57        .filter_map(|&i| {
58            let (triple, section) = all_triples.get(i)?;
59            let ex = triple_to_example(triple, section);
60            serde_json::to_string(&ex).ok()
61        })
62        .collect();
63
64    lines.join("\n")
65}
66
67fn triple_to_example(triple: &WeightedTriple, section: &str) -> TrainingExample {
68    TrainingExample {
69        instruction: format!(
70            "What is the relationship between '{}' and '{}'?",
71            triple.subject, triple.object
72        ),
73        input: String::new(),
74        output: format!(
75            "'{}' is {} '{}'. This is a {} concept with connection strength {:.2}.",
76            triple.subject,
77            triple.predicate,
78            triple.object,
79            section,
80            triple.weight,
81        ),
82        weight: triple.weight,
83        section: section.to_string(),
84    }
85}
86
87/// Count examples per section.
88pub fn section_counts(curriculum: &Curriculum) -> (usize, usize, usize) {
89    (
90        curriculum.foundation.len(),
91        curriculum.bridges.len(),
92        curriculum.periphery.len(),
93    )
94}