brainwires_datasets/format/
mod.rs

1/// Alpaca instruction-following format converter.
2pub mod alpaca;
3/// ChatML template format converter.
4pub mod chatml;
5/// OpenAI fine-tuning format converter.
6pub mod openai;
7/// ShareGPT conversation format converter.
8pub mod sharegpt;
9/// Together AI fine-tuning format converter.
10pub mod together;
11
12use crate::error::DatasetResult;
13use crate::types::{DataFormat, PreferencePair, TrainingExample};
14
15/// Convert training examples to/from a specific provider format.
16pub trait FormatConverter: Send + Sync {
17    /// Name of this format (e.g., "openai", "alpaca").
18    fn name(&self) -> &str;
19
20    /// Convert a TrainingExample to this format's JSON representation.
21    fn to_json(&self, example: &TrainingExample) -> DatasetResult<serde_json::Value>;
22
23    /// Parse this format's JSON back into a TrainingExample.
24    fn parse_json(&self, value: &serde_json::Value) -> DatasetResult<TrainingExample>;
25
26    /// Convert a batch of examples to this format.
27    fn to_json_batch(&self, examples: &[TrainingExample]) -> DatasetResult<Vec<serde_json::Value>> {
28        examples.iter().map(|e| self.to_json(e)).collect()
29    }
30
31    /// Parse a batch of JSON values into training examples.
32    fn parse_json_batch(
33        &self,
34        values: &[serde_json::Value],
35    ) -> DatasetResult<Vec<TrainingExample>> {
36        values.iter().map(|v| self.parse_json(v)).collect()
37    }
38}
39
40/// Convert preference pairs to/from a specific provider format.
41pub trait PreferenceConverter: Send + Sync {
42    /// Name of this format.
43    fn name(&self) -> &str;
44
45    /// Convert a PreferencePair to this format's JSON representation.
46    fn preference_to_json(&self, pair: &PreferencePair) -> DatasetResult<serde_json::Value>;
47
48    /// Parse this format's JSON back into a PreferencePair.
49    fn parse_preference_json(&self, value: &serde_json::Value) -> DatasetResult<PreferencePair>;
50
51    /// Convert a batch of preference pairs to this format.
52    fn preference_to_json_batch(
53        &self,
54        pairs: &[PreferencePair],
55    ) -> DatasetResult<Vec<serde_json::Value>> {
56        pairs.iter().map(|p| self.preference_to_json(p)).collect()
57    }
58
59    /// Parse a batch of JSON values into preference pairs.
60    fn parse_preference_json_batch(
61        &self,
62        values: &[serde_json::Value],
63    ) -> DatasetResult<Vec<PreferencePair>> {
64        values
65            .iter()
66            .map(|v| self.parse_preference_json(v))
67            .collect()
68    }
69}
70
71/// Auto-detect the format of a JSON value.
72pub fn detect_format(value: &serde_json::Value) -> Option<DataFormat> {
73    if value.get("messages").is_some() {
74        return Some(DataFormat::OpenAI);
75    }
76    if value.get("instruction").is_some() && value.get("output").is_some() {
77        return Some(DataFormat::Alpaca);
78    }
79    if value.get("conversations").is_some() {
80        return Some(DataFormat::ShareGpt);
81    }
82    if let Some(text) = value.get("text").and_then(|v| v.as_str()) {
83        if text.contains("<|im_start|>") {
84            return Some(DataFormat::ChatMl);
85        }
86        return Some(DataFormat::Together);
87    }
88    None
89}
90
91pub use alpaca::AlpacaFormat;
92pub use chatml::ChatMlFormat;
93pub use openai::OpenAiFormat;
94pub use sharegpt::ShareGptFormat;
95pub use together::TogetherFormat;
brainwires_datasets/format/mod.rs

brainwires_datasets/format/
mod.rs