brainwires_datasets/format/
mod.rs1pub mod alpaca;
3pub mod chatml;
5pub mod openai;
7pub mod sharegpt;
9pub mod together;
11
12use crate::error::DatasetResult;
13use crate::types::{DataFormat, PreferencePair, TrainingExample};
14
15pub trait FormatConverter: Send + Sync {
17 fn name(&self) -> &str;
19
20 fn to_json(&self, example: &TrainingExample) -> DatasetResult<serde_json::Value>;
22
23 fn parse_json(&self, value: &serde_json::Value) -> DatasetResult<TrainingExample>;
25
26 fn to_json_batch(&self, examples: &[TrainingExample]) -> DatasetResult<Vec<serde_json::Value>> {
28 examples.iter().map(|e| self.to_json(e)).collect()
29 }
30
31 fn parse_json_batch(
33 &self,
34 values: &[serde_json::Value],
35 ) -> DatasetResult<Vec<TrainingExample>> {
36 values.iter().map(|v| self.parse_json(v)).collect()
37 }
38}
39
40pub trait PreferenceConverter: Send + Sync {
42 fn name(&self) -> &str;
44
45 fn preference_to_json(&self, pair: &PreferencePair) -> DatasetResult<serde_json::Value>;
47
48 fn parse_preference_json(&self, value: &serde_json::Value) -> DatasetResult<PreferencePair>;
50
51 fn preference_to_json_batch(
53 &self,
54 pairs: &[PreferencePair],
55 ) -> DatasetResult<Vec<serde_json::Value>> {
56 pairs.iter().map(|p| self.preference_to_json(p)).collect()
57 }
58
59 fn parse_preference_json_batch(
61 &self,
62 values: &[serde_json::Value],
63 ) -> DatasetResult<Vec<PreferencePair>> {
64 values
65 .iter()
66 .map(|v| self.parse_preference_json(v))
67 .collect()
68 }
69}
70
71pub fn detect_format(value: &serde_json::Value) -> Option<DataFormat> {
73 if value.get("messages").is_some() {
74 return Some(DataFormat::OpenAI);
75 }
76 if value.get("instruction").is_some() && value.get("output").is_some() {
77 return Some(DataFormat::Alpaca);
78 }
79 if value.get("conversations").is_some() {
80 return Some(DataFormat::ShareGpt);
81 }
82 if let Some(text) = value.get("text").and_then(|v| v.as_str()) {
83 if text.contains("<|im_start|>") {
84 return Some(DataFormat::ChatMl);
85 }
86 return Some(DataFormat::Together);
87 }
88 None
89}
90
91pub use alpaca::AlpacaFormat;
92pub use chatml::ChatMlFormat;
93pub use openai::OpenAiFormat;
94pub use sharegpt::ShareGptFormat;
95pub use together::TogetherFormat;