pub mod alpaca;
pub mod chatml;
pub mod openai;
pub mod sharegpt;
pub mod together;
use crate::error::DatasetResult;
use crate::types::{DataFormat, PreferencePair, TrainingExample};
/// Two-way conversion between [`TrainingExample`] values and JSON values.
///
/// Implementors provide the single-item conversions and a name; the batch
/// methods come with default implementations built on the single-item ones.
pub trait FormatConverter: Send + Sync {
    /// Returns the name of this converter.
    fn name(&self) -> &str;

    /// Converts one training example into a JSON value.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports for this example.
    fn to_json(&self, example: &TrainingExample) -> DatasetResult<serde_json::Value>;

    /// Builds one training example from a JSON value.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports for this value.
    fn parse_json(&self, value: &serde_json::Value) -> DatasetResult<TrainingExample>;

    /// Converts every example, in order, using [`FormatConverter::to_json`].
    ///
    /// # Errors
    ///
    /// Stops at the first example that fails and returns its error; nothing
    /// converted before the failure is returned.
    fn to_json_batch(&self, examples: &[TrainingExample]) -> DatasetResult<Vec<serde_json::Value>> {
        let mut encoded = Vec::with_capacity(examples.len());
        for example in examples {
            encoded.push(self.to_json(example)?);
        }
        Ok(encoded)
    }

    /// Parses every value, in order, using [`FormatConverter::parse_json`].
    ///
    /// # Errors
    ///
    /// Stops at the first value that fails and returns its error; nothing
    /// parsed before the failure is returned.
    fn parse_json_batch(
        &self,
        values: &[serde_json::Value],
    ) -> DatasetResult<Vec<TrainingExample>> {
        let mut parsed = Vec::with_capacity(values.len());
        for value in values {
            parsed.push(self.parse_json(value)?);
        }
        Ok(parsed)
    }
}
/// Two-way conversion between [`PreferencePair`] values and JSON values.
///
/// Implementors provide the single-item conversions and a name; the batch
/// methods come with default implementations built on the single-item ones.
pub trait PreferenceConverter: Send + Sync {
    /// Returns the name of this converter.
    fn name(&self) -> &str;

    /// Converts one preference pair into a JSON value.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports for this pair.
    fn preference_to_json(&self, pair: &PreferencePair) -> DatasetResult<serde_json::Value>;

    /// Builds one preference pair from a JSON value.
    ///
    /// # Errors
    ///
    /// Returns whatever error the implementation reports for this value.
    fn parse_preference_json(&self, value: &serde_json::Value) -> DatasetResult<PreferencePair>;

    /// Converts every pair, in order, using
    /// [`PreferenceConverter::preference_to_json`].
    ///
    /// # Errors
    ///
    /// Stops at the first pair that fails and returns its error; nothing
    /// converted before the failure is returned.
    fn preference_to_json_batch(
        &self,
        pairs: &[PreferencePair],
    ) -> DatasetResult<Vec<serde_json::Value>> {
        let mut encoded = Vec::with_capacity(pairs.len());
        for pair in pairs {
            encoded.push(self.preference_to_json(pair)?);
        }
        Ok(encoded)
    }

    /// Parses every value, in order, using
    /// [`PreferenceConverter::parse_preference_json`].
    ///
    /// # Errors
    ///
    /// Stops at the first value that fails and returns its error; nothing
    /// parsed before the failure is returned.
    fn parse_preference_json_batch(
        &self,
        values: &[serde_json::Value],
    ) -> DatasetResult<Vec<PreferencePair>> {
        let mut parsed = Vec::with_capacity(values.len());
        for value in values {
            parsed.push(self.parse_preference_json(value)?);
        }
        Ok(parsed)
    }
}
/// Guesses which [`DataFormat`] a JSON value is written in by looking at its
/// keys.
///
/// The checks run in a fixed order and the first one that matches wins:
///
/// 1. a `messages` key is present -> [`DataFormat::OpenAI`]
/// 2. both `instruction` and `output` are present -> [`DataFormat::Alpaca`]
/// 3. a `conversations` key is present -> [`DataFormat::ShareGpt`]
/// 4. `text` is present and is a string -> [`DataFormat::ChatMl`] when the
///    string contains `<|im_start|>`, otherwise [`DataFormat::Together`]
///
/// Only key presence is tested for steps 1-3; the values under those keys are
/// not inspected. Returns `None` when no rule matches, including when `text`
/// exists but is not a string.
pub fn detect_format(value: &serde_json::Value) -> Option<DataFormat> {
    let has_key = |key: &str| value.get(key).is_some();

    if has_key("messages") {
        Some(DataFormat::OpenAI)
    } else if has_key("instruction") && has_key("output") {
        Some(DataFormat::Alpaca)
    } else if has_key("conversations") {
        Some(DataFormat::ShareGpt)
    } else {
        // Either `?` bails out with `None`: key missing, or value not a string.
        let text = value.get("text")?.as_str()?;
        let format = if text.contains("<|im_start|>") {
            DataFormat::ChatMl
        } else {
            DataFormat::Together
        };
        Some(format)
    }
}
pub use alpaca::AlpacaFormat;
pub use chatml::ChatMlFormat;
pub use openai::OpenAiFormat;
pub use sharegpt::ShareGptFormat;
pub use together::TogetherFormat;