//! Training-data utilities: loading, validating, sampling, and converting
//! instruction-tuning and preference-pair datasets.
//!
//! Functionality is split into focused modules (JSONL I/O, format
//! conversion, quality checks, sampling/splitting, tokenization); the most
//! commonly used types and functions are re-exported at the crate root.
//! NOTE: crate-level docs are required here — `deny(missing_docs)` below
//! otherwise rejects the crate itself.
#![deny(missing_docs)]

pub mod dataset;
pub mod error;
pub mod format;
pub mod jsonl;
pub mod quality;
pub mod sampling;
pub mod tokenizer;
pub mod types;

// Unconditional re-exports, grouped and ordered alphabetically by module.
pub use dataset::{Dataset, InstructDataset, PreferenceDataset};
pub use error::{DatasetError, DatasetResult};
pub use format::{
    AlpacaFormat, ChatMlFormat, FormatConverter, OpenAiFormat, PreferenceConverter, ShareGptFormat,
    TogetherFormat, detect_format,
};
pub use jsonl::{
    JsonlReader, JsonlWriter, read_jsonl, read_jsonl_preferences, write_jsonl,
    write_jsonl_preferences,
};
pub use quality::{
    DataValidator, DatasetStats, HistogramBucket, IssueSeverity, PreferenceStats, RoleCounts,
    ValidationIssue, ValidationReport, ValidatorConfig, compute_preference_stats, compute_stats,
};
pub use sampling::{
    PreferenceSplitResult, SplitConfig, SplitResult, curriculum_order, preference_curriculum_order,
    preference_sample_n, preference_train_eval_split, sample_n, train_eval_split,
};
pub use tokenizer::Tokenizer;
pub use types::{DataFormat, PreferencePair, TrainingExample, TrainingMessage, TrainingRole};

// Feature-gated re-exports, kept together so optional surface is easy to audit.
#[cfg(feature = "dedup")]
pub use quality::{Deduplicator, exact_dedup, exact_dedup_preferences};
#[cfg(feature = "hf-tokenizer")]
pub use tokenizer::HfTokenizer;
#[cfg(feature = "tiktoken")]
pub use tokenizer::TiktokenTokenizer;