brainwires_datasets/
lib.rs1#![deny(missing_docs)]
// Public submodules. The one-line summaries below list only what this file
// itself shows: the names each module contributes to the `pub use` re-exports
// further down. They are not a full inventory of each module's contents.
// NOTE(review): `#![deny(missing_docs)]` is active for this crate, so every
// public item (including these modules) needs a `///` doc comment to compile.

// Source of `Dataset`, `InstructDataset`, `PreferenceDataset`.
2pub mod dataset;
// Source of `DatasetError` and `DatasetResult`.
11pub mod error;
// Source of the `*Format` types (`AlpacaFormat`, `ChatMlFormat`, `OpenAiFormat`,
// `ShareGptFormat`, `TogetherFormat`), `FormatConverter`, `PreferenceConverter`
// and `detect_format`.
13pub mod format;
// Source of `JsonlReader`, `JsonlWriter` and the `read_jsonl*` / `write_jsonl*`
// functions.
15pub mod jsonl;
// Source of `DataValidator`, the validation/statistics types and the
// `compute_stats` / `compute_preference_stats` functions; also of the
// deduplication items that are re-exported only under the `dedup` feature.
17pub mod quality;
// Source of the split/sample/curriculum-ordering functions and their
// `SplitConfig` / `SplitResult` / `PreferenceSplitResult` types.
19pub mod sampling;
// Source of `Tokenizer`, plus `HfTokenizer` and `TiktokenTokenizer`, which are
// re-exported only under the `hf-tokenizer` and `tiktoken` features.
21pub mod tokenizer;
// Source of `DataFormat`, `PreferencePair`, `TrainingExample`,
// `TrainingMessage`, `TrainingRole`.
23pub mod types;
25
// Unconditional re-exports: each `pub use` below makes the listed items
// nameable directly from this module (presumably the crate root) in addition
// to their defining submodule, so callers do not need the full module path.

// From `dataset`.
26pub use dataset::{Dataset, InstructDataset, PreferenceDataset};
// From `error`.
28pub use error::{DatasetError, DatasetResult};
// From `format`.
29pub use format::{
30 AlpacaFormat, ChatMlFormat, FormatConverter, OpenAiFormat, PreferenceConverter, ShareGptFormat,
31 TogetherFormat, detect_format,
32};
// From `jsonl`.
33pub use jsonl::{
34 JsonlReader, JsonlWriter, read_jsonl, read_jsonl_preferences, write_jsonl,
35 write_jsonl_preferences,
36};
// From `quality` (the deduplication items are re-exported separately, behind
// the `dedup` feature).
37pub use quality::{
38 DataValidator, DatasetStats, HistogramBucket, IssueSeverity, PreferenceStats, RoleCounts,
39 ValidationIssue, ValidationReport, ValidatorConfig, compute_preference_stats, compute_stats,
40};
// From `sampling`.
41pub use sampling::{
42 PreferenceSplitResult, SplitConfig, SplitResult, curriculum_order, preference_curriculum_order,
43 preference_sample_n, preference_train_eval_split, sample_n, train_eval_split,
44};
// From `types`.
45pub use types::{DataFormat, PreferencePair, TrainingExample, TrainingMessage, TrainingRole};
46
// Feature-gated re-exports: each `#[cfg(feature = "...")]` applies only to the
// single `pub use` that follows it, so the name is absent from this module
// unless that Cargo feature is enabled.

// `HfTokenizer` is re-exported only with the `hf-tokenizer` feature.
47#[cfg(feature = "hf-tokenizer")]
49pub use tokenizer::HfTokenizer;
50
// `TiktokenTokenizer` is re-exported only with the `tiktoken` feature.
51#[cfg(feature = "tiktoken")]
52pub use tokenizer::TiktokenTokenizer;
53
// The deduplication items from `quality` are re-exported only with the
// `dedup` feature.
54#[cfg(feature = "dedup")]
55pub use quality::{Deduplicator, exact_dedup, exact_dedup_preferences};
56
// No `cfg` attribute here: `Tokenizer` is re-exported regardless of which
// features are enabled.
57pub use tokenizer::Tokenizer;