Skip to main content

brainwires_datasets/
lib.rs

1#![deny(missing_docs)]
2//! # Brainwires Datasets
3//!
4//! Training data pipelines for the Brainwires Agent Framework.
5//!
6//! Provides JSONL I/O, tokenization, deduplication, format conversion, and
7//! dataset management for cloud and local model fine-tuning workflows.
8
9/// Dataset trait and concrete dataset implementations.
10pub mod dataset;
11/// Error types for dataset operations.
12pub mod error;
13/// Format converters for various fine-tuning providers.
14pub mod format;
15/// JSONL reader and writer for streaming I/O.
16pub mod jsonl;
17/// Data quality validation, statistics, and deduplication.
18pub mod quality;
19/// Train/eval splitting, curriculum ordering, and sampling utilities.
20pub mod sampling;
21/// Tokenizer abstractions and implementations.
22pub mod tokenizer;
23/// Core training data types (messages, examples, preference pairs).
24pub mod types;
25
26// Re-export core types
27pub use dataset::{Dataset, InstructDataset, PreferenceDataset};
28pub use error::{DatasetError, DatasetResult};
29pub use format::{
30    AlpacaFormat, ChatMlFormat, FormatConverter, OpenAiFormat, PreferenceConverter, ShareGptFormat,
31    TogetherFormat, detect_format,
32};
33pub use jsonl::{
34    JsonlReader, JsonlWriter, read_jsonl, read_jsonl_preferences, write_jsonl,
35    write_jsonl_preferences,
36};
37pub use quality::{
38    DataValidator, DatasetStats, HistogramBucket, IssueSeverity, PreferenceStats, RoleCounts,
39    ValidationIssue, ValidationReport, ValidatorConfig, compute_preference_stats, compute_stats,
40};
41pub use sampling::{
42    PreferenceSplitResult, SplitConfig, SplitResult, curriculum_order, preference_curriculum_order,
43    preference_sample_n, preference_train_eval_split, sample_n, train_eval_split,
44};
45pub use types::{DataFormat, PreferencePair, TrainingExample, TrainingMessage, TrainingRole};
46
47// Feature-gated re-exports
48#[cfg(feature = "hf-tokenizer")]
49pub use tokenizer::HfTokenizer;
50
51#[cfg(feature = "tiktoken")]
52pub use tokenizer::TiktokenTokenizer;
53
54#[cfg(feature = "dedup")]
55pub use quality::{Deduplicator, exact_dedup, exact_dedup_preferences};
56
57pub use tokenizer::Tokenizer;