#![doc = include_str!("../README.md")]
#![warn(missing_docs)]
// Module declarations for the crate.
//
// `pub mod` entries are addressable by path from outside the crate; the two
// plain `mod` entries (`epoch`, `errors`) are visible only inside it.
pub mod chunking;
pub mod config;
pub mod constants;
pub mod data;
// Crate-private: declared without `pub`.
mod epoch;
pub mod example_apps;
pub mod hash;
pub mod heuristics;
pub mod ingestion;
pub mod kvp;
pub mod metadata;
pub mod metrics;
pub mod sampler;
pub mod source;
pub mod splits;
pub mod types;
pub mod utils;
// Crate-private: the module path itself is not exported, but `SamplerError`
// is exposed through the `pub use errors::SamplerError` re-export.
mod errors;
// Re-exports: each `pub use` below makes the listed items nameable directly
// from this module's path, in addition to their defining module's path
// (or instead of it, for items coming from the private `errors` module).
pub use chunking::{ChunkingAlgorithm, SlidingWindowChunker};
pub use config::{
ChunkingStrategy, NegativeStrategy, SamplerConfig, Selector, TextRecipe, TripletRecipe,
};
pub use data::{
DataRecord, PairLabel, QualityScore, RecordChunk, SampleBatch, SamplePair, SampleTriplet,
SectionRole, TextBatch, TextSample, TripletBatch,
};
// `errors` is a private module, so this re-export is the only path by which
// `SamplerError` is exported from this file.
pub use errors::SamplerError;
pub use hash::stable_hash_str;
pub use ingestion::{IngestionManager, RecordCache};
pub use kvp::{KvpField, KvpPrefixSampler};
pub use sampler::{BatchPrefetcher, Sampler, TripletSampler};
// Compiled only when the `huggingface` Cargo feature is enabled; without it
// these names are not re-exported here.
#[cfg(feature = "huggingface")]
pub use source::backends::huggingface_source::{
HfListRoots, HfSourceEntry, build_hf_sources, load_hf_sources_from_list,
managed_hf_list_snapshot_dir, managed_hf_snapshot_dir, parse_csv_fields, parse_hf_source_line,
parse_hf_uri, resolve_hf_list_roots,
};
pub use source::{DataSource, SourceCursor};
// Also gated on the `huggingface` feature, like the backend re-exports above
// the ungated `source` re-export.
#[cfg(feature = "huggingface")]
pub use source::{HuggingFaceRowSource, HuggingFaceRowsConfig};
pub use splits::{DeterministicSplitStore, FileSplitStore, SplitLabel, SplitRatios, SplitStore};
pub use types::{
CategoryId, HashPart, KvpValue, LogMessage, MetaValue, PathString, RecipeKey, RecordId,
Sentence, SourceId, TaxonomyValue,
};