1#![doc = include_str!("../README.md")]
2#![warn(missing_docs)]
3
/// Chunking support; declares [`ChunkingAlgorithm`] and [`SlidingWindowChunker`], both
/// re-exported at the crate root.
4pub mod chunking;
/// Configuration types; declares [`SamplerConfig`], [`DenoiserConfig`], [`ChunkingStrategy`],
/// [`NegativeStrategy`], [`Selector`], [`TextRecipe`] and [`TripletRecipe`], all re-exported
/// at the crate root.
6pub mod config;
/// The `constants` module.
// NOTE(review): nothing from `constants` is re-exported in this file; presumably it holds
// crate-wide constant values — confirm against the module itself.
8pub mod constants;
/// Record and batch data types; declares [`DataRecord`], [`RecordChunk`], [`SampleBatch`],
/// [`SamplePair`], [`SampleTriplet`], [`TextBatch`], [`TextSample`], [`TripletBatch`],
/// [`PairLabel`], [`QualityScore`] and [`SectionRole`], all re-exported at the crate root.
10pub mod data;
// Private module: not part of the public API and nothing from it is re-exported here.
// NOTE(review): presumably epoch bookkeeping for sampling — confirm against the module.
12mod epoch;
13
/// Hashing helpers; declares [`stable_hash_str`], re-exported at the crate root.
14pub mod hash;
/// The `heuristics` module.
// NOTE(review): nothing from `heuristics` is re-exported in this file; purpose inferred
// from the name only — confirm against the module itself.
16pub mod heuristics;
/// Ingestion layer; declares [`IngestionManager`] and [`RecordCache`], both re-exported at
/// the crate root.
18pub mod ingestion;
/// KVP support; declares [`KvpField`] and [`KvpPrefixSampler`], both re-exported at the
/// crate root.
20pub mod kvp;
/// The `metadata` module.
// NOTE(review): nothing from `metadata` is re-exported in this file; purpose inferred
// from the name only — confirm against the module itself.
22pub mod metadata;
/// The `metrics` module.
// NOTE(review): nothing from `metrics` is re-exported in this file; purpose inferred
// from the name only — confirm against the module itself.
24pub mod metrics;
/// Text preprocessing; declares [`TextPreprocessor`] and, under
/// `backends::denoiser_preprocessor`, [`DenoiserPreprocessor`], both re-exported at the
/// crate root.
26pub mod preprocessor;
/// Sampling entry points; declares [`Sampler`], [`TripletSampler`] and [`BatchPrefetcher`],
/// all re-exported at the crate root.
28pub mod sampler;
/// Data sources; declares [`DataSource`], [`SourceCursor`] and [`InMemorySource`], plus
/// [`CsvSource`] and [`CsvSourceConfig`] under `backends::csv_source`, all re-exported at
/// the crate root.
30pub mod source;
/// Split handling; declares [`SplitStore`], [`DeterministicSplitStore`], [`FileSplitStore`],
/// [`SplitLabel`] and [`SplitRatios`], all re-exported at the crate root.
32pub mod splits;
34
/// The `tokenizer` module.
// NOTE(review): nothing from `tokenizer` is re-exported in this file; purpose inferred
// from the name only — confirm against the module itself.
35pub mod tokenizer;
/// Shared type definitions; declares [`CategoryId`], [`HashPart`], [`KvpValue`],
/// [`LogMessage`], [`MetaValue`], [`PathString`], [`RecipeKey`], [`RecordId`], [`Sentence`],
/// [`SourceId`] and [`TaxonomyValue`], all re-exported at the crate root.
37pub mod types;
/// The `utils` module.
// NOTE(review): nothing from `utils` is re-exported in this file; presumably miscellaneous
// helpers — confirm against the module itself.
39pub mod utils;
41
// Private module; its `SamplerError` type is the only item surfaced publicly, via the
// crate-root re-export `pub use errors::SamplerError`.
42mod errors;
43
44pub use chunking::{ChunkingAlgorithm, SlidingWindowChunker};
45pub use config::{
46 ChunkingStrategy, DenoiserConfig, NegativeStrategy, SamplerConfig, Selector, TextRecipe,
47 TripletRecipe,
48};
49pub use data::{
50 DataRecord, PairLabel, QualityScore, RecordChunk, SampleBatch, SamplePair, SampleTriplet,
51 SectionRole, TextBatch, TextSample, TripletBatch,
52};
53pub use errors::SamplerError;
54pub use hash::stable_hash_str;
55pub use ingestion::{IngestionManager, RecordCache};
56pub use kvp::{KvpField, KvpPrefixSampler};
57pub use preprocessor::TextPreprocessor;
58pub use preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor;
59pub use sampler::{BatchPrefetcher, Sampler, TripletSampler};
60pub use source::InMemorySource;
61pub use source::backends::csv_source::{CsvSource, CsvSourceConfig};
62pub use source::{DataSource, SourceCursor};
63pub use splits::{DeterministicSplitStore, FileSplitStore, SplitLabel, SplitRatios, SplitStore};
64pub use types::{
65 CategoryId, HashPart, KvpValue, LogMessage, MetaValue, PathString, RecipeKey, RecordId,
66 Sentence, SourceId, TaxonomyValue,
67};