Skip to main content

triplets_core/
lib.rs

1#![doc = include_str!("../README.md")]
2#![warn(missing_docs)]
3
4/// Pluggable chunking algorithms and default sliding-window implementation.
5pub mod chunking;
6/// Sampling configuration types.
7pub mod config;
8/// Centralized constants used across sampler, splits, and sources.
9pub mod constants;
10/// Data record and sample batch types.
11pub mod data;
12mod epoch;
13
14/// Stable deterministic hashing utilities.
15pub mod hash;
16/// Capacity and sampling estimation helpers.
17pub mod heuristics;
18/// Background ingestion and caching infrastructure.
19pub mod ingestion;
20/// Key/value prefix sampling helpers.
21pub mod kvp;
22/// Metadata keys and helpers.
23pub mod metadata;
24/// Aggregate metrics helpers.
25pub mod metrics;
26/// OCR denoising and markdown-table cleanup for text chunks.
27pub mod preprocessor;
28/// Sampler implementations and public sampling API.
29pub mod sampler;
30/// Data source traits and built-in sources.
31pub mod source;
32/// Split stores and persistence helpers.
33pub mod splits;
34
35/// Structural text tokenizer trait and whitespace implementation.
36pub mod tokenizer;
37/// Shared type aliases.
38pub mod types;
39/// Text normalization helpers.
40pub mod utils;
41
42mod errors;
43
44pub use chunking::{ChunkingAlgorithm, SlidingWindowChunker};
45pub use config::{
46    ChunkingStrategy, DenoiserConfig, NegativeStrategy, SamplerConfig, Selector, TextRecipe,
47    TripletRecipe,
48};
49pub use data::{
50    DataRecord, PairLabel, QualityScore, RecordChunk, SampleBatch, SamplePair, SampleTriplet,
51    SectionRole, TextBatch, TextSample, TripletBatch,
52};
53pub use errors::SamplerError;
54pub use hash::stable_hash_str;
55pub use ingestion::{IngestionManager, RecordCache};
56pub use kvp::{KvpField, KvpPrefixSampler};
57pub use preprocessor::TextPreprocessor;
58pub use preprocessor::backends::denoiser_preprocessor::DenoiserPreprocessor;
59pub use sampler::{BatchPrefetcher, Sampler, TripletSampler};
60pub use source::InMemorySource;
61pub use source::backends::csv_source::{CsvSource, CsvSourceConfig};
62pub use source::{DataSource, SourceCursor};
63pub use splits::{DeterministicSplitStore, FileSplitStore, SplitLabel, SplitRatios, SplitStore};
64pub use types::{
65    CategoryId, HashPart, KvpValue, LogMessage, MetaValue, PathString, RecipeKey, RecordId,
66    Sentence, SourceId, TaxonomyValue,
67};