Skip to main content

alimentar/
lib.rs

1//! alimentar - Data Loading, Distribution and Tooling in Pure Rust
2//!
3//! A sovereign-first data loading library for the paiml AI stack.
4//! Provides HuggingFace-compatible functionality without a mandatory cloud
5//! dependency.
6//!
7//! # Design Principles
8//!
9//! 1. **Sovereign-first** - Local storage default, no mandatory cloud
10//!    dependency
11//! 2. **Pure Rust** - No Python, no FFI (WASM-compatible)
12//! 3. **Zero-copy** - Arrow `RecordBatch` throughout
13//! 4. **Ecosystem aligned** - Arrow 53, Parquet 53
14//!
15//! # Quick Start
16//!
17//! ```no_run
18//! use alimentar::{ArrowDataset, DataLoader};
19//!
20//! // Load a parquet file
21//! let dataset = ArrowDataset::from_parquet("data/train.parquet").unwrap();
22//!
23//! // Create a data loader
24//! let loader = DataLoader::new(dataset).batch_size(32).shuffle(true);
25//!
26//! // Iterate over batches
27//! for batch in loader {
28//!     println!("Batch with {} rows", batch.num_rows());
29//! }
30//! ```
// Crate-wide lint configuration.
//
// `unsafe_code` uses the `deny` level rather than `forbid`: `deny` can still be
// overridden by a narrower `#[allow(unsafe_code)]`, whereas `forbid` could not.
31// unsafe_code is denied crate-wide except where explicitly allowed (e.g., mmap module)
32#![deny(unsafe_code)]
// Missing documentation on a public item is a hard error, not a warning.
33#![deny(missing_docs)]
// Clippy lints silenced for the whole crate.
// NOTE(review): the rationale for each `allow` below is not recorded in this
// file — confirm each is still needed, and prefer scoping an allow to the
// module that requires it where practical.
34#![allow(clippy::unwrap_used)]
35#![allow(clippy::expect_used)]
// Numeric cast lints.
36#![allow(clippy::cast_precision_loss)]
37#![allow(clippy::cast_sign_loss)]
38#![allow(clippy::cast_possible_truncation)]
39#![allow(clippy::cast_possible_wrap)]
40#![allow(clippy::cast_lossless)]
41#![allow(clippy::approx_constant)]
42#![allow(clippy::len_zero)]
43#![allow(clippy::redundant_closure)]
44#![allow(clippy::redundant_clone)]
45#![allow(clippy::float_cmp)]
46#![allow(clippy::unreadable_literal)]
47#![allow(clippy::needless_collect)]
48#![allow(clippy::too_many_lines)]
49#![allow(clippy::bool_to_int_with_if)]
50#![allow(clippy::similar_names)]
51#![allow(clippy::doc_markdown)]
52#![allow(clippy::uninlined_format_args)]
53#![allow(clippy::redundant_closure_for_method_calls)]
54#![allow(clippy::map_unwrap_or)]
55#![allow(clippy::useless_conversion)]
56#![allow(clippy::iter_on_single_items)]
57#![allow(clippy::suboptimal_flops)]
58#![allow(clippy::cloned_ref_to_slice_refs)]
// Private module whose macros are exported to the rest of the crate.
// `#[macro_use]` scoping is textual: the macros are visible only to code that
// appears after this declaration, so it must precede every module that
// invokes them. Keep it first.
// NOTE(review): the name suggests generated code — confirm before hand-editing.
59#[macro_use]
// Macros that are defined but never invoked would otherwise trigger a warning.
60#[allow(unused_macros)]
61mod generated_contracts;
// Public modules, in alphabetical order. A module guarded by
// `#[cfg(feature = "...")]` exists only when that Cargo feature is enabled.
// Only `cli` and `tui` carry a `///` doc line here; with `missing_docs` denied,
// the others presumably document themselves via `//!` in their own files —
// TODO confirm.
62#[cfg(feature = "tokio-runtime")]
63pub mod async_prefetch;
64pub mod backend;
65/// CLI module for command-line interface
66#[cfg(feature = "cli")]
67pub mod cli;
68pub mod dataloader;
69pub mod dataset;
70pub mod datasets;
71#[cfg(feature = "doctest")]
72pub mod doctest;
73pub mod drift;
74pub mod error;
75pub mod federated;
76pub mod format;
77#[cfg(feature = "hf-hub")]
78pub mod hf_hub;
79pub mod imbalance;
80#[cfg(feature = "mmap")]
81pub mod mmap;
82pub mod parallel;
83pub mod quality;
84pub mod registry;
85#[cfg(feature = "repl")]
86pub mod repl;
87pub mod serve;
88pub mod sketch;
89pub mod split;
90pub mod streaming;
91pub mod tensor;
92pub mod transform;
93/// TUI dataset viewer module
94pub mod tui;
// Gated on `shuffle` rather than a feature of its own.
// NOTE(review): presumably it shares the randomness dependency that `shuffle`
// pulls in — confirm against Cargo.toml.
95#[cfg(feature = "shuffle")]
96pub mod weighted;
97// Re-exports for convenience
98// Re-export arrow types commonly needed
99pub use arrow::{
100    array::RecordBatch,
101    datatypes::{Schema, SchemaRef},
102};
103#[cfg(feature = "tokio-runtime")]
104pub use async_prefetch::{AsyncPrefetchBuilder, AsyncPrefetchDataset, SyncPrefetchDataset};
105pub use dataloader::DataLoader;
106pub use dataset::{ArrowDataset, CsvOptions, Dataset, JsonOptions};
107#[cfg(feature = "doctest")]
108pub use doctest::{DocTest, DocTestCorpus, DocTestParser};
109pub use drift::{ColumnDrift, DriftDetector, DriftReport, DriftSeverity, DriftTest};
110pub use error::{Error, Result};
111pub use federated::{
112    FederatedSplitCoordinator, FederatedSplitStrategy, GlobalSplitReport, NodeSplitInstruction,
113    NodeSplitManifest, NodeSummary, SplitQualityIssue,
114};
115#[cfg(feature = "shuffle")]
116pub use imbalance::resample;
117pub use imbalance::{
118    sqrt_inverse_weights, ClassDistribution, ImbalanceDetector, ImbalanceMetrics,
119    ImbalanceRecommendation, ImbalanceReport, ImbalanceSeverity, ResampleStrategy,
120};
121#[cfg(feature = "mmap")]
122pub use mmap::{MmapDataset, MmapDatasetBuilder};
123pub use parallel::{ParallelDataLoader, ParallelDataLoaderBuilder};
124pub use quality::{
125    ColumnQuality, QualityChecker, QualityIssue, QualityProfile, QualityReport, TextColumnStats,
126};
127pub use sketch::{
128    Centroid, DDSketch, DataSketch, DistributedDriftDetector, SketchDriftResult, SketchType,
129    TDigest,
130};
131pub use split::DatasetSplit;
132pub use transform::{
133    Cast, Chain, Drop, FillNull, FillStrategy, Filter, Map, NormMethod, Normalize, Rename, Select,
134    Skip, Sort, SortOrder, Take, Transform, Unique,
135};
136#[cfg(feature = "shuffle")]
137pub use transform::{Fim, FimFormat, FimTokens, Sample, Shuffle};
138pub use tui::{DatasetAdapter, DatasetViewer, RowDetailView, SchemaInspector, TuiError, TuiResult};
139#[cfg(feature = "shuffle")]
140pub use weighted::WeightedDataLoader;