Skip to main content

base_d/
lib.rs

1//! # base-d
2//!
3//! A universal, multi-dictionary encoding library for Rust.
4//!
5//! Encode binary data using numerous dictionaries including RFC standards, ancient scripts,
6//! emoji, playing cards, and more. Supports three encoding modes: radix (true base
7//! conversion), RFC 4648 chunked encoding, and direct byte-range mapping.
8//!
9//! ## Quick Start
10//!
11//! ```
12//! use base_d::{DictionaryRegistry, Dictionary, encode, decode};
13//!
14//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
15//! // Load built-in dictionaries
16//! let config = DictionaryRegistry::load_default()?;
17//! let base64_config = config.get_dictionary("base64").unwrap();
18//!
19//! // Create dictionary
20//! let chars: Vec<char> = base64_config.chars.chars().collect();
21//! let padding = base64_config.padding.as_ref().and_then(|s| s.chars().next());
22//! let mut builder = Dictionary::builder()
23//!     .chars(chars)
24//!     .mode(base64_config.effective_mode());
25//! if let Some(p) = padding {
26//!     builder = builder.padding(p);
27//! }
28//! let dictionary = builder.build()?;
29//!
30//! // Encode and decode
31//! let data = b"Hello, World!";
32//! let encoded = encode(data, &dictionary);
33//! let decoded = decode(&encoded, &dictionary)?;
34//! assert_eq!(data, &decoded[..]);
35//! # Ok(())
36//! # }
37//! ```
38//!
39//! ## Features
40//!
41//! - **33 Built-in Dictionaries**: RFC standards, emoji, ancient scripts, and more
42//! - **3 Encoding Modes**: Radix, chunked (RFC-compliant), byte-range
43//! - **Streaming Support**: Memory-efficient processing for large files
44//! - **Custom Dictionaries**: Define your own via TOML configuration
45//! - **User Configuration**: Load dictionaries from `~/.config/base-d/dictionaries.toml`
46//! - **SIMD Acceleration**: AVX2/SSSE3 on x86_64, NEON on aarch64 (enabled by default)
47//!
48//! ## Cargo Features
49//!
50//! - `simd` (default): Enable SIMD acceleration for encoding/decoding.
51//!   Disable with `--no-default-features` for scalar-only builds.
52//!
53//! ## Encoding Modes
54//!
55//! ### Radix Base Conversion
56//!
57//! True base conversion treating data as a large number. Works with any dictionary size.
58//!
59//! ```
60//! use base_d::{Dictionary, EncodingMode, encode};
61//!
62//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
63//! let chars: Vec<char> = "😀😁😂🤣😃😄😅😆".chars().collect();
64//! let dictionary = Dictionary::builder()
65//!     .chars(chars)
66//!     .mode(EncodingMode::Radix)
67//!     .build()?;
68//!
69//! let encoded = encode(b"Hi", &dictionary);
70//! # Ok(())
71//! # }
72//! ```
73//!
74//! ### Chunked Mode (RFC 4648)
75//!
76//! Fixed-size bit groups, compatible with standard base64/base32.
77//!
78//! ```
79//! use base_d::{Dictionary, EncodingMode, encode};
80//!
81//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
82//! let chars: Vec<char> = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
83//!     .chars().collect();
84//! let dictionary = Dictionary::builder()
85//!     .chars(chars)
86//!     .mode(EncodingMode::Chunked)
87//!     .padding('=')
88//!     .build()?;
89//!
90//! let encoded = encode(b"Hello", &dictionary);
91//! assert_eq!(encoded, "SGVsbG8=");
92//! # Ok(())
93//! # }
94//! ```
95//!
96//! ### Byte Range Mode
97//!
98//! Direct 1:1 byte-to-emoji mapping. Zero encoding overhead.
99//!
100//! ```
101//! use base_d::{Dictionary, EncodingMode, encode};
102//!
103//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
104//! let dictionary = Dictionary::builder()
105//!     .mode(EncodingMode::ByteRange)
106//!     .start_codepoint(127991)  // U+1F3F7
107//!     .build()?;
108//!
109//! let data = b"Hi";
110//! let encoded = encode(data, &dictionary);
111//! assert_eq!(encoded.chars().count(), 2);  // 1:1 mapping
112//! # Ok(())
113//! # }
114//! ```
115//!
116//! ## Streaming
117//!
118//! For large files, use streaming to avoid loading the entire file into memory:
119//!
120//! ```no_run
121//! use base_d::{DictionaryRegistry, StreamingEncoder};
122//! use std::fs::File;
123//!
124//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
125//! let config = DictionaryRegistry::load_default()?;
126//! let dictionary_config = config.get_dictionary("base64").unwrap();
127//!
128//! // ... create dictionary from config
129//! # let chars: Vec<char> = dictionary_config.chars.chars().collect();
130//! # let padding = dictionary_config.padding.as_ref().and_then(|s| s.chars().next());
131//! # let mut builder = base_d::Dictionary::builder().chars(chars).mode(dictionary_config.effective_mode());
132//! # if let Some(p) = padding { builder = builder.padding(p); }
133//! # let dictionary = builder.build()?;
134//!
135//! let mut input = File::open("large_file.bin")?;
136//! let output = File::create("encoded.txt")?;
137//!
138//! let mut encoder = StreamingEncoder::new(&dictionary, output);
139//! encoder.encode(&mut input)?;
140//! # Ok(())
141//! # }
142//! ```
143
144mod core;
145mod encoders;
146mod features;
147
148#[cfg(feature = "simd")]
149mod simd;
150
151#[cfg(feature = "wasm")]
152pub mod wasm;
153
154pub mod bench;
155pub mod convenience;
156pub mod prelude;
157pub mod wordlists;
158
159pub use convenience::{
160    CompressEncodeResult, HashEncodeResult, compress_encode, compress_encode_with, hash_encode,
161    hash_encode_with,
162};
163pub use core::alternating_dictionary::AlternatingWordDictionary;
164pub use core::config::{
165    CompressionConfig, DictionaryConfig, DictionaryRegistry, DictionaryType, EncodingMode, Settings,
166};
167pub use core::dictionary::is_safe_byte_range;
168pub use core::dictionary::{Dictionary, DictionaryBuilder};
169pub use core::word_dictionary::{WordDictionary, WordDictionaryBuilder};
170pub use encoders::algorithms::{
171    DecodeError, DictionaryNotFoundError, EncodeError, find_closest_dictionary,
172};
173
174/// Word-based encoding using radix conversion.
175///
176/// Same mathematical approach as character-based radix encoding,
177/// but outputs words joined by a delimiter instead of concatenated characters.
178pub mod word {
179    pub use crate::encoders::algorithms::word::{decode, encode};
180}
181
182/// Alternating word-based encoding for PGP-style biometric word lists.
183///
184/// Provides direct 1:1 byte-to-word mapping where the dictionary selection
185/// alternates based on byte position (e.g., even/odd bytes use different dictionaries).
186pub mod word_alternating {
187    pub use crate::encoders::algorithms::word_alternating::{decode, encode};
188}
189pub use encoders::streaming::{StreamingDecoder, StreamingEncoder};
190
191// Expose schema encoding functions for CLI
192pub use encoders::algorithms::schema::{
193    SchemaCompressionAlgo, decode_schema, decode_stele, decode_stele_path, encode_markdown_stele,
194    encode_markdown_stele_ascii, encode_markdown_stele_light, encode_markdown_stele_markdown,
195    encode_markdown_stele_readable, encode_schema, encode_stele, encode_stele_ascii,
196    encode_stele_light, encode_stele_minified, encode_stele_path, encode_stele_readable,
197};
198
199// Expose stele auto-detection
200pub use encoders::algorithms::schema::stele_analyzer::{DetectedMode, detect_stele_mode};
201
202/// Schema encoding types and traits for building custom frontends
203///
204/// This module provides the intermediate representation (IR) layer for schema encoding,
205/// allowing library users to implement custom parsers (YAML, CSV, TOML, etc.) and
206/// serializers that leverage the binary encoding backend.
207///
208/// # Architecture
209///
210/// The schema encoding pipeline has three layers:
211///
212/// 1. **Input layer**: Parse custom formats into IR
213///    - Implement `InputParser` trait
214///    - Reference: `JsonParser`
215///
216/// 2. **Binary layer**: Pack/unpack IR to/from binary
217///    - `pack()` - IR to binary bytes
218///    - `unpack()` - Binary bytes to IR
219///    - `encode_framed()` - Binary to display96 with delimiters
220///    - `decode_framed()` - Display96 to binary
221///
222/// 3. **Output layer**: Serialize IR to custom formats
223///    - Implement `OutputSerializer` trait
224///    - Reference: `JsonSerializer`
225///
226/// # Example: Custom CSV Parser
227///
228/// ```ignore
229/// use base_d::schema::{
230///     InputParser, IntermediateRepresentation, SchemaHeader, FieldDef,
231///     FieldType, SchemaValue, SchemaError, pack, encode_framed,
232/// };
233///
234/// struct CsvParser;
235///
236/// impl InputParser for CsvParser {
237///     type Error = SchemaError;
238///
239///     fn parse(input: &str) -> Result<IntermediateRepresentation, Self::Error> {
240///         // Parse CSV headers
241///         let lines: Vec<&str> = input.lines().collect();
242///         let headers: Vec<&str> = lines[0].split(',').collect();
243///
244///         // Infer types and build fields
245///         let fields: Vec<FieldDef> = headers.iter()
246///             .map(|h| FieldDef::new(h.to_string(), FieldType::String))
247///             .collect();
248///
249///         // Parse rows
250///         let row_count = lines.len() - 1;
251///         let mut values = Vec::new();
252///         for line in &lines[1..] {
253///             for cell in line.split(',') {
254///                 values.push(SchemaValue::String(cell.to_string()));
255///             }
256///         }
257///
258///         let header = SchemaHeader::new(row_count, fields);
259///         IntermediateRepresentation::new(header, values)
260///     }
261/// }
262///
263/// // Encode CSV to schema format
264/// let csv = "name,age\nalice,30\nbob,25";
265/// let ir = CsvParser::parse(csv)?;
266/// let binary = pack(&ir);
267/// let encoded = encode_framed(&binary);
268/// ```
269///
270/// # IR Structure
271///
272/// The `IntermediateRepresentation` consists of:
273///
274/// * **Header**: Schema metadata
275///   - Field definitions (name + type)
276///   - Row count
277///   - Optional root key
278///   - Optional null bitmap
279///
280/// * **Values**: Flat array in row-major order
281///   - `[row0_field0, row0_field1, row1_field0, row1_field1, ...]`
282///
283/// # Type System
284///
285/// Supported field types:
286///
287/// * `U64` - Unsigned 64-bit integer
288/// * `I64` - Signed 64-bit integer
289/// * `F64` - 64-bit floating point
290/// * `String` - UTF-8 string
291/// * `Bool` - Boolean
292/// * `Null` - Null value
293/// * `Array(T)` - Homogeneous array of type T
294/// * `Any` - Mixed-type values
295///
296/// # Compression
297///
298/// Optional compression algorithms:
299///
300/// * `SchemaCompressionAlgo::Brotli` - Best ratio
301/// * `SchemaCompressionAlgo::Lz4` - Fastest
302/// * `SchemaCompressionAlgo::Zstd` - Balanced
303///
304/// # See Also
305///
306/// * [SCHEMA.md](../SCHEMA.md) - Full format specification
307/// * `encode_schema()` / `decode_schema()` - High-level JSON functions
308pub mod schema {
309    pub use crate::encoders::algorithms::schema::{
310        // IR types
311        FieldDef,
312        FieldType,
313        // Traits
314        InputParser,
315        IntermediateRepresentation,
316        // Reference implementations
317        JsonParser,
318        JsonSerializer,
319        OutputSerializer,
320        // Compression
321        SchemaCompressionAlgo,
322        // Errors
323        SchemaError,
324        SchemaHeader,
325        SchemaValue,
326        // Binary layer
327        decode_framed,
328        // High-level API
329        decode_schema,
330        encode_framed,
331        encode_schema,
332        pack,
333        unpack,
334    };
335}
336pub use features::{
337    CompressionAlgorithm, DictionaryDetector, DictionaryMatch, HashAlgorithm, XxHashConfig,
338    compress, decompress, detect_dictionary, hash, hash_with_config,
339};
340
341/// Encodes binary data using the specified dictionary.
342///
343/// Automatically selects the appropriate encoding strategy based on the
344/// dictionary's mode (Radix, Chunked, or ByteRange).
345///
346/// # Arguments
347///
348/// * `data` - The binary data to encode
349/// * `dictionary` - The dictionary to use for encoding
350///
351/// # Returns
352///
353/// A string containing the encoded data
354///
355/// # Examples
356///
357/// ```
358/// use base_d::{Dictionary, EncodingMode};
359///
360/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
361/// let chars: Vec<char> = "01".chars().collect();
362/// let dictionary = Dictionary::builder()
363///     .chars(chars)
364///     .mode(EncodingMode::Radix)
365///     .build()?;
366/// let encoded = base_d::encode(b"Hi", &dictionary);
367/// # Ok(())
368/// # }
369/// ```
370pub fn encode(data: &[u8], dictionary: &Dictionary) -> String {
371    match dictionary.mode() {
372        EncodingMode::Radix => encoders::algorithms::radix::encode(data, dictionary),
373        EncodingMode::Chunked => encoders::algorithms::chunked::encode_chunked(data, dictionary),
374        EncodingMode::ByteRange => encoders::algorithms::byte_range::encode_byte_range(
375            data, dictionary,
376        )
377        .expect(
378            "ByteRange encode failed: dictionary should have been validated at construction time",
379        ),
380    }
381}
382
383/// Decodes a string back to binary data using the specified dictionary.
384///
385/// Automatically selects the appropriate decoding strategy based on the
386/// dictionary's mode (Radix, Chunked, or ByteRange).
387///
388/// # Arguments
389///
390/// * `encoded` - The encoded string to decode
391/// * `dictionary` - The dictionary used for encoding
392///
393/// # Returns
394///
395/// A `Result` containing the decoded binary data, or a `DecodeError` if
396/// the input is invalid
397///
398/// # Errors
399///
400/// Returns `DecodeError` if:
401/// - The input contains invalid characters
402/// - The input is empty
403/// - The padding is invalid (for chunked mode)
404///
405/// # Examples
406///
407/// ```
408/// use base_d::{Dictionary, EncodingMode, encode, decode};
409///
410/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
411/// let chars: Vec<char> = "01".chars().collect();
412/// let dictionary = Dictionary::builder()
413///     .chars(chars)
414///     .mode(EncodingMode::Radix)
415///     .build()?;
416/// let data = b"Hi";
417/// let encoded = encode(data, &dictionary);
418/// let decoded = decode(&encoded, &dictionary)?;
419/// assert_eq!(data, &decoded[..]);
420/// # Ok(())
421/// # }
422/// ```
423pub fn decode(encoded: &str, dictionary: &Dictionary) -> Result<Vec<u8>, DecodeError> {
424    match dictionary.mode() {
425        EncodingMode::Radix => encoders::algorithms::radix::decode(encoded, dictionary),
426        EncodingMode::Chunked => encoders::algorithms::chunked::decode_chunked(encoded, dictionary),
427        EncodingMode::ByteRange => {
428            encoders::algorithms::byte_range::decode_byte_range(encoded, dictionary)
429        }
430    }
431}
432
433#[cfg(test)]
434mod tests;