sym_adv_encoding/
utf8.rs

1//! Utilities for encoding bytes/UTF-8 text into ring elements.
2//!
3//! This module provides two related but distinct encodings for turning a byte
4//! string into a sequence of `RingElement`s with a given `modulus`:
5//!
6//! - Bit-packing (`encode_bytes` / `decode_bytes`):
7//!   Packs the input bitstream into chunks of b = floor(log2(modulus)) bits.
8//!   Every produced element satisfies value < 2^b <= modulus, so some residues
9//!   are intentionally unused when the modulus is not a power of two.
10//!   This format does not include the original byte length.
11//!
12//! - Length-delimited base-m transduction
13//!   (`encode_bytes_base_m_len` / `decode_bytes_base_m_len`):
14//!   Encodes the byte length and a fixed rANS state as base-modulus digits,
15//!   then converts the byte stream into base-modulus payload digits using a
16//!   uniform rANS transducer. Every produced digit satisfies digit < modulus,
17//!   so it uses all residues. Decoding is length-delimited and ignores trailing
18//!   elements.
19//!
20//! ## Target behavior (base-m-len variant)
21//!
22//! - Uses all residues: every emitted `RingElement` satisfies value < modulus,
23//!   with no "[0, 2^b)" restriction.
24//! - Near-linear time: amortized O(1) work per input byte, plus a constant-size
25//!   header (2 * k digits for a fixed k determined by `modulus`).
26//! - Same public API: the public `encode_bytes_base_m_len` /
27//!   `decode_bytes_base_m_len` signatures and
28//!   `encode_text_base_m_len` / `decode_text_base_m_len` behavior are unchanged.
29//! - Padding tolerance: appending extra elements at the end (for example, block
30//!   padding) does not affect decoding; the decoder stops after emitting the
31//!   byte count from the length prefix.
32//!
33//! ## Design: uniform rANS as a radix transducer
34//!
35//! Treat each input byte as a uniform symbol (frequency = 1 over 256 symbols).
36//! The symbol update is:
37//!
38//! - encode: x = x * 256 + byte
39//! - decode: byte = x % 256; x = x / 256
40//!
41//! Renormalization uses base = modulus, so every emitted digit is a valid ring
42//! residue (digit < modulus). The stream stores a fixed-size initial state
43//! immediately after the length prefix so decoding can proceed FIFO
44//! (left-to-right) and ignore any trailing padding.
45//!
46//! ## Future work
47//!
48//! Investigate non-uniform rANS as a radix transducer to incorporate empirical
49//! byte distributions while retaining the base-modulus digit stream interface.
50//!
51//! The bitstream in `encode_bytes` is packed little-endian: earlier bytes
52//! occupy lower bits of the internal buffer and are emitted first.
53//!
54//! Important limitations (bit-packing variant):
55//!
56//! - The encoding does not include the original byte length. For some parameter
57//!   choices (notably when b > 8), decoding may yield extra trailing 0x00 bytes
58//!   that come from zero-padding the final partial chunk. If you need exact
59//!   round-trips, store the original length separately and truncate after
60//!   decoding.
61//! - Decoding assumes elements are in the canonical range produced by this
62//!   module (each element value fits in b bits). If you modify elements via
63//!   ring arithmetic, decoding will generally not recover the original bytes.
64//!
65//! # References (ANS / rANS)
66//!
67//! - J. Duda, "Asymmetric numeral systems: entropy coding combining speed of
68//!   Huffman coding with compression rate of arithmetic coding",
69//!   <https://arxiv.org/abs/1311.2540>
70//! - F. Giesen, "rANS notes":
71//!   <https://fgiesen.wordpress.com/2014/02/02/rans-notes/>
72//! - F. Giesen, "rANS with static probability distributions":
73//!   <https://fgiesen.wordpress.com/2014/02/18/rans-with-static-probability-distributions/>
74//! - Y. Collet, "Finite State Entropy, a new breed of entropy coders" (FSE/tANS):
75//!   <https://fastcompression.blogspot.com/2013/12/finite-state-entropy-new-breed-of.html>
76
77use sym_adv_ring::RingElement;
78
79#[derive(Clone, Copy, Debug, PartialEq, Eq)]
80pub enum Utf8EncodingType {
81    LengthDelimitedBaseMTransduction,
82    BitPacking,
83}
84
85enum Utf8EncodingTypeInner {
86    LengthDelimitedBaseMTransduction {
87        length_prefix_digits: usize,
88        decoder_lower_bound: u64,
89        encoder_threshold: u64,
90    },
91    BitPacking,
92}
93
94pub struct Utf8Encoding {
95    modulus: u64,
96    encoding_type: Utf8EncodingTypeInner,
97}
98
99impl Utf8Encoding {
100    /// Builds an encoding configuration for the provided `modulus`.
101    ///
102    /// # Errors
103    ///
104    /// - Returns [`EncodingError::ModulusTooSmall`] if `modulus < 2`.
105    /// - Returns [`EncodingError::DecodingError`] when base-m rANS parameters are
106    ///   invalid for the provided `modulus`.
107    pub fn try_from(modulus: u64, encoding_type: Utf8EncodingType) -> Result<Self, EncodingError> {
108        if modulus < 2 {
109            return Err(EncodingError::ModulusTooSmall);
110        }
111
112        Ok(Self {
113            modulus,
114            encoding_type: Self::build_inner_encoding_type(modulus, encoding_type)?,
115        })
116    }
117
118    fn build_inner_encoding_type(
119        modulus: u64,
120        encoding_type: Utf8EncodingType,
121    ) -> Result<Utf8EncodingTypeInner, EncodingError> {
122        match encoding_type {
123            Utf8EncodingType::LengthDelimitedBaseMTransduction => {
124                let (decoder_lower_bound, encoder_threshold) = Self::rans_params(modulus)?;
125
126                Ok(Utf8EncodingTypeInner::LengthDelimitedBaseMTransduction {
127                    length_prefix_digits: Self::length_prefix_digits(modulus),
128                    decoder_lower_bound,
129                    encoder_threshold,
130                })
131            }
132            Utf8EncodingType::BitPacking => Ok(Utf8EncodingTypeInner::BitPacking),
133        }
134    }
135}
136
137impl Utf8Encoding {
138    /// Returns the fixed base-`modulus` digit count used for u64 prefixes.
139    ///
140    /// This computes the smallest `k` such that `modulus^k >= 2^64`. The
141    /// length-delimited `*_base_m_len` format encodes both the byte length and the
142    /// rANS state using exactly `k` base-`modulus` digits, which makes the stream
143    /// layout self-delimiting (`length || state || payload`) without separators.
144    ///
145    /// Digits are interpreted little-endian (least significant digit first).
146    ///
147    /// # Math
148    ///
149    /// Let m = modulus (with m >= 2). We choose the minimal digit count k >= 0 such
150    /// that every 64-bit value fits into k base-m digits:
151    ///
152    ///   k = min { k in N0 : m^k >= 2^64 }.
153    ///
154    /// Equivalently:
155    ///
156    ///   k = ceil( `log_m(2^64)` )
157    ///     = ceil( log(2^64) / log(m) ).
158    ///
159    /// This guarantees that any x in [0, 2^64 - 1] can be represented using exactly
160    /// k base-m digits (padding with leading zero digits as needed).
161    ///
162    /// # Complexity
163    ///
164    /// Runs in O(k) time and O(1) extra space, where k is the returned digit count
165    /// (the number of times we multiply by `modulus` until reaching 2^64).
166    ///
167    /// In terms of `modulus = m` (m >= 2), k = `ceil(log_m(2^64))`, so the loop runs
168    /// about ceil(64 / log2(m)) iterations (worst case at m = 2 gives k = 64).
169    fn length_prefix_digits(modulus: u64) -> usize {
170        let target: u128 = 1u128 << 64;
171        let base: u128 = u128::from(modulus);
172        let mut pow: u128 = 1;
173        let mut k: usize = 0;
174        while pow < target {
175            pow = pow.saturating_mul(base);
176            k += 1;
177        }
178
179        k
180    }
181
182    /// Chooses renormalization parameters for the uniform (256-symbol) rANS transducer.
183    ///
184    /// The `*_base_m_len` encoding treats bytes as symbols in a uniform 256-ary
185    /// alphabet and produces payload digits in radix `modulus` (so every digit is
186    /// `< modulus`). The internal state `x` is maintained in a `u64`.
187    ///
188    /// This function returns `(L, x_max)`:
189    /// - `L` is a decoder lower bound (chosen as a multiple of 256),
190    /// - `x_max = (L / 256) * modulus` is an encoder threshold.
191    ///
192    /// The encoder emits base-`modulus` digits while `x >= x_max` to ensure the
193    /// update `x = x * 256 + byte` cannot overflow `u64`. The decoder performs the
194    /// inverse operation by consuming digits while `x < L`.
195    ///
196    /// # Errors
197    ///
198    /// - Returns [`EncodingError::DecodingError`] if `modulus` is so large that no
199    ///   valid `L >= 256` can be chosen while keeping the state in `u64`.
200    fn rans_params(modulus: u64) -> Result<(u64, u64), EncodingError> {
201        let max_l = u64::MAX / modulus;
202        let l = (max_l / 256) * 256;
203        if l < 256 {
204            return Err(EncodingError::DecodingError(
205                "Modulus too large for base-m-len encoding".to_string(),
206            ));
207        }
208
209        let x_max_u128 = u128::from(l / 256) * u128::from(modulus);
210        if x_max_u128 > u128::from(u64::MAX) {
211            return Err(EncodingError::DecodingError(
212                "Invalid rANS parameters".to_string(),
213            ));
214        }
215
216        let x_max = u64::try_from(x_max_u128)
217            .map_err(|_| EncodingError::DecodingError("Invalid rANS parameters".to_string()))?;
218        Ok((l, x_max))
219    }
220
221    /// Encodes a `u64` as exactly `k` base-`modulus` digits (little-endian).
222    ///
223    /// Here `k = length_prefix_digits(modulus)`, so every `u64` fits. The returned
224    /// vector always has length `k`; most-significant digits are `0` when `value` is
225    /// small.
226    fn encode_u64_base_m_fixed(&self, value: u64) -> Vec<u64> {
227        let Utf8EncodingTypeInner::LengthDelimitedBaseMTransduction {
228            length_prefix_digits,
229            ..
230        } = self.encoding_type
231        else {
232            unreachable!()
233        };
234
235        let mut digits = Vec::with_capacity(length_prefix_digits);
236        let mut v = value;
237        for _ in 0..length_prefix_digits {
238            digits.push(v % self.modulus);
239            v /= self.modulus;
240        }
241
242        digits
243    }
244
245    /// Decodes a fixed-width base-`modulus` digit sequence (little-endian) into a `u64`.
246    ///
247    /// Each digit must satisfy `digit < modulus`. The digit slice is typically of
248    /// length `k = length_prefix_digits(modulus)`, as produced by
249    /// [`encode_u64_base_m_fixed`].
250    ///
251    /// # Errors
252    ///
253    /// - Returns [`EncodingError::DecodingError`] if any digit is out of range or if
254    ///   the decoded value does not fit in `u64`.
255    fn decode_u64_base_m_fixed(&self, digits: &[u64]) -> Result<u64, EncodingError> {
256        let mut value: u128 = 0;
257        let mut pow: u128 = 1;
258        let base: u128 = u128::from(self.modulus);
259        for &d in digits {
260            if d >= self.modulus {
261                return Err(EncodingError::DecodingError(
262                    "Invalid length prefix digit".to_string(),
263                ));
264            }
265            value += u128::from(d) * pow;
266            pow = pow.saturating_mul(base);
267        }
268
269        if value > u128::from(u64::MAX) {
270            return Err(EncodingError::DecodingError(
271                "Decoded length exceeds u64".to_string(),
272            ));
273        }
274
275        u64::try_from(value)
276            .map_err(|_| EncodingError::DecodingError("Decoded length exceeds u64".to_string()))
277    }
278
279    /// Uniform rANS encoder for bytes, emitting digits in radix `modulus`.
280    ///
281    /// Given `bytes`, this returns a pair `(state, stream_digits)` such that
282    /// [`rans_decode_uniform_bytes_base_m`] can reconstruct the original bytes when
283    /// provided with:
284    /// - the original `len = bytes.len()`,
285    /// - the returned `state` (encoded separately as fixed-width base-`modulus`
286    ///   digits),
287    /// - the returned `stream_digits` as the payload digit stream.
288    ///
289    /// Implementation details:
290    /// - Symbols are bytes with a *uniform* distribution, so the rANS update reduces
291    ///   to `x = x * 256 + byte`.
292    /// - Renormalization emits digits in radix `modulus`, ensuring every output
293    ///   digit is `< modulus` (i.e., all residues are available).
294    /// - Bytes are processed in reverse order, as is standard for streaming rANS.
295    ///
296    fn rans_encode_uniform_bytes_base_m(&self, bytes: &[u8]) -> (u64, Vec<u64>) {
297        let Utf8EncodingTypeInner::LengthDelimitedBaseMTransduction {
298            decoder_lower_bound,
299            encoder_threshold,
300            ..
301        } = self.encoding_type
302        else {
303            unreachable!()
304        };
305
306        let mut x = decoder_lower_bound;
307        let mut stream_digits = Vec::new();
308
309        for &b in bytes.iter().rev() {
310            while x >= encoder_threshold {
311                stream_digits.push(x % self.modulus);
312                x /= self.modulus;
313            }
314
315            x = x * 256 + u64::from(b);
316        }
317
318        stream_digits.reverse();
319        (x, stream_digits)
320    }
321
322    /// Uniform rANS decoder for bytes, consuming digits in radix `modulus`.
323    ///
324    /// This is the inverse of [`rans_encode_uniform_bytes_base_m`]. It reconstructs
325    /// exactly `len` bytes from an initial `state` and a digit stream `stream`.
326    ///
327    /// In the uniform 256-symbol case, symbol extraction corresponds to:
328    /// - decode: `byte = x % 256; x = x / 256`
329    ///
330    /// Only as many payload digits as necessary are consumed; any remaining trailing
331    /// elements in `stream` are ignored. This makes the overall `*_base_m_len`
332    /// decoding robust to zero-padding or garbage appended at the end.
333    ///
334    /// # Errors
335    ///
336    /// Returns [`EncodingError::DecodingError`] if:
337    /// - `state` is invalid for the chosen parameters,
338    /// - there are not enough payload digits to decode `len` bytes,
339    /// - any payload digit is out of range (`>= modulus`), or
340    /// - the reconstructed internal state would overflow `u64`.
341    fn rans_decode_uniform_bytes_base_m(
342        &self,
343        len: usize,
344        state: u64,
345        stream: &[RingElement],
346    ) -> Result<Vec<u8>, EncodingError> {
347        let Utf8EncodingTypeInner::LengthDelimitedBaseMTransduction {
348            decoder_lower_bound,
349            ..
350        } = self.encoding_type
351        else {
352            unreachable!()
353        };
354
355        if len == 0 {
356            return Ok(Vec::new());
357        }
358
359        if state < decoder_lower_bound {
360            return Err(EncodingError::DecodingError(
361                "Invalid rANS state".to_string(),
362            ));
363        }
364
365        let mut x = state;
366        let mut out = Vec::with_capacity(len);
367        let mut i = 0usize;
368
369        for _ in 0..len {
370            out.push(
371                u8::try_from(x & u64::from(u8::MAX))
372                    .expect("low 8 bits of the rANS state must fit in u8"),
373            );
374            x >>= 8;
375
376            while x < decoder_lower_bound {
377                let d = stream.get(i).ok_or_else(|| {
378                    EncodingError::DecodingError("Not enough payload digits".to_string())
379                })?;
380                let dv = d.value();
381                if dv >= self.modulus {
382                    return Err(EncodingError::DecodingError(
383                        "Invalid payload digit".to_string(),
384                    ));
385                }
386                let new_x = u128::from(x) * u128::from(self.modulus) + u128::from(dv);
387                if new_x > u128::from(u64::MAX) {
388                    return Err(EncodingError::DecodingError(
389                        "Invalid rANS state".to_string(),
390                    ));
391                }
392                x = u64::try_from(new_x)
393                    .map_err(|_| EncodingError::DecodingError("Invalid rANS state".to_string()))?;
394                i += 1;
395            }
396        }
397
398        Ok(out)
399    }
400
401    /// Encodes bytes into base-`modulus` digits with an embedded byte length.
402    ///
403    /// This encoding is **length-delimited** and **self-contained**: the output
404    /// embeds the original byte length (as a fixed-width base-`modulus` prefix) and
405    /// uses a uniform rANS transducer to convert the bytes into payload digits, all
406    /// strictly `< modulus`.
407    ///
408    /// # Target behavior
409    ///
410    /// - **Uses all residues**: digits are only required to satisfy `value < modulus`
411    ///   (there is no `"[0, 2^b)"` restriction as in [`encode_bytes`]).
412    /// - **Near-linear time**: amortized O(1) work per input byte, plus a constant-size
413    ///   header for the given modulus.
414    /// - **Padding tolerance**: trailing elements do not affect decoding because the
415    ///   decoded length is explicit.
416    /// - **Same public API**: this preserves the existing function signature and
417    ///   high-level behavior (but the on-wire representation is not backward compatible).
418    ///
419    /// # Wire format
420    ///
421    /// For `k = length_prefix_digits(modulus)`, the returned element sequence is:
422    ///
423    /// - `k` digits: `len` as a `u64` in base `modulus` (little-endian),
424    /// - `k` digits: `state` as a `u64` in base `modulus` (little-endian),
425    /// - `n` digits: payload stream digits produced by uniform rANS (each `< modulus`).
426    ///
427    /// The payload digit count `n` depends on `bytes.len()` and `modulus`.
428    ///
429    /// The rANS `state` is placed immediately after the length prefix so decoding can
430    /// proceed FIFO (left-to-right): read `len`, read `state`, then consume as many
431    /// payload digits as needed to emit exactly `len` bytes.
432    ///
433    /// # Decoding behavior
434    ///
435    /// [`decode_bytes_base_m_len`] uses the embedded `len` to decode exactly that
436    /// many bytes and ignores any trailing elements beyond what is required. This
437    /// makes the representation robust to padding or garbage appended at the end.
438    ///
439    /// # Complexity
440    ///
441    /// Near-linear in `bytes.len()` (amortized O(1) renormalization per byte),
442    /// avoiding quadratic base-conversion of large payloads.
443    ///
444    /// # Compatibility
445    ///
446    /// This is **not** compatible with older `*_base_m_len` encodings that used a
447    /// different payload conversion scheme.
448    ///
449    /// # Errors
450    ///
451    /// - Returns [`EncodingError::ModulusTooSmall`] if `modulus < 2`.
452    /// - Returns [`EncodingError::DecodingError`] if the byte length does not fit in
453    ///   `u64` or if valid rANS parameters cannot be chosen for the given modulus.
454    pub fn encode_bytes_base_m_len(&self, bytes: &[u8]) -> Result<Vec<RingElement>, EncodingError> {
455        let len_u64 = u64::try_from(bytes.len()).map_err(|_| {
456            EncodingError::DecodingError("Input length does not fit in u64".to_string())
457        })?;
458
459        let mut result: Vec<RingElement> = Vec::new();
460
461        let len_digits_le = self.encode_u64_base_m_fixed(len_u64);
462        result.extend(
463            len_digits_le
464                .into_iter()
465                .map(|d| RingElement::new(d, self.modulus)),
466        );
467
468        let (state, stream_digits) = self.rans_encode_uniform_bytes_base_m(bytes);
469
470        let state_digits_le = self.encode_u64_base_m_fixed(state);
471        result.extend(
472            state_digits_le
473                .into_iter()
474                .map(|d| RingElement::new(d, self.modulus)),
475        );
476
477        result.extend(
478            stream_digits
479                .into_iter()
480                .map(|d| RingElement::new(d, self.modulus)),
481        );
482
483        Ok(result)
484    }
485
486    /// Decodes bytes encoded by [`encode_bytes_base_m_len`].
487    ///
488    /// This parses the fixed-width base-`modulus` `len` prefix and `state`, then
489    /// uses the uniform rANS decoder to reconstruct exactly `len` bytes.
490    ///
491    /// Any trailing elements after the consumed payload are ignored, so appending
492    /// zero-padding or garbage does not change the decoded result.
493    ///
494    /// Decoding is FIFO: it reads the `len` prefix, then the fixed-width `state`, and
495    /// then consumes only as many payload digits as needed to emit `len` bytes.
496    ///
497    /// # Errors
498    ///
499    /// - Returns [`EncodingError::DecodingError`] if the stream is malformed (not
500    ///   enough prefix/payload digits, out-of-range digits, invalid state, or length
501    ///   that does not fit in `usize`).
502    pub fn decode_bytes_base_m_len(
503        &self,
504        elements: &[RingElement],
505    ) -> Result<Vec<u8>, EncodingError> {
506        let Utf8EncodingTypeInner::LengthDelimitedBaseMTransduction {
507            length_prefix_digits,
508            ..
509        } = self.encoding_type
510        else {
511            unreachable!()
512        };
513
514        if elements.len() < length_prefix_digits {
515            return Err(EncodingError::DecodingError(
516                "Not enough elements for length prefix".to_string(),
517            ));
518        }
519
520        let len_digits: Vec<u64> = elements[..length_prefix_digits]
521            .iter()
522            .map(RingElement::value)
523            .collect();
524        let len_u64 = self.decode_u64_base_m_fixed(&len_digits)?;
525        let len_usize = usize::try_from(len_u64).map_err(|_| {
526            EncodingError::DecodingError("Decoded length does not fit in usize".to_string())
527        })?;
528
529        if len_usize == 0 {
530            return Ok(Vec::new());
531        }
532
533        if elements.len() < 2 * length_prefix_digits {
534            return Err(EncodingError::DecodingError(
535                "Not enough elements for rANS state".to_string(),
536            ));
537        }
538
539        let state_digits: Vec<u64> = elements[length_prefix_digits..2 * length_prefix_digits]
540            .iter()
541            .map(RingElement::value)
542            .collect();
543        let state = self.decode_u64_base_m_fixed(&state_digits)?;
544
545        let payload_stream = &elements[2 * length_prefix_digits..];
546        self.rans_decode_uniform_bytes_base_m(len_usize, state, payload_stream)
547    }
548
549    /// Encodes a UTF-8 string using [`encode_bytes_base_m_len`].
550    ///
551    /// # Errors
552    ///
553    /// - Propagates any error returned by [`Self::encode_bytes_base_m_len`].
554    pub fn encode_text_base_m_len(&self, text: &str) -> Result<Vec<RingElement>, EncodingError> {
555        self.encode_bytes_base_m_len(text.as_bytes())
556    }
557
558    /// Decodes text encoded by [`encode_text_base_m_len`].
559    ///
560    /// This is [`decode_bytes_base_m_len`] followed by UTF-8 validation.
561    ///
562    /// # Errors
563    ///
564    /// - Returns [`EncodingError::DecodingError`] if the element stream is malformed
565    ///   or the decoded bytes are not valid UTF-8.
566    pub fn decode_text_base_m_len(
567        &self,
568        elements: &[RingElement],
569    ) -> Result<String, EncodingError> {
570        let bytes = self.decode_bytes_base_m_len(elements)?;
571        String::from_utf8(bytes).map_err(|e| EncodingError::DecodingError(e.to_string()))
572    }
573}
574
575impl Utf8Encoding {
576    /// Computes how many payload bits can be stored per element for a given modulus.
577    ///
578    /// Returns `b = floor(log2(modulus))`. With this choice,
579    /// every `b`-bit value is guaranteed to be `< modulus`, even when `modulus` is
580    /// not a power of two.
581    fn bits_per_element(&self) -> usize {
582        usize::try_from(self.modulus.ilog2()).expect("u32 bit width must fit in usize")
583    }
584
585    /// Encodes a byte slice into a sequence of [`RingElement`]s.
586    ///
587    /// The input bytes are treated as a little-endian bitstream and packed into
588    /// chunks of `b` bits, where `b = floor(log2(modulus))`. Each chunk becomes the
589    /// `value` of a `RingElement` with the provided `modulus`.
590    ///
591    /// This encoding uses only values in the range `[0, 2^b)`, which is always a
592    /// subset of the ring when `modulus` is not a power of two.
593    ///
594    /// # Notes
595    ///
596    /// - The output does not carry the original byte length. If the final chunk is
597    ///   only partially filled, it is emitted with implicit zero-padding in the
598    ///   high bits. For some moduli (notably when `b > 8`), this can cause decoding
599    ///   to produce extra trailing `0x00` bytes unless you truncate to the original
600    ///   length.
601    #[must_use]
602    pub fn encode_bytes(&self, bytes: &[u8]) -> Vec<RingElement> {
603        let bits_per_elem = self.bits_per_element();
604
605        let mut result = Vec::new();
606        let mut bit_buffer: u64 = 0;
607        let mut bits_in_buffer: usize = 0;
608
609        for &byte in bytes {
610            bit_buffer |= u64::from(byte) << bits_in_buffer;
611            bits_in_buffer += 8;
612
613            while bits_in_buffer >= bits_per_elem {
614                let mask = (1u64 << bits_per_elem) - 1;
615                let value = bit_buffer & mask;
616
617                result.push(RingElement::new(value, self.modulus));
618
619                bit_buffer >>= bits_per_elem;
620                bits_in_buffer -= bits_per_elem;
621            }
622        }
623
624        if bits_in_buffer > 0 {
625            result.push(RingElement::new(bit_buffer, self.modulus));
626        }
627
628        result
629    }
630
631    /// Decodes a sequence of [`RingElement`]s back into bytes.
632    ///
633    /// This reconstructs the same little-endian bitstream produced by
634    /// [`encode_bytes`], assuming `b = floor(log2(modulus))` bits of payload per
635    /// element and emitting 8-bit bytes from the stream.
636    ///
637    /// # Notes
638    ///
639    /// - This function does not validate that the provided `modulus` matches the
640    ///   modulus stored in each element; they must agree for meaningful results.
641    /// - This function assumes each element value fits in `b` bits (i.e., it lies
642    ///   in `[0, 2^b)`) as produced by [`encode_bytes`]. If elements were modified
643    ///   (or originate elsewhere), higher bits can leak into the reconstructed byte
644    ///   stream.
645    /// - Because the encoding does not include the original length, decoding may
646    ///   yield extra trailing `0x00` bytes for some parameter choices; truncate to
647    ///   the known original length when exact recovery is required.
648    ///
649    /// # Panics
650    ///
651    /// Panics only if an internal invariant is violated and extracting the low
652    /// 8 bits of the bit buffer fails to fit in a `u8`.
653    #[must_use]
654    pub fn decode_bytes(&self, elements: &[RingElement]) -> Vec<u8> {
655        if elements.is_empty() {
656            return Vec::new();
657        }
658
659        let bits_per_elem = self.bits_per_element();
660
661        let mut result = Vec::new();
662        let mut bit_buffer: u64 = 0;
663        let mut bits_in_buffer: usize = 0;
664
665        for elem in elements {
666            bit_buffer |= elem.value() << bits_in_buffer;
667            bits_in_buffer += bits_per_elem;
668
669            while bits_in_buffer >= 8 {
670                result.push(
671                    u8::try_from(bit_buffer & u64::from(u8::MAX))
672                        .expect("low 8 bits of the bit buffer must fit in u8"),
673                );
674                bit_buffer >>= 8;
675                bits_in_buffer -= 8;
676            }
677        }
678
679        result
680    }
681
682    /// Encodes a UTF-8 string as bytes using [`encode_bytes`].
683    #[must_use]
684    pub fn encode_text(&self, text: &str) -> Vec<RingElement> {
685        self.encode_bytes(text.as_bytes())
686    }
687
688    /// Decodes elements into bytes with [`decode_bytes`] and interprets them as UTF-8.
689    ///
690    /// # Errors
691    ///
692    /// - Returns [`EncodingError::DecodingError`] if the decoded bytes are not
693    ///   valid UTF-8.
694    ///
695    /// # Notes
696    ///
697    /// This encoding does not embed the original byte length. Depending on the
698    /// modulus and upstream padding, the decoded byte stream may contain trailing
699    /// `0x00` bytes, which are valid UTF-8 and will appear as `\0` characters in the
700    /// returned string.
701    pub fn decode_text(&self, elements: &[RingElement]) -> Result<String, EncodingError> {
702        let bytes = self.decode_bytes(elements);
703        String::from_utf8(bytes).map_err(|e| EncodingError::DecodingError(e.to_string()))
704    }
705}
706
707/// Errors that can occur while encoding or decoding.
708#[derive(Debug, Clone, thiserror::Error)]
709pub enum EncodingError {
710    /// The provided modulus is too small to encode any payload bits.
711    #[error("Modulus too small for encoding")]
712    ModulusTooSmall,
713    /// Encoding/decoding failed due to malformed input or invalid UTF-8.
714    #[error("Decoding error: {0}")]
715    DecodingError(String),
716}
717
718#[cfg(test)]
719mod tests {
720    use super::*;
721    use proptest::prelude::*;
722
723    #[test]
724    fn test_text_roundtrip_mod_256() {
725        let encoder = Utf8Encoding::try_from(256, Utf8EncodingType::BitPacking).unwrap();
726
727        let text = "Hello, World!";
728        let enc = encoder.encode_text(text);
729        let dec = encoder.decode_text(&enc).unwrap();
730        assert_eq!(dec, text);
731    }
732
733    proptest! {
734        #[test]
735        fn proptest_text_roundtrip_default(
736            text in proptest::string::string_regex(r"[\s\S]{0,200000}").unwrap(),
737            modulus in 2u64..=511,
738        ) {
739            let encoder = Utf8Encoding::try_from(modulus, Utf8EncodingType::BitPacking).unwrap();
740
741            let enc = encoder.encode_text(&text);
742            let dec = encoder.decode_text(&enc).unwrap();
743            prop_assert_eq!(dec, text);
744        }
745    }
746
747    #[test]
748    fn test_bit_stream_roundtrip_small_modulus() {
749        let encoder = Utf8Encoding::try_from(16, Utf8EncodingType::BitPacking).unwrap();
750
751        let data = vec![0xFF, 0x00, 0xAB, 0xCD];
752        let enc = encoder.encode_bytes(&data); // bits_per_elem = 4
753        let dec = encoder.decode_bytes(&enc);
754        assert_eq!(dec[..data.len()], data[..]);
755    }
756
757    #[test]
758    fn test_empty() {
759        let encoder = Utf8Encoding::try_from(16, Utf8EncodingType::BitPacking).unwrap();
760
761        let enc = encoder.encode_bytes(&[]);
762        let dec = encoder.decode_bytes(&enc);
763        assert!(dec.is_empty());
764    }
765
766    #[test]
767    fn test_modulus_too_small() {
768        let encoder = Utf8Encoding::try_from(1, Utf8EncodingType::BitPacking);
769        assert!(matches!(encoder, Err(EncodingError::ModulusTooSmall)));
770    }
771
772    #[test]
773    fn test_base_m_len_roundtrip_bytes_various_moduli() {
774        let data = vec![0x12, 0x00, 0x00, 0xFF, 0x00];
775        let moduli = [2u64, 3, 13, 128, 257, 65535];
776        for &m in &moduli {
777            let encoder =
778                Utf8Encoding::try_from(m, Utf8EncodingType::LengthDelimitedBaseMTransduction)
779                    .unwrap();
780            let enc = encoder.encode_bytes_base_m_len(&data).unwrap();
781            assert!(enc.iter().all(|e| e.value() < m));
782            let dec = encoder.decode_bytes_base_m_len(&enc).unwrap();
783            assert_eq!(dec, data);
784        }
785    }
786
787    #[test]
788    fn test_base_m_len_zero_padding_invariance() {
789        let data = vec![0x00, 0x00, 0x01, 0x02, 0x00, 0x00];
790
791        let encoder =
792            Utf8Encoding::try_from(257u64, Utf8EncodingType::LengthDelimitedBaseMTransduction)
793                .unwrap();
794
795        let enc = encoder.encode_bytes_base_m_len(&data).unwrap();
796        let mut padded = enc;
797        for _ in 0..17 {
798            padded.push(RingElement::zero(257u64));
799        }
800        let dec = encoder.decode_bytes_base_m_len(&padded).unwrap();
801        assert_eq!(dec, data);
802    }
803
804    #[test]
805    fn test_base_m_len_trailing_garbage_ignored() {
806        let data = vec![0x10, 0x20, 0x00, 0xFF, 0x01];
807        let modulus = 257u64;
808
809        let encoder =
810            Utf8Encoding::try_from(modulus, Utf8EncodingType::LengthDelimitedBaseMTransduction)
811                .unwrap();
812
813        let mut enc = encoder.encode_bytes_base_m_len(&data).unwrap();
814        enc.push(RingElement::new(1, modulus));
815        enc.push(RingElement::new(200, modulus));
816        enc.push(RingElement::new(256, modulus));
817        let dec = encoder.decode_bytes_base_m_len(&enc).unwrap();
818        assert_eq!(dec, data);
819    }
820
821    #[test]
822    fn test_base_m_len_decode_empty_errors() {
823        let encoder =
824            Utf8Encoding::try_from(10, Utf8EncodingType::LengthDelimitedBaseMTransduction).unwrap();
825        let res = encoder.decode_bytes_base_m_len(&[]);
826        assert!(matches!(res, Err(EncodingError::DecodingError(_))));
827    }
828
829    #[test]
830    fn test_base_m_len_text_roundtrip() {
831        let text = "Hello\u{0000}\u{0000}";
832        let encoder =
833            Utf8Encoding::try_from(257, Utf8EncodingType::LengthDelimitedBaseMTransduction)
834                .unwrap();
835        let enc = encoder.encode_text_base_m_len(text).unwrap();
836        let dec = encoder.decode_text_base_m_len(&enc).unwrap();
837        assert_eq!(dec, text);
838    }
839
840    proptest! {
841        #[test]
842        fn proptest_text_roundtrip_with_base_m_len(
843            text in proptest::string::string_regex(r"[\s\S]{0,200000}").unwrap(),
844            modulus in 2u64..=400,
845        ) {
846            let encoder = Utf8Encoding::try_from(modulus, Utf8EncodingType::LengthDelimitedBaseMTransduction).unwrap();
847            let enc = encoder.encode_text_base_m_len(&text).unwrap();
848            let dec = encoder.decode_text_base_m_len(&enc).unwrap();
849            prop_assert_eq!(dec, text);
850        }
851    }
852}
sym_adv_encoding/utf8.rs

sym_adv_encoding/
utf8.rs