Skip to main content

md_codec/
chunk.rs

//! Chunk header per SPEC v0.30 §2.2.
//!
//! Encodes the 37-bit chunked wire-format header. First-symbol layout,
//! MSB-first: `[v3][v2][v1][v0][chunked]` (4-bit version + 1-bit chunked-flag).
//! Remainder: 20-bit chunk-set-id + 6-bit count-minus-1 + 6-bit index.
//! Total = 4 + 1 + 20 + 6 + 6 = 37 bits.
7
8use crate::bitstream::{BitReader, BitWriter};
9use crate::error::Error;
10use crate::header::Header;
11
/// Decoded form of the 37-bit header that prefixes every chunk of a chunked
/// v0.30 payload.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct ChunkHeader {
    /// Wire-format version, serialised as a 4-bit field (v0.30 uses 4).
    pub version: u8,
    /// Identifier common to every chunk of one set; serialised as a 20-bit
    /// field.
    pub chunk_set_id: u32,
    /// Number of chunks in the set, `1..=64` (serialised as `count - 1` in a
    /// 6-bit field).
    pub count: u8,
    /// Zero-based position of this chunk within the set; must be `< count`.
    pub index: u8,
}
24
25impl ChunkHeader {
26    /// Encode the chunk header into `w` as 37 bits.
27    ///
28    /// Returns an error if `count`, `index`, or `chunk_set_id` are out of range.
29    pub fn write(&self, w: &mut BitWriter) -> Result<(), Error> {
30        if !(1..=64).contains(&(self.count as u32)) {
31            return Err(Error::ChunkCountOutOfRange { count: self.count });
32        }
33        if self.index >= self.count {
34            return Err(Error::ChunkIndexOutOfRange {
35                index: self.index,
36                count: self.count,
37            });
38        }
39        if self.chunk_set_id >= (1 << 20) {
40            return Err(Error::ChunkSetIdOutOfRange {
41                id: self.chunk_set_id,
42            });
43        }
44        w.write_bits(u64::from(self.version & 0b1111), 4);
45        w.write_bits(1, 1); // chunked = 1
46        w.write_bits(u64::from(self.chunk_set_id), 20);
47        w.write_bits((self.count - 1) as u64, 6); // count-1 offset
48        w.write_bits(u64::from(self.index), 6);
49        Ok(())
50    }
51
52    /// Decode a chunk header (37 bits) from `r`.
53    ///
54    /// Returns [`Error::WireVersionMismatch`] if the 4-bit version field
55    /// is not `WF_REDESIGN_VERSION` per SPEC §2.5 (e.g., v0.x chunked
56    /// payloads where version=0 in the first 3 wire bits become version=0
57    /// or version=1 under the v0.30 4-bit read depending on prior bits).
58    /// Returns [`Error::ChunkHeaderChunkedFlagMissing`] if the chunked-flag
59    /// bit is not set after the version check passes.
60    pub fn read(r: &mut BitReader) -> Result<Self, Error> {
61        let version = r.read_bits(4)? as u8;
62        if version != Header::WF_REDESIGN_VERSION {
63            return Err(Error::WireVersionMismatch { got: version });
64        }
65        let chunked = r.read_bits(1)? != 0;
66        if !chunked {
67            return Err(Error::ChunkHeaderChunkedFlagMissing);
68        }
69        let chunk_set_id = r.read_bits(20)? as u32;
70        let count = (r.read_bits(6)? + 1) as u8;
71        let index = r.read_bits(6)? as u8;
72        Ok(Self {
73            version,
74            chunk_set_id,
75            count,
76            index,
77        })
78    }
79}
80
#[cfg(test)]
mod tests {
    use super::*;
    use crate::header::Header;

    /// Header stamped with the current wire-format version.
    fn current_version_header(chunk_set_id: u32, count: u8, index: u8) -> ChunkHeader {
        ChunkHeader {
            version: Header::WF_REDESIGN_VERSION,
            chunk_set_id,
            count,
            index,
        }
    }

    /// Finish `w` and parse a chunk header from the start of its bytes.
    fn decode_from(w: BitWriter) -> Result<ChunkHeader, Error> {
        let wire = w.into_bytes();
        let mut reader = BitReader::new(&wire);
        ChunkHeader::read(&mut reader)
    }

    #[test]
    fn chunk_header_round_trip() {
        let original = current_version_header(0xABCDE, 3, 1);
        let mut w = BitWriter::new();
        original.write(&mut w).unwrap();
        // 4 + 1 + 20 + 6 + 6 = 37 bits
        assert_eq!(w.bit_len(), 37);
        assert_eq!(decode_from(w).unwrap(), original);
    }

    #[test]
    fn chunk_header_count_64_round_trip() {
        // Largest legal set: count = 64 with the last index, 63.
        let original = current_version_header(0, 64, 63);
        let mut w = BitWriter::new();
        original.write(&mut w).unwrap();
        assert_eq!(decode_from(w).unwrap(), original);
    }

    #[test]
    fn chunk_header_count_zero_rejected() {
        let mut w = BitWriter::new();
        let outcome = current_version_header(0, 0, 0).write(&mut w);
        assert!(matches!(
            outcome,
            Err(Error::ChunkCountOutOfRange { count: 0 })
        ));
    }

    /// SPEC v0.30 §2.5: the chunk-header path must reject v0.x payloads. A
    /// wire whose 4-bit version field is 0 with the chunked flag set must fail
    /// with `WireVersionMismatch { got: 0 }`.
    #[test]
    fn chunk_header_rejects_v0x_version() {
        // Hand-build a full 37-bit header, as (value, bit width) pairs in
        // wire order, so that only the version field is at fault.
        let mut w = BitWriter::new();
        for (value, width) in [
            (0, 4),  // version = 0 (v0.x)
            (1, 1),  // chunked = 1
            (0, 20), // chunk_set_id
            (0, 6),  // count-1
            (0, 6),  // index
        ] {
            w.write_bits(value, width);
        }
        assert_eq!(w.bit_len(), 37);
        assert!(matches!(
            decode_from(w),
            Err(Error::WireVersionMismatch { got: 0 })
        ));
    }
}
160
161use crate::identity::Md1EncodingId;
162
163/// Derive the 20-bit chunk-set-id from a [`Md1EncodingId`] by taking the
164/// top 20 bits of the underlying 16-byte hash, MSB-first.
165///
166/// The chunk-set-id groups chunks belonging to the same encoded payload.
167/// Returned value is in the range `0..=0xFFFFF`.
168pub fn derive_chunk_set_id(id: &Md1EncodingId) -> u32 {
169    // First 20 bits of Md1EncodingId[0..16], MSB-first.
170    let bytes = id.as_bytes();
171    ((bytes[0] as u32) << 12) | ((bytes[1] as u32) << 4) | ((bytes[2] as u32) >> 4)
172}
173
#[cfg(test)]
mod chunk_set_id_tests {
    use super::*;

    /// Build an id whose leading bytes are `prefix` and whose remaining bytes
    /// are zero.
    fn id_starting_with(prefix: &[u8]) -> Md1EncodingId {
        let mut raw = [0u8; 16];
        raw[..prefix.len()].copy_from_slice(prefix);
        Md1EncodingId::new(raw)
    }

    #[test]
    fn derive_chunk_set_id_deterministic() {
        let id = id_starting_with(&[0xab, 0xcd, 0xe1, 0x23]);
        let first = derive_chunk_set_id(&id);
        let second = derive_chunk_set_id(&id);
        assert_eq!(first, second);
    }

    #[test]
    fn derive_chunk_set_id_msb_first_extraction() {
        // Leading bytes AB CD EF: the top 20 bits are 0xABCDE (the low nibble
        // of the third byte is dropped).
        let id = id_starting_with(&[0xAB, 0xCD, 0xEF]);
        assert_eq!(derive_chunk_set_id(&id), 0xABCDE);
    }
}
202
203use crate::encode::Descriptor;
204
/// Largest number of payload bits a single md1 string can carry; anything
/// longer has to be chunked.
///
/// The value is 64 data symbols × 5 bits per symbol = 320 bits. The 64-symbol
/// budget is derived from codex32 *regular* form's 80-character limit (per
/// BIP 93): 3 HRP + 1 separator + 64 data + 13 checksum (see
/// `codex32::REGULAR_CHECKSUM_SYMBOLS`). Long-form codex32 was dropped in
/// v0.12.0, so no larger per-string budget exists.
///
/// NOTE(review): the breakdown above sums to 81, not 80 — presumably the
/// "3 HRP" already includes the separator (`md1`); confirm against the
/// codex32 module.
///
/// Encoders attempt a single-string emit first and fall back to splitting
/// into N chunks when the codex32 wrapping reports "too long".
pub const SINGLE_STRING_PAYLOAD_BIT_LIMIT: usize = {
    const DATA_SYMBOLS_PER_STRING: usize = 64;
    const BITS_PER_SYMBOL: usize = 5;
    DATA_SYMBOLS_PER_STRING * BITS_PER_SYMBOL
};
213
214/// Split a [`Descriptor`] into N codex32 md1 strings, each carrying a chunk
215/// header and a slice of the canonical payload.
216///
217/// Algorithm:
218/// 1. Encode the full payload (`encode_payload`).
219/// 2. Compute [`crate::identity::Md1EncodingId`]; derive `ChunkSetId`.
220/// 3. Choose chunk count N such that each chunk fits in codex32 long form
221///    after adding the 37-bit chunk header.
222/// 4. Split the payload into N approximately-equal byte-boundary slices.
223/// 5. For each chunk i: prepend chunk header (37 bits), wrap via codex32 with
224///    the chunked-flag bit set, emit md1 string.
225///
226/// Note: `bytes_per_chunk` could be 0 if `payload_bytes` were empty, but the
227/// encoder validates `n ≥ 1` so the payload is always non-empty.
228pub fn split(d: &Descriptor) -> Result<Vec<String>, Error> {
229    use crate::bitstream::BitWriter;
230    use crate::encode::encode_payload;
231    use crate::identity::compute_md1_encoding_id;
232
233    let (payload_bytes, _payload_bits) = encode_payload(d)?;
234
235    // Compute ChunkSetId from full-encoding hash.
236    let md1_id = compute_md1_encoding_id(d)?;
237    let chunk_set_id = derive_chunk_set_id(&md1_id);
238
239    // Choose chunk count from payload byte count (≤7 bits of trailing
240    // codex32-padding are tolerated by the reassembled-stream TLV-rollback).
241    let payload_bit_count_for_sizing = payload_bytes.len() * 8;
242    let chunks_needed = payload_bit_count_for_sizing.div_ceil(SINGLE_STRING_PAYLOAD_BIT_LIMIT);
243    if chunks_needed > 64 {
244        return Err(Error::ChunkCountExceedsMax {
245            needed: chunks_needed,
246        });
247    }
248    let count: u8 = if chunks_needed == 0 {
249        1
250    } else {
251        chunks_needed as u8
252    };
253
254    // Split payload into `count` byte-boundary slices.
255    let bytes_per_chunk = payload_bytes.len().div_ceil(count as usize);
256
257    let mut chunks = Vec::with_capacity(count as usize);
258    for index in 0..count {
259        let start_byte = (index as usize) * bytes_per_chunk;
260        let end_byte = ((index as usize + 1) * bytes_per_chunk).min(payload_bytes.len());
261        let chunk_payload_bytes = &payload_bytes[start_byte..end_byte];
262
263        // Build per-chunk wire: 37-bit chunk header + chunk-payload bytes
264        // (full 8 bits per byte, no further fractional content). Chunk's
265        // exact bit count = 37 + 8 × |chunk_payload_bytes|.
266        let header = ChunkHeader {
267            version: Header::WF_REDESIGN_VERSION,
268            chunk_set_id,
269            count,
270            index,
271        };
272        let mut w = BitWriter::new();
273        header.write(&mut w)?;
274        for byte in chunk_payload_bytes {
275            w.write_bits(u64::from(*byte), 8);
276        }
277        let chunk_bit_count = 37 + 8 * chunk_payload_bytes.len();
278        let bytes = w.into_bytes();
279        let s = crate::codex32::wrap_payload(&bytes, chunk_bit_count)?;
280        chunks.push(s);
281    }
282    Ok(chunks)
283}
284
285use crate::decode::decode_payload;
286
287/// Reassemble a [`Descriptor`] from N md1 codex32 strings.
288///
289/// Algorithm:
290/// 1. Unwrap each string via the codex32 layer (verifies BCH per chunk).
291/// 2. Parse the 37-bit chunk header from each.
292/// 3. Validate consistency: same version, chunk_set_id, count.
293/// 4. Sort by index; verify `0..count-1` with no gaps.
294/// 5. Concatenate per-chunk payload bytes.
295/// 6. Decode the reassembled payload via [`decode_payload`].
296/// 7. Verify the reassembled payload's derived chunk-set-id matches the
297///    chunk-set-id present in every chunk header (cross-chunk integrity).
298pub fn reassemble(strings: &[&str]) -> Result<Descriptor, Error> {
299    use crate::bitstream::BitReader;
300    use crate::codex32::unwrap_string;
301    use crate::identity::compute_md1_encoding_id;
302
303    if strings.is_empty() {
304        return Err(Error::ChunkSetEmpty);
305    }
306
307    // Unwrap each, parse 37-bit chunk header, then read whole payload bytes.
308    // Use the symbol-aligned bit count returned by `unwrap_string` (NOT
309    // `bytes.len() * 8`, which would over-estimate by up to 7 bits and break
310    // round-trip for chunks where symbol-padding plus byte-padding crosses a
311    // byte boundary — e.g. N=3, N=8, etc.).
312    let mut parsed: Vec<(ChunkHeader, Vec<u8>)> = Vec::with_capacity(strings.len());
313    for s in strings {
314        let (bytes, symbol_aligned_bit_count) = unwrap_string(s)?;
315        let mut r = BitReader::with_bit_limit(&bytes, symbol_aligned_bit_count);
316        let header = ChunkHeader::read(&mut r)?;
317        // Per encoder contract: chunk wire is exactly 37 + 8N bits. The
318        // symbol-aligned bit count is `ceil((37+8N)/5) * 5`, which is in
319        // [37+8N, 37+8N+4]. So `(symbol_aligned_bit_count - 37) / 8`
320        // (floor) recovers exactly N.
321        let payload_byte_count = (symbol_aligned_bit_count - 37) / 8;
322        let mut chunk_payload_bytes = Vec::with_capacity(payload_byte_count);
323        for _ in 0..payload_byte_count {
324            let v = r.read_bits(8)? as u8;
325            chunk_payload_bytes.push(v);
326        }
327        // Trailing ≤4 symbol-padding bits remain in r; discard.
328        parsed.push((header, chunk_payload_bytes));
329    }
330
331    // Validate consistency.
332    let (h0, _) = &parsed[0];
333    let expected_count = h0.count;
334    let expected_csid = h0.chunk_set_id;
335    let expected_version = h0.version;
336    for (h, _) in &parsed {
337        if h.count != expected_count
338            || h.chunk_set_id != expected_csid
339            || h.version != expected_version
340        {
341            return Err(Error::ChunkSetInconsistent);
342        }
343    }
344    if parsed.len() != expected_count as usize {
345        return Err(Error::ChunkSetIncomplete {
346            got: parsed.len(),
347            expected: expected_count as usize,
348        });
349    }
350
351    // Sort by index; verify 0..count-1 with no gaps.
352    parsed.sort_by_key(|(h, _)| h.index);
353    for (i, (h, _)) in parsed.iter().enumerate() {
354        if h.index as usize != i {
355            return Err(Error::ChunkIndexGap {
356                expected: i as u8,
357                got: h.index,
358            });
359        }
360    }
361
362    // Concatenate chunk payload bytes.
363    let mut full_bytes = Vec::new();
364    for (_, chunk_bytes) in &parsed {
365        full_bytes.extend_from_slice(chunk_bytes);
366    }
367
368    // Decode payload. bit_len = bytes.len() * 8; TLV-rollback handles trailing padding.
369    let descriptor = decode_payload(&full_bytes, full_bytes.len() * 8)?;
370
371    // Cross-chunk integrity check.
372    let md1_id = compute_md1_encoding_id(&descriptor)?;
373    let derived_csid = derive_chunk_set_id(&md1_id);
374    if derived_csid != expected_csid {
375        return Err(Error::ChunkSetIdMismatch {
376            expected: expected_csid,
377            derived: derived_csid,
378        });
379    }
380
381    Ok(descriptor)
382}