Skip to main content

iscc_lib/
lib.rs

1//! High-performance Rust implementation of ISO 24138:2024 (ISCC).
2//!
3//! This crate provides the core ISCC algorithm implementations. All 9 `gen_*_v0`
4//! functions are the public Tier 1 API surface, designed to be compatible with
5//! the `iscc-core` Python reference implementation.
6
7pub mod cdc;
8pub mod codec;
9pub mod conformance;
10pub(crate) mod dct;
11pub mod minhash;
12pub mod simhash;
13pub mod streaming;
14pub mod types;
15pub mod utils;
16pub(crate) mod wtahash;
17
18pub use cdc::alg_cdc_chunks;
19pub use codec::encode_base64;
20pub use codec::iscc_decompose;
21pub use conformance::conformance_selftest;
22pub use minhash::alg_minhash_256;
23pub use simhash::{alg_simhash, sliding_window};
24pub use streaming::{DataHasher, InstanceHasher};
25pub use types::*;
26pub use utils::{text_clean, text_collapse, text_remove_newlines, text_trim};
27
/// Upper bound, in UTF-8 bytes, applied when trimming the `name` metadata field.
pub const META_TRIM_NAME: usize = 128;

/// Upper bound, in UTF-8 bytes, applied when trimming the `description` metadata field.
pub const META_TRIM_DESCRIPTION: usize = 4096;

/// Size in bytes of the read buffer used for streaming file input (4 MiB).
pub const IO_READ_SIZE: usize = 4 * 1024 * 1024;

/// Sliding-window width, in characters, of the n-grams hashed for text content features.
pub const TEXT_NGRAM_SIZE: usize = 13;
39
/// Error type for ISCC operations.
///
/// Every fallible function in this file reports failures through this enum,
/// wrapped in [`IsccResult`].
#[derive(Debug, thiserror::Error)]
pub enum IsccError {
    /// Input data is invalid.
    ///
    /// The payload is a human-readable description of what was wrong; it is
    /// rendered after the `invalid input: ` prefix when the error is displayed.
    #[error("invalid input: {0}")]
    InvalidInput(String),
}

/// Result type alias for ISCC operations.
///
/// Shorthand for `Result<T, IsccError>`.
pub type IsccResult<T> = Result<T, IsccError>;
50
/// Weave the leading halves of two SimHash digests into one 32-byte digest.
///
/// Only the first 16 bytes of `a` and of `b` are consumed. They are cut into
/// 4-byte groups and emitted alternately — a group from `a`, then the matching
/// group from `b` — for 8 groups (32 bytes) in total.
///
/// # Panics
///
/// Panics if either input is shorter than 16 bytes.
fn interleave_digests(a: &[u8], b: &[u8]) -> Vec<u8> {
    let groups_a = a[..16].chunks_exact(4);
    let groups_b = b[..16].chunks_exact(4);

    let mut woven = Vec::with_capacity(32);
    for (group_a, group_b) in groups_a.zip(groups_b) {
        woven.extend_from_slice(group_a);
        woven.extend_from_slice(group_b);
    }
    woven
}
67
68/// Compute a SimHash digest from the name text for meta hashing.
69///
70/// Applies `text_collapse`, generates width-3 sliding window n-grams,
71/// hashes each with BLAKE3, and produces a SimHash.
72fn meta_name_simhash(name: &str) -> Vec<u8> {
73    let collapsed_name = utils::text_collapse(name);
74    let name_ngrams = simhash::sliding_window_strs(&collapsed_name, 3);
75    let name_hashes: Vec<[u8; 32]> = name_ngrams
76        .iter()
77        .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
78        .collect();
79    simhash::alg_simhash_inner(&name_hashes)
80}
81
82/// Compute a similarity-preserving 256-bit hash from metadata text.
83///
84/// Produces a SimHash digest from `name` n-grams. When `extra` is provided,
85/// interleaves the name and extra SimHash digests in 4-byte chunks.
86fn soft_hash_meta_v0(name: &str, extra: Option<&str>) -> Vec<u8> {
87    let name_simhash = meta_name_simhash(name);
88
89    match extra {
90        None | Some("") => name_simhash,
91        Some(extra_str) => {
92            let collapsed_extra = utils::text_collapse(extra_str);
93            let extra_ngrams = simhash::sliding_window_strs(&collapsed_extra, 3);
94            let extra_hashes: Vec<[u8; 32]> = extra_ngrams
95                .iter()
96                .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
97                .collect();
98            let extra_simhash = simhash::alg_simhash_inner(&extra_hashes);
99
100            interleave_digests(&name_simhash, &extra_simhash)
101        }
102    }
103}
104
105/// Compute a similarity-preserving 256-bit hash from name text and raw bytes.
106///
107/// Like `soft_hash_meta_v0` but the extra data is raw bytes instead of text.
108/// Uses width-4 byte n-grams (no `text_collapse`) for the bytes path,
109/// and interleaves name/bytes SimHash digests in 4-byte chunks.
110fn soft_hash_meta_v0_with_bytes(name: &str, extra: &[u8]) -> Vec<u8> {
111    let name_simhash = meta_name_simhash(name);
112
113    if extra.is_empty() {
114        return name_simhash;
115    }
116
117    let byte_ngrams = simhash::sliding_window_bytes(extra, 4);
118    let byte_hashes: Vec<[u8; 32]> = byte_ngrams
119        .iter()
120        .map(|ng| *blake3::hash(ng).as_bytes())
121        .collect();
122    let byte_simhash = simhash::alg_simhash_inner(&byte_hashes);
123
124    interleave_digests(&name_simhash, &byte_simhash)
125}
126
127/// Decode a Data-URL's base64 payload.
128///
129/// Expects a string starting with `"data:"`. Splits on the first `,` and
130/// decodes the remainder as standard base64. Returns `InvalidInput` on
131/// missing comma or invalid base64.
132fn decode_data_url(data_url: &str) -> IsccResult<Vec<u8>> {
133    let payload_b64 = data_url
134        .split_once(',')
135        .map(|(_, b64)| b64)
136        .ok_or_else(|| IsccError::InvalidInput("Data-URL missing comma separator".into()))?;
137    data_encoding::BASE64
138        .decode(payload_b64.as_bytes())
139        .map_err(|e| IsccError::InvalidInput(format!("invalid base64 in Data-URL: {e}")))
140}
141
142/// Parse a meta string as JSON and re-serialize to RFC 8785 (JCS) canonical bytes.
143fn parse_meta_json(meta_str: &str) -> IsccResult<Vec<u8>> {
144    let parsed: serde_json::Value = serde_json::from_str(meta_str)
145        .map_err(|e| IsccError::InvalidInput(format!("invalid JSON in meta: {e}")))?;
146    let mut buf = Vec::new();
147    serde_json_canonicalizer::to_writer(&parsed, &mut buf)
148        .map_err(|e| IsccError::InvalidInput(format!("JSON canonicalization failed: {e}")))?;
149    Ok(buf)
150}
151
152/// Build a Data-URL from canonical JSON bytes.
153///
154/// Uses `application/ld+json` media type if the JSON has an `@context` key,
155/// otherwise `application/json`. Encodes payload as standard base64 with padding.
156fn build_meta_data_url(json_bytes: &[u8], json_value: &serde_json::Value) -> String {
157    let media_type = if json_value.get("@context").is_some() {
158        "application/ld+json"
159    } else {
160        "application/json"
161    };
162    let b64 = data_encoding::BASE64.encode(json_bytes);
163    format!("data:{media_type};base64,{b64}")
164}
165
166/// Encode a raw digest into an ISCC unit string.
167///
168/// Takes integer type identifiers (matching `MainType`, `SubType`, `Version` enum values)
169/// and a raw digest, returns a base32-encoded ISCC unit string.
170///
171/// # Errors
172///
173/// Returns `IsccError::InvalidInput` if enum values are out of range, if `mtype` is
174/// `MainType::Iscc` (5), or if `digest.len() < bit_length / 8`.
175pub fn encode_component(
176    mtype: u8,
177    stype: u8,
178    version: u8,
179    bit_length: u32,
180    digest: &[u8],
181) -> IsccResult<String> {
182    let mt = codec::MainType::try_from(mtype)?;
183    let st = codec::SubType::try_from(stype)?;
184    let vs = codec::Version::try_from(version)?;
185    let needed = (bit_length / 8) as usize;
186    if digest.len() < needed {
187        return Err(IsccError::InvalidInput(format!(
188            "digest length {} < bit_length/8 ({})",
189            digest.len(),
190            needed
191        )));
192    }
193    codec::encode_component(mt, st, vs, bit_length, digest)
194}
195
196/// Decode an ISCC unit string into its header components and raw digest.
197///
198/// Inverse of [`encode_component`]. Strips an optional `"ISCC:"` prefix and
199/// dashes, base32-decodes the string, parses the variable-length header, and
200/// returns the digest truncated to exactly the encoded bit-length.
201///
202/// Returns `(maintype, subtype, version, length_index, digest)` where the
203/// integer fields match [`codec::MainType`], [`codec::SubType`], and
204/// [`codec::Version`] enum values.
205///
206/// # Errors
207///
208/// Returns `IsccError::InvalidInput` on invalid base32 input, malformed
209/// header, or if the decoded body is shorter than the expected digest length.
210pub fn iscc_decode(iscc: &str) -> IsccResult<(u8, u8, u8, u8, Vec<u8>)> {
211    // Strip optional "ISCC:" prefix (case-sensitive, matching iscc_decompose)
212    let clean = iscc.strip_prefix("ISCC:").unwrap_or(iscc);
213    // Remove dashes (matching iscc_clean behavior for base32 input)
214    let clean = clean.replace('-', "");
215    let raw = codec::decode_base32(&clean)?;
216    let (mt, st, vs, length_index, tail) = codec::decode_header(&raw)?;
217    let bit_length = codec::decode_length(mt, length_index, st);
218    let nbytes = (bit_length / 8) as usize;
219    if tail.len() < nbytes {
220        return Err(IsccError::InvalidInput(format!(
221            "decoded body too short: expected {nbytes} digest bytes, got {}",
222            tail.len()
223        )));
224    }
225    Ok((
226        mt as u8,
227        st as u8,
228        vs as u8,
229        length_index as u8,
230        tail[..nbytes].to_vec(),
231    ))
232}
233
234/// Convert a JSON string into a `data:` URL with JCS canonicalization.
235///
236/// Parses the JSON, re-serializes to [RFC 8785 (JCS)](https://www.rfc-editor.org/rfc/rfc8785)
237/// canonical form, base64-encodes the result, and wraps it in a `data:` URL.
238/// Uses `application/ld+json` media type when the JSON contains an `@context`
239/// key, otherwise `application/json`.
240///
241/// This enables all language bindings to support dict/object meta parameters
242/// by serializing to JSON once (language-specific) then delegating encoding
243/// to Rust.
244///
245/// # Errors
246///
247/// Returns [`IsccError::InvalidInput`] if `json` is not valid JSON or if
248/// JCS canonicalization fails.
249///
250/// # Examples
251///
252/// ```
253/// # use iscc_lib::json_to_data_url;
254/// let url = json_to_data_url(r#"{"key": "value"}"#).unwrap();
255/// assert!(url.starts_with("data:application/json;base64,"));
256///
257/// let ld_url = json_to_data_url(r#"{"@context": "https://schema.org"}"#).unwrap();
258/// assert!(ld_url.starts_with("data:application/ld+json;base64,"));
259/// ```
260pub fn json_to_data_url(json: &str) -> IsccResult<String> {
261    let parsed: serde_json::Value = serde_json::from_str(json)
262        .map_err(|e| IsccError::InvalidInput(format!("invalid JSON: {e}")))?;
263    let mut canonical_bytes = Vec::new();
264    serde_json_canonicalizer::to_writer(&parsed, &mut canonical_bytes)
265        .map_err(|e| IsccError::InvalidInput(format!("JSON canonicalization failed: {e}")))?;
266    Ok(build_meta_data_url(&canonical_bytes, &parsed))
267}
268
269/// Generate a Meta-Code from name and optional metadata.
270///
271/// Produces an ISCC Meta-Code by hashing the provided name, description,
272/// and metadata fields using the SimHash algorithm. When `meta` is provided,
273/// it is treated as either a Data-URL (if starting with `"data:"`) or a JSON
274/// string, and the decoded/serialized bytes are used for similarity hashing
275/// and metahash computation.
276pub fn gen_meta_code_v0(
277    name: &str,
278    description: Option<&str>,
279    meta: Option<&str>,
280    bits: u32,
281) -> IsccResult<MetaCodeResult> {
282    // Normalize name: clean → remove newlines → trim to 128 bytes
283    let name = utils::text_clean(name);
284    let name = utils::text_remove_newlines(&name);
285    let name = utils::text_trim(&name, META_TRIM_NAME);
286
287    if name.is_empty() {
288        return Err(IsccError::InvalidInput(
289            "name is empty after normalization".into(),
290        ));
291    }
292
293    // Normalize description: clean → trim to 4096 bytes
294    let desc_str = description.unwrap_or("");
295    let desc_clean = utils::text_clean(desc_str);
296    let desc_clean = utils::text_trim(&desc_clean, META_TRIM_DESCRIPTION);
297
298    // Resolve meta payload bytes (if meta is provided)
299    let meta_payload: Option<Vec<u8>> = match meta {
300        Some(meta_str) if meta_str.starts_with("data:") => Some(decode_data_url(meta_str)?),
301        Some(meta_str) => Some(parse_meta_json(meta_str)?),
302        None => None,
303    };
304
305    // Branch: meta bytes path vs. description text path
306    if let Some(ref payload) = meta_payload {
307        let meta_code_digest = soft_hash_meta_v0_with_bytes(&name, payload);
308        let metahash = utils::multi_hash_blake3(payload);
309
310        let meta_code = codec::encode_component(
311            codec::MainType::Meta,
312            codec::SubType::None,
313            codec::Version::V0,
314            bits,
315            &meta_code_digest,
316        )?;
317
318        // Build the meta Data-URL for the result
319        let meta_value = match meta {
320            Some(meta_str) if meta_str.starts_with("data:") => meta_str.to_string(),
321            Some(meta_str) => {
322                let parsed: serde_json::Value = serde_json::from_str(meta_str)
323                    .map_err(|e| IsccError::InvalidInput(format!("invalid JSON: {e}")))?;
324                build_meta_data_url(payload, &parsed)
325            }
326            None => unreachable!(),
327        };
328
329        Ok(MetaCodeResult {
330            iscc: format!("ISCC:{meta_code}"),
331            name: name.clone(),
332            description: if desc_clean.is_empty() {
333                None
334            } else {
335                Some(desc_clean)
336            },
337            meta: Some(meta_value),
338            metahash,
339        })
340    } else {
341        // Compute metahash from normalized text payload
342        let payload = if desc_clean.is_empty() {
343            name.clone()
344        } else {
345            format!("{name} {desc_clean}")
346        };
347        let payload = payload.trim().to_string();
348        let metahash = utils::multi_hash_blake3(payload.as_bytes());
349
350        // Compute similarity digest
351        let extra = if desc_clean.is_empty() {
352            None
353        } else {
354            Some(desc_clean.as_str())
355        };
356        let meta_code_digest = soft_hash_meta_v0(&name, extra);
357
358        let meta_code = codec::encode_component(
359            codec::MainType::Meta,
360            codec::SubType::None,
361            codec::Version::V0,
362            bits,
363            &meta_code_digest,
364        )?;
365
366        Ok(MetaCodeResult {
367            iscc: format!("ISCC:{meta_code}"),
368            name: name.clone(),
369            description: if desc_clean.is_empty() {
370                None
371            } else {
372                Some(desc_clean)
373            },
374            meta: None,
375            metahash,
376        })
377    }
378}
379
380/// Compute a 256-bit similarity-preserving hash from collapsed text.
381///
382/// Generates character n-grams with a sliding window of width 13,
383/// hashes each with xxh32, then applies MinHash to produce a 32-byte digest.
384fn soft_hash_text_v0(text: &str) -> Vec<u8> {
385    let ngrams = simhash::sliding_window_strs(text, TEXT_NGRAM_SIZE);
386    let features: Vec<u32> = ngrams
387        .iter()
388        .map(|ng| xxhash_rust::xxh32::xxh32(ng.as_bytes(), 0))
389        .collect();
390    minhash::alg_minhash_256(&features)
391}
392
393/// Generate a Text-Code from plain text content.
394///
395/// Produces an ISCC Content-Code for text by collapsing the input,
396/// extracting character n-gram features, and applying MinHash to
397/// create a similarity-preserving fingerprint.
398pub fn gen_text_code_v0(text: &str, bits: u32) -> IsccResult<TextCodeResult> {
399    let collapsed = utils::text_collapse(text);
400    let characters = collapsed.chars().count();
401    let hash_digest = soft_hash_text_v0(&collapsed);
402    let component = codec::encode_component(
403        codec::MainType::Content,
404        codec::SubType::TEXT,
405        codec::Version::V0,
406        bits,
407        &hash_digest,
408    )?;
409    Ok(TextCodeResult {
410        iscc: format!("ISCC:{component}"),
411        characters,
412    })
413}
414
/// Transpose a row-major matrix stored as a slice of rows.
///
/// The output has as many rows as the first input row has columns, and as
/// many columns as the input has rows. An empty input gives an empty output.
/// Rows shorter than the first leave zeros in the cells they do not cover.
fn transpose_matrix(matrix: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let width = match matrix.first() {
        Some(first_row) => first_row.len(),
        None => return Vec::new(),
    };

    let mut transposed = vec![vec![0.0f64; matrix.len()]; width];
    matrix.iter().enumerate().for_each(|(r, row)| {
        row.iter()
            .enumerate()
            .for_each(|(c, &value)| transposed[c][r] = value);
    });
    transposed
}
430
/// Copy an 8×8 window out of a matrix as a flat, row-major vector.
///
/// The window's top-left cell is `matrix[row][col]`. Up to 8 rows and up to
/// 8 columns are taken; when the window reaches past the matrix edge, fewer
/// than 64 values are returned.
fn flatten_8x8(matrix: &[Vec<f64>], col: usize, row: usize) -> Vec<f64> {
    matrix
        .iter()
        .skip(row)
        .take(8)
        .flat_map(|line| line.iter().skip(col).take(8).copied())
        .collect()
}
444
/// Compute the median of a slice of f64 values.
///
/// For even-length slices, returns the average of the two middle values
/// (matching Python `statistics.median` behavior).
///
/// An empty slice yields `f64::NAN` instead of panicking. Values are ordered
/// with `f64::total_cmp`, so NaN inputs cannot cause a panic either.
fn compute_median(values: &[f64]) -> f64 {
    if values.is_empty() {
        // No middle element exists; NaN compares false against everything.
        return f64::NAN;
    }

    let mut sorted: Vec<f64> = values.to_vec();
    // A total order avoids the `partial_cmp(..).unwrap()` panic on NaN.
    sorted.sort_unstable_by(f64::total_cmp);

    let mid = sorted.len() / 2;
    if sorted.len() % 2 == 1 {
        sorted[mid]
    } else {
        (sorted[mid - 1] + sorted[mid]) / 2.0
    }
}
459
/// Pack booleans into bytes, most significant bit first.
///
/// Each group of 8 booleans becomes one byte; the first boolean of a group
/// maps to bit 7. A trailing group with fewer than 8 entries is padded with
/// zero bits on the low end.
fn bits_to_bytes(bits: &[bool]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity((bits.len() + 7) / 8);
    for group in bits.chunks(8) {
        let packed = group
            .iter()
            .enumerate()
            .fold(0u8, |acc, (pos, &set)| {
                if set {
                    acc | (0x80u8 >> pos)
                } else {
                    acc
                }
            });
        bytes.push(packed);
    }
    bytes
}
474
475/// Compute a DCT-based perceptual hash from 32×32 grayscale pixels.
476///
477/// Applies a 2D DCT to the pixel matrix, extracts four 8×8 low-frequency
478/// blocks, and generates a bitstring by comparing each coefficient against
479/// the block median. Returns up to `bits` bits as a byte vector.
480fn soft_hash_image_v0(pixels: &[u8], bits: u32) -> IsccResult<Vec<u8>> {
481    if pixels.len() != 1024 {
482        return Err(IsccError::InvalidInput(format!(
483            "expected 1024 pixels, got {}",
484            pixels.len()
485        )));
486    }
487    if bits > 256 {
488        return Err(IsccError::InvalidInput(format!(
489            "bits must be <= 256, got {bits}"
490        )));
491    }
492
493    // Step 1: Row-wise DCT (32 rows of 32 pixels)
494    let rows: Vec<Vec<f64>> = pixels
495        .chunks(32)
496        .map(|row| {
497            let row_f64: Vec<f64> = row.iter().map(|&p| p as f64).collect();
498            dct::alg_dct(&row_f64)
499        })
500        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
501
502    // Step 2: Transpose
503    let transposed = transpose_matrix(&rows);
504
505    // Step 3: Column-wise DCT
506    let dct_cols: Vec<Vec<f64>> = transposed
507        .iter()
508        .map(|col| dct::alg_dct(col))
509        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
510
511    // Step 4: Transpose back → dct_matrix
512    let dct_matrix = transpose_matrix(&dct_cols);
513
514    // Step 5: Extract 8×8 blocks at positions (0,0), (1,0), (0,1), (1,1)
515    let positions = [(0, 0), (1, 0), (0, 1), (1, 1)];
516    let mut bitstring = Vec::<bool>::with_capacity(256);
517
518    for (col, row) in positions {
519        let flat = flatten_8x8(&dct_matrix, col, row);
520        let median = compute_median(&flat);
521        for val in &flat {
522            bitstring.push(*val > median);
523        }
524        if bitstring.len() >= bits as usize {
525            break;
526        }
527    }
528
529    // Step 6: Convert first `bits` bools to bytes
530    Ok(bits_to_bytes(&bitstring[..bits as usize]))
531}
532
533/// Generate an Image-Code from pixel data.
534///
535/// Produces an ISCC Content-Code for images from a sequence of 1024
536/// grayscale pixel values (32×32, values 0-255) using a DCT-based
537/// perceptual hash.
538pub fn gen_image_code_v0(pixels: &[u8], bits: u32) -> IsccResult<ImageCodeResult> {
539    let hash_digest = soft_hash_image_v0(pixels, bits)?;
540    let component = codec::encode_component(
541        codec::MainType::Content,
542        codec::SubType::Image,
543        codec::Version::V0,
544        bits,
545        &hash_digest,
546    )?;
547    Ok(ImageCodeResult {
548        iscc: format!("ISCC:{component}"),
549    })
550}
551
/// Divide a slice into `n` consecutive parts of near-equal length.
///
/// Behaves like `numpy.array_split` / `more_itertools.divide`: every part
/// holds `len / n` elements and the first `len % n` parts hold one more.
/// When `n` exceeds the slice length the surplus parts are empty. `n == 0`
/// returns an empty vector.
fn array_split<T>(slice: &[T], n: usize) -> Vec<&[T]> {
    if n == 0 {
        return Vec::new();
    }

    let quotient = slice.len() / n;
    let extra = slice.len() % n;

    let mut rest = slice;
    (0..n)
        .map(|index| {
            let take = if index < extra { quotient + 1 } else { quotient };
            let (head, tail) = rest.split_at(take);
            rest = tail;
            head
        })
        .collect()
}
573
574/// Compute a multi-stage SimHash digest from Chromaprint features.
575///
576/// Builds a 32-byte digest by concatenating 4-byte SimHash chunks:
577/// - Stage 1: overall SimHash of all features (4 bytes)
578/// - Stage 2: SimHash of each quarter of features (4 × 4 = 16 bytes)
579/// - Stage 3: SimHash of each third of sorted features (3 × 4 = 12 bytes)
580fn soft_hash_audio_v0(cv: &[i32]) -> Vec<u8> {
581    // Convert each i32 to 4-byte big-endian digest
582    let digests: Vec<[u8; 4]> = cv.iter().map(|&v| v.to_be_bytes()).collect();
583
584    if digests.is_empty() {
585        return vec![0u8; 32];
586    }
587
588    // Stage 1: overall SimHash (4 bytes)
589    let mut parts: Vec<u8> = simhash::alg_simhash_inner(&digests);
590
591    // Stage 2: quarter-based SimHashes (4 × 4 = 16 bytes)
592    let quarters = array_split(&digests, 4);
593    for quarter in &quarters {
594        if quarter.is_empty() {
595            parts.extend_from_slice(&[0u8; 4]);
596        } else {
597            parts.extend_from_slice(&simhash::alg_simhash_inner(quarter));
598        }
599    }
600
601    // Stage 3: sorted-third-based SimHashes (3 × 4 = 12 bytes)
602    let mut sorted_values: Vec<i32> = cv.to_vec();
603    sorted_values.sort();
604    let sorted_digests: Vec<[u8; 4]> = sorted_values.iter().map(|&v| v.to_be_bytes()).collect();
605    let thirds = array_split(&sorted_digests, 3);
606    for third in &thirds {
607        if third.is_empty() {
608            parts.extend_from_slice(&[0u8; 4]);
609        } else {
610            parts.extend_from_slice(&simhash::alg_simhash_inner(third));
611        }
612    }
613
614    parts
615}
616
617/// Generate an Audio-Code from a Chromaprint feature vector.
618///
619/// Produces an ISCC Content-Code for audio from a Chromaprint signed
620/// integer fingerprint vector using multi-stage SimHash.
621pub fn gen_audio_code_v0(cv: &[i32], bits: u32) -> IsccResult<AudioCodeResult> {
622    let hash_digest = soft_hash_audio_v0(cv);
623    let component = codec::encode_component(
624        codec::MainType::Content,
625        codec::SubType::Audio,
626        codec::Version::V0,
627        bits,
628        &hash_digest,
629    )?;
630    Ok(AudioCodeResult {
631        iscc: format!("ISCC:{component}"),
632    })
633}
634
635/// Compute a similarity-preserving hash from video frame signatures.
636///
637/// Deduplicates frame signatures, computes column-wise sums across all
638/// unique frames, then applies WTA-Hash to produce a digest of `bits/8` bytes.
639pub fn soft_hash_video_v0<S: AsRef<[i32]> + Ord>(
640    frame_sigs: &[S],
641    bits: u32,
642) -> IsccResult<Vec<u8>> {
643    if frame_sigs.is_empty() {
644        return Err(IsccError::InvalidInput(
645            "frame_sigs must not be empty".into(),
646        ));
647    }
648
649    // Deduplicate using BTreeSet (S: Ord)
650    let unique: std::collections::BTreeSet<&S> = frame_sigs.iter().collect();
651
652    // Column-wise sum into i64 to avoid overflow
653    let cols = frame_sigs[0].as_ref().len();
654    let mut vecsum = vec![0i64; cols];
655    for sig in &unique {
656        for (c, &val) in sig.as_ref().iter().enumerate() {
657            vecsum[c] += val as i64;
658        }
659    }
660
661    wtahash::alg_wtahash(&vecsum, bits)
662}
663
664/// Generate a Video-Code from frame signature data.
665///
666/// Produces an ISCC Content-Code for video from a sequence of MPEG-7 frame
667/// signatures. Each frame signature is a 380-element integer vector.
668pub fn gen_video_code_v0<S: AsRef<[i32]> + Ord>(
669    frame_sigs: &[S],
670    bits: u32,
671) -> IsccResult<VideoCodeResult> {
672    let digest = soft_hash_video_v0(frame_sigs, bits)?;
673    let component = codec::encode_component(
674        codec::MainType::Content,
675        codec::SubType::Video,
676        codec::Version::V0,
677        bits,
678        &digest,
679    )?;
680    Ok(VideoCodeResult {
681        iscc: format!("ISCC:{component}"),
682    })
683}
684
685/// Combine multiple Content-Code digests into a single similarity hash.
686///
687/// Takes raw decoded ISCC bytes (header + body) for each Content-Code and
688/// produces a SimHash digest. Each input is trimmed to `bits/8` bytes by
689/// keeping the first header byte (encodes type info) plus `nbytes-1` body bytes.
690/// Requires at least 2 codes, all of MainType::Content.
691fn soft_hash_codes_v0(cc_digests: &[Vec<u8>], bits: u32) -> IsccResult<Vec<u8>> {
692    if cc_digests.len() < 2 {
693        return Err(IsccError::InvalidInput(
694            "at least 2 Content-Codes required for mixing".into(),
695        ));
696    }
697
698    let nbytes = (bits / 8) as usize;
699    let mut prepared: Vec<Vec<u8>> = Vec::with_capacity(cc_digests.len());
700
701    for raw in cc_digests {
702        let (mtype, stype, _ver, blen, body) = codec::decode_header(raw)?;
703        if mtype != codec::MainType::Content {
704            return Err(IsccError::InvalidInput(
705                "all codes must be Content-Codes".into(),
706            ));
707        }
708        let unit_bits = codec::decode_length(mtype, blen, stype);
709        if unit_bits < bits {
710            return Err(IsccError::InvalidInput(format!(
711                "Content-Code too short for {bits}-bit length (has {unit_bits} bits)"
712            )));
713        }
714        let mut entry = Vec::with_capacity(nbytes);
715        entry.push(raw[0]); // first byte preserves type info
716        let take = std::cmp::min(nbytes - 1, body.len());
717        entry.extend_from_slice(&body[..take]);
718        // Pad with zeros if body is shorter than nbytes-1
719        while entry.len() < nbytes {
720            entry.push(0);
721        }
722        prepared.push(entry);
723    }
724
725    Ok(simhash::alg_simhash_inner(&prepared))
726}
727
728/// Generate a Mixed-Code from multiple Content-Code strings.
729///
730/// Produces a Mixed Content-Code by combining multiple ISCC Content-Codes
731/// of different types (text, image, audio, video) using SimHash. Input codes
732/// may optionally include the "ISCC:" prefix.
733pub fn gen_mixed_code_v0(codes: &[&str], bits: u32) -> IsccResult<MixedCodeResult> {
734    let decoded: Vec<Vec<u8>> = codes
735        .iter()
736        .map(|code| {
737            let clean = code.strip_prefix("ISCC:").unwrap_or(code);
738            codec::decode_base32(clean)
739        })
740        .collect::<IsccResult<Vec<Vec<u8>>>>()?;
741
742    let digest = soft_hash_codes_v0(&decoded, bits)?;
743
744    let component = codec::encode_component(
745        codec::MainType::Content,
746        codec::SubType::Mixed,
747        codec::Version::V0,
748        bits,
749        &digest,
750    )?;
751
752    Ok(MixedCodeResult {
753        iscc: format!("ISCC:{component}"),
754        parts: codes.iter().map(|s| s.to_string()).collect(),
755    })
756}
757
758/// Generate a Data-Code from raw byte data.
759///
760/// Produces an ISCC Data-Code by splitting data into content-defined chunks,
761/// hashing each chunk with xxh32, and applying MinHash to create a
762/// similarity-preserving fingerprint.
763pub fn gen_data_code_v0(data: &[u8], bits: u32) -> IsccResult<DataCodeResult> {
764    let chunks = cdc::alg_cdc_chunks(data, false, cdc::DATA_AVG_CHUNK_SIZE);
765    let mut features: Vec<u32> = chunks
766        .iter()
767        .map(|chunk| xxhash_rust::xxh32::xxh32(chunk, 0))
768        .collect();
769
770    // Defensive: ensure at least one feature (alg_cdc_chunks guarantees >= 1 chunk)
771    if features.is_empty() {
772        features.push(xxhash_rust::xxh32::xxh32(b"", 0));
773    }
774
775    let digest = minhash::alg_minhash_256(&features);
776    let component = codec::encode_component(
777        codec::MainType::Data,
778        codec::SubType::None,
779        codec::Version::V0,
780        bits,
781        &digest,
782    )?;
783
784    Ok(DataCodeResult {
785        iscc: format!("ISCC:{component}"),
786    })
787}
788
789/// Generate an Instance-Code from raw byte data.
790///
791/// Produces an ISCC Instance-Code by hashing the complete byte stream
792/// with BLAKE3. Captures the exact binary identity of the data.
793pub fn gen_instance_code_v0(data: &[u8], bits: u32) -> IsccResult<InstanceCodeResult> {
794    let digest = blake3::hash(data);
795    let datahash = utils::multi_hash_blake3(data);
796    let filesize = data.len() as u64;
797    let component = codec::encode_component(
798        codec::MainType::Instance,
799        codec::SubType::None,
800        codec::Version::V0,
801        bits,
802        digest.as_bytes(),
803    )?;
804    Ok(InstanceCodeResult {
805        iscc: format!("ISCC:{component}"),
806        datahash,
807        filesize,
808    })
809}
810
811/// Generate a composite ISCC-CODE from individual ISCC unit codes.
812///
813/// Combines multiple ISCC unit codes (Meta-Code, Content-Code, Data-Code,
814/// Instance-Code) into a single composite ISCC-CODE. Input codes may
815/// optionally include the "ISCC:" prefix. At least Data-Code and
816/// Instance-Code are required. When `wide` is true and exactly two
817/// 128-bit+ codes (Data + Instance) are provided, produces a 256-bit
818/// wide-mode code.
819pub fn gen_iscc_code_v0(codes: &[&str], wide: bool) -> IsccResult<IsccCodeResult> {
820    // Step 1: Clean inputs — strip "ISCC:" prefix
821    let cleaned: Vec<&str> = codes
822        .iter()
823        .map(|c| c.strip_prefix("ISCC:").unwrap_or(c))
824        .collect();
825
826    // Step 2: Validate minimum count
827    if cleaned.len() < 2 {
828        return Err(IsccError::InvalidInput(
829            "at least 2 ISCC unit codes required".into(),
830        ));
831    }
832
833    // Step 3: Validate minimum length (16 base32 chars = 64-bit minimum)
834    for code in &cleaned {
835        if code.len() < 16 {
836            return Err(IsccError::InvalidInput(format!(
837                "ISCC unit code too short (min 16 chars): {code}"
838            )));
839        }
840    }
841
842    // Step 4: Decode each code
843    let mut decoded: Vec<(
844        codec::MainType,
845        codec::SubType,
846        codec::Version,
847        u32,
848        Vec<u8>,
849    )> = Vec::with_capacity(cleaned.len());
850    for code in &cleaned {
851        let raw = codec::decode_base32(code)?;
852        let header = codec::decode_header(&raw)?;
853        decoded.push(header);
854    }
855
856    // Step 5: Sort by MainType (ascending)
857    decoded.sort_by_key(|&(mt, ..)| mt);
858
859    // Step 6: Extract main_types
860    let main_types: Vec<codec::MainType> = decoded.iter().map(|&(mt, ..)| mt).collect();
861
862    // Step 7: Validate last two are Data + Instance (mandatory)
863    let n = main_types.len();
864    if main_types[n - 2] != codec::MainType::Data || main_types[n - 1] != codec::MainType::Instance
865    {
866        return Err(IsccError::InvalidInput(
867            "Data-Code and Instance-Code are mandatory".into(),
868        ));
869    }
870
871    // Step 8: Determine wide composite
872    let is_wide = wide
873        && decoded.len() == 2
874        && main_types == [codec::MainType::Data, codec::MainType::Instance]
875        && decoded
876            .iter()
877            .all(|&(mt, st, _, len, _)| codec::decode_length(mt, len, st) >= 128);
878
879    // Step 9: Determine SubType
880    let st = if is_wide {
881        codec::SubType::Wide
882    } else {
883        // Collect SubTypes of Semantic/Content units
884        let sc_subtypes: Vec<codec::SubType> = decoded
885            .iter()
886            .filter(|&&(mt, ..)| mt == codec::MainType::Semantic || mt == codec::MainType::Content)
887            .map(|&(_, st, ..)| st)
888            .collect();
889
890        if !sc_subtypes.is_empty() {
891            // All must be the same
892            let first = sc_subtypes[0];
893            if sc_subtypes.iter().all(|&s| s == first) {
894                first
895            } else {
896                return Err(IsccError::InvalidInput(
897                    "mixed SubTypes among Content/Semantic units".into(),
898                ));
899            }
900        } else if decoded.len() == 2 {
901            codec::SubType::Sum
902        } else {
903            codec::SubType::IsccNone
904        }
905    };
906
907    // Step 10–11: Get optional MainTypes and encode
908    let optional_types = &main_types[..n - 2];
909    let encoded_length = codec::encode_units(optional_types)?;
910
911    // Step 12: Build digest body
912    let bytes_per_unit = if is_wide { 16 } else { 8 };
913    let mut digest = Vec::with_capacity(decoded.len() * bytes_per_unit);
914    for (_, _, _, _, tail) in &decoded {
915        let take = bytes_per_unit.min(tail.len());
916        digest.extend_from_slice(&tail[..take]);
917    }
918
919    // Step 13–14: Encode header + digest as base32
920    let header = codec::encode_header(
921        codec::MainType::Iscc,
922        st,
923        codec::Version::V0,
924        encoded_length,
925    )?;
926    let mut code_bytes = header;
927    code_bytes.extend_from_slice(&digest);
928    let code = codec::encode_base32(&code_bytes);
929
930    // Step 15: Return with prefix
931    Ok(IsccCodeResult {
932        iscc: format!("ISCC:{code}"),
933    })
934}
935
936#[cfg(test)]
937mod tests {
938    use super::*;
939
940    #[test]
941    fn test_gen_meta_code_v0_title_only() {
942        let result = gen_meta_code_v0("Die Unendliche Geschichte", None, None, 64).unwrap();
943        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU74YAZIM");
944        assert_eq!(result.name, "Die Unendliche Geschichte");
945        assert_eq!(result.description, None);
946        assert_eq!(result.meta, None);
947    }
948
949    #[test]
950    fn test_gen_meta_code_v0_title_description() {
951        let result = gen_meta_code_v0(
952            "Die Unendliche Geschichte",
953            Some("Von Michael Ende"),
954            None,
955            64,
956        )
957        .unwrap();
958        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU4E45RB5");
959        assert_eq!(result.name, "Die Unendliche Geschichte");
960        assert_eq!(result.description, Some("Von Michael Ende".to_string()));
961        assert_eq!(result.meta, None);
962    }
963
964    #[test]
965    fn test_gen_meta_code_v0_json_meta() {
966        let result = gen_meta_code_v0("Hello", None, Some(r#"{"some":"object"}"#), 64).unwrap();
967        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN63LHL2");
968        assert!(result.meta.is_some());
969        assert!(
970            result
971                .meta
972                .unwrap()
973                .starts_with("data:application/json;base64,")
974        );
975    }
976
977    #[test]
978    fn test_gen_meta_code_v0_data_url_meta() {
979        let result = gen_meta_code_v0(
980            "Hello",
981            None,
982            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9"),
983            64,
984        )
985        .unwrap();
986        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN43ICP2");
987        // Data-URL is passed through as-is
988        assert_eq!(
989            result.meta,
990            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9".to_string())
991        );
992    }
993
994    /// Verify that JSON metadata with float values is canonicalized per RFC 8785 (JCS).
995    ///
996    /// JCS serializes `1.0` as `1` (integer form), while `serde_json` preserves `1.0`.
997    /// This causes different canonical bytes, different metahash, and different ISCC codes.
998    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1.0})`.
999    #[test]
1000    fn test_gen_meta_code_v0_jcs_float_canonicalization() {
1001        // JCS canonicalizes {"value": 1.0} → {"value":1} (integer form)
1002        // serde_json produces {"value":1.0} (preserves float notation)
1003        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1.0}"#), 64).unwrap();
1004
1005        // Expected values from iscc-core (Python) using jcs.canonicalize()
1006        assert_eq!(
1007            result.iscc, "ISCC:AAAX4GX3RZH2I6QZ",
1008            "ISCC mismatch: parse_meta_json must use RFC 8785 (JCS) canonicalization"
1009        );
1010        assert_eq!(
1011            result.meta,
1012            Some("data:application/json;base64,eyJ2YWx1ZSI6MX0=".to_string()),
1013            "meta Data-URL mismatch: JCS should serialize 1.0 as 1"
1014        );
1015        assert_eq!(
1016            result.metahash, "1e2010b291d392b6999ffe4aa4661fb343fc371fca3bfb5bb4e8d8226fdf85743232",
1017            "metahash mismatch: canonical bytes differ between JCS and serde_json"
1018        );
1019    }
1020
1021    /// Verify JCS number formatting for large floats (scientific notation edge case).
1022    ///
1023    /// JCS serializes `1e20` as `100000000000000000000` (expanded integer form).
1024    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1e20})`.
1025    #[test]
1026    fn test_gen_meta_code_v0_jcs_large_float_canonicalization() {
1027        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1e20}"#), 64).unwrap();
1028
1029        assert_eq!(
1030            result.iscc, "ISCC:AAAX4GX3R32YH5P7",
1031            "ISCC mismatch: JCS should expand 1e20 to 100000000000000000000"
1032        );
1033        assert_eq!(
1034            result.meta,
1035            Some(
1036                "data:application/json;base64,eyJ2YWx1ZSI6MTAwMDAwMDAwMDAwMDAwMDAwMDAwfQ=="
1037                    .to_string()
1038            ),
1039            "meta Data-URL mismatch: JCS should expand large float to integer form"
1040        );
1041        assert_eq!(
1042            result.metahash, "1e201ff83c1822c348717658a0b4713739646da7c59832691b337a457416ddd1c73d",
1043            "metahash mismatch: canonical bytes differ for large float"
1044        );
1045    }
1046
1047    #[test]
1048    fn test_gen_meta_code_v0_invalid_json() {
1049        assert!(matches!(
1050            gen_meta_code_v0("test", None, Some("not json"), 64),
1051            Err(IsccError::InvalidInput(_))
1052        ));
1053    }
1054
1055    #[test]
1056    fn test_gen_meta_code_v0_invalid_data_url() {
1057        assert!(matches!(
1058            gen_meta_code_v0("test", None, Some("data:no-comma-here"), 64),
1059            Err(IsccError::InvalidInput(_))
1060        ));
1061    }
1062
1063    #[test]
1064    fn test_gen_meta_code_v0_conformance() {
1065        let json_str = include_str!("../tests/data.json");
1066        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1067        let section = &data["gen_meta_code_v0"];
1068        let cases = section.as_object().unwrap();
1069
1070        let mut tested = 0;
1071
1072        for (tc_name, tc) in cases {
1073            let inputs = tc["inputs"].as_array().unwrap();
1074            let input_name = inputs[0].as_str().unwrap();
1075            let input_desc = inputs[1].as_str().unwrap();
1076            let meta_val = &inputs[2];
1077            let bits = inputs[3].as_u64().unwrap() as u32;
1078
1079            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1080            let expected_metahash = tc["outputs"]["metahash"].as_str().unwrap();
1081
1082            // Dispatch meta parameter based on JSON value type
1083            let meta_arg: Option<String> = match meta_val {
1084                serde_json::Value::Null => None,
1085                serde_json::Value::String(s) => Some(s.clone()),
1086                serde_json::Value::Object(_) => Some(serde_json::to_string(meta_val).unwrap()),
1087                other => panic!("unexpected meta type in {tc_name}: {other:?}"),
1088            };
1089
1090            let desc = if input_desc.is_empty() {
1091                None
1092            } else {
1093                Some(input_desc)
1094            };
1095
1096            // Verify ISCC output from struct
1097            let result = gen_meta_code_v0(input_name, desc, meta_arg.as_deref(), bits)
1098                .unwrap_or_else(|e| panic!("gen_meta_code_v0 failed for {tc_name}: {e}"));
1099            assert_eq!(
1100                result.iscc, expected_iscc,
1101                "ISCC mismatch in test case {tc_name}"
1102            );
1103
1104            // Verify metahash from struct
1105            assert_eq!(
1106                result.metahash, expected_metahash,
1107                "metahash mismatch in test case {tc_name}"
1108            );
1109
1110            // Verify name from struct
1111            if let Some(expected_name) = tc["outputs"].get("name") {
1112                let expected_name = expected_name.as_str().unwrap();
1113                assert_eq!(
1114                    result.name, expected_name,
1115                    "name mismatch in test case {tc_name}"
1116                );
1117            }
1118
1119            // Verify description from struct
1120            if let Some(expected_desc) = tc["outputs"].get("description") {
1121                let expected_desc = expected_desc.as_str().unwrap();
1122                assert_eq!(
1123                    result.description.as_deref(),
1124                    Some(expected_desc),
1125                    "description mismatch in test case {tc_name}"
1126                );
1127            }
1128
1129            // Verify meta from struct
1130            if meta_arg.is_some() {
1131                assert!(
1132                    result.meta.is_some(),
1133                    "meta should be present in test case {tc_name}"
1134                );
1135            } else {
1136                assert!(
1137                    result.meta.is_none(),
1138                    "meta should be absent in test case {tc_name}"
1139                );
1140            }
1141
1142            tested += 1;
1143        }
1144
1145        assert_eq!(tested, 16, "expected 16 conformance tests to run");
1146    }
1147
1148    #[test]
1149    fn test_gen_text_code_v0_empty() {
1150        let result = gen_text_code_v0("", 64).unwrap();
1151        assert_eq!(result.iscc, "ISCC:EAASL4F2WZY7KBXB");
1152        assert_eq!(result.characters, 0);
1153    }
1154
1155    #[test]
1156    fn test_gen_text_code_v0_hello_world() {
1157        let result = gen_text_code_v0("Hello World", 64).unwrap();
1158        assert_eq!(result.iscc, "ISCC:EAASKDNZNYGUUF5A");
1159        assert_eq!(result.characters, 10); // "helloworld" after collapse
1160    }
1161
1162    #[test]
1163    fn test_gen_text_code_v0_conformance() {
1164        let json_str = include_str!("../tests/data.json");
1165        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1166        let section = &data["gen_text_code_v0"];
1167        let cases = section.as_object().unwrap();
1168
1169        let mut tested = 0;
1170
1171        for (tc_name, tc) in cases {
1172            let inputs = tc["inputs"].as_array().unwrap();
1173            let input_text = inputs[0].as_str().unwrap();
1174            let bits = inputs[1].as_u64().unwrap() as u32;
1175
1176            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1177            let expected_chars = tc["outputs"]["characters"].as_u64().unwrap() as usize;
1178
1179            // Verify ISCC output from struct
1180            let result = gen_text_code_v0(input_text, bits)
1181                .unwrap_or_else(|e| panic!("gen_text_code_v0 failed for {tc_name}: {e}"));
1182            assert_eq!(
1183                result.iscc, expected_iscc,
1184                "ISCC mismatch in test case {tc_name}"
1185            );
1186
1187            // Verify character count from struct
1188            assert_eq!(
1189                result.characters, expected_chars,
1190                "character count mismatch in test case {tc_name}"
1191            );
1192
1193            tested += 1;
1194        }
1195
1196        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1197    }
1198
1199    #[test]
1200    fn test_gen_image_code_v0_all_black() {
1201        let pixels = vec![0u8; 1024];
1202        let result = gen_image_code_v0(&pixels, 64).unwrap();
1203        assert_eq!(result.iscc, "ISCC:EEAQAAAAAAAAAAAA");
1204    }
1205
1206    #[test]
1207    fn test_gen_image_code_v0_all_white() {
1208        let pixels = vec![255u8; 1024];
1209        let result = gen_image_code_v0(&pixels, 128).unwrap();
1210        assert_eq!(result.iscc, "ISCC:EEBYAAAAAAAAAAAAAAAAAAAAAAAAA");
1211    }
1212
1213    #[test]
1214    fn test_gen_image_code_v0_invalid_pixel_count() {
1215        assert!(gen_image_code_v0(&[0u8; 100], 64).is_err());
1216    }
1217
1218    #[test]
1219    fn test_gen_image_code_v0_conformance() {
1220        let json_str = include_str!("../tests/data.json");
1221        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1222        let section = &data["gen_image_code_v0"];
1223        let cases = section.as_object().unwrap();
1224
1225        let mut tested = 0;
1226
1227        for (tc_name, tc) in cases {
1228            let inputs = tc["inputs"].as_array().unwrap();
1229            let pixels_json = inputs[0].as_array().unwrap();
1230            let bits = inputs[1].as_u64().unwrap() as u32;
1231            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1232
1233            let pixels: Vec<u8> = pixels_json
1234                .iter()
1235                .map(|v| v.as_u64().unwrap() as u8)
1236                .collect();
1237
1238            let result = gen_image_code_v0(&pixels, bits)
1239                .unwrap_or_else(|e| panic!("gen_image_code_v0 failed for {tc_name}: {e}"));
1240            assert_eq!(
1241                result.iscc, expected_iscc,
1242                "ISCC mismatch in test case {tc_name}"
1243            );
1244
1245            tested += 1;
1246        }
1247
1248        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1249    }
1250
1251    #[test]
1252    fn test_gen_audio_code_v0_empty() {
1253        let result = gen_audio_code_v0(&[], 64).unwrap();
1254        assert_eq!(result.iscc, "ISCC:EIAQAAAAAAAAAAAA");
1255    }
1256
1257    #[test]
1258    fn test_gen_audio_code_v0_single() {
1259        let result = gen_audio_code_v0(&[1], 128).unwrap();
1260        assert_eq!(result.iscc, "ISCC:EIBQAAAAAEAAAAABAAAAAAAAAAAAA");
1261    }
1262
1263    #[test]
1264    fn test_gen_audio_code_v0_negative() {
1265        let result = gen_audio_code_v0(&[-1, 0, 1], 256).unwrap();
1266        assert_eq!(
1267            result.iscc,
1268            "ISCC:EIDQAAAAAH777777AAAAAAAAAAAACAAAAAAP777774AAAAAAAAAAAAI"
1269        );
1270    }
1271
1272    #[test]
1273    fn test_gen_audio_code_v0_conformance() {
1274        let json_str = include_str!("../tests/data.json");
1275        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1276        let section = &data["gen_audio_code_v0"];
1277        let cases = section.as_object().unwrap();
1278
1279        let mut tested = 0;
1280
1281        for (tc_name, tc) in cases {
1282            let inputs = tc["inputs"].as_array().unwrap();
1283            let cv_json = inputs[0].as_array().unwrap();
1284            let bits = inputs[1].as_u64().unwrap() as u32;
1285            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1286
1287            let cv: Vec<i32> = cv_json.iter().map(|v| v.as_i64().unwrap() as i32).collect();
1288
1289            let result = gen_audio_code_v0(&cv, bits)
1290                .unwrap_or_else(|e| panic!("gen_audio_code_v0 failed for {tc_name}: {e}"));
1291            assert_eq!(
1292                result.iscc, expected_iscc,
1293                "ISCC mismatch in test case {tc_name}"
1294            );
1295
1296            tested += 1;
1297        }
1298
1299        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1300    }
1301
1302    #[test]
1303    fn test_array_split_even() {
1304        let data = vec![1, 2, 3, 4];
1305        let parts = array_split(&data, 4);
1306        assert_eq!(parts, vec![&[1][..], &[2][..], &[3][..], &[4][..]]);
1307    }
1308
1309    #[test]
1310    fn test_array_split_remainder() {
1311        let data = vec![1, 2, 3, 4, 5];
1312        let parts = array_split(&data, 3);
1313        assert_eq!(parts, vec![&[1, 2][..], &[3, 4][..], &[5][..]]);
1314    }
1315
1316    #[test]
1317    fn test_array_split_more_parts_than_elements() {
1318        let data = vec![1, 2];
1319        let parts = array_split(&data, 4);
1320        assert_eq!(
1321            parts,
1322            vec![&[1][..], &[2][..], &[][..] as &[i32], &[][..] as &[i32]]
1323        );
1324    }
1325
1326    #[test]
1327    fn test_array_split_empty() {
1328        let data: Vec<i32> = vec![];
1329        let parts = array_split(&data, 3);
1330        assert_eq!(
1331            parts,
1332            vec![&[][..] as &[i32], &[][..] as &[i32], &[][..] as &[i32]]
1333        );
1334    }
1335
1336    #[test]
1337    fn test_gen_video_code_v0_empty_frames() {
1338        let frames: Vec<Vec<i32>> = vec![];
1339        assert!(matches!(
1340            gen_video_code_v0(&frames, 64),
1341            Err(IsccError::InvalidInput(_))
1342        ));
1343    }
1344
1345    #[test]
1346    fn test_gen_video_code_v0_conformance() {
1347        let json_str = include_str!("../tests/data.json");
1348        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1349        let section = &data["gen_video_code_v0"];
1350        let cases = section.as_object().unwrap();
1351
1352        let mut tested = 0;
1353
1354        for (tc_name, tc) in cases {
1355            let inputs = tc["inputs"].as_array().unwrap();
1356            let frames_json = inputs[0].as_array().unwrap();
1357            let bits = inputs[1].as_u64().unwrap() as u32;
1358            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1359
1360            let frame_sigs: Vec<Vec<i32>> = frames_json
1361                .iter()
1362                .map(|frame| {
1363                    frame
1364                        .as_array()
1365                        .unwrap()
1366                        .iter()
1367                        .map(|v| v.as_i64().unwrap() as i32)
1368                        .collect()
1369                })
1370                .collect();
1371
1372            let result = gen_video_code_v0(&frame_sigs, bits)
1373                .unwrap_or_else(|e| panic!("gen_video_code_v0 failed for {tc_name}: {e}"));
1374            assert_eq!(
1375                result.iscc, expected_iscc,
1376                "ISCC mismatch in test case {tc_name}"
1377            );
1378
1379            tested += 1;
1380        }
1381
1382        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1383    }
1384
1385    #[test]
1386    fn test_gen_mixed_code_v0_conformance() {
1387        let json_str = include_str!("../tests/data.json");
1388        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1389        let section = &data["gen_mixed_code_v0"];
1390        let cases = section.as_object().unwrap();
1391
1392        let mut tested = 0;
1393
1394        for (tc_name, tc) in cases {
1395            let inputs = tc["inputs"].as_array().unwrap();
1396            let codes_json = inputs[0].as_array().unwrap();
1397            let bits = inputs[1].as_u64().unwrap() as u32;
1398            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1399            let expected_parts: Vec<&str> = tc["outputs"]["parts"]
1400                .as_array()
1401                .unwrap()
1402                .iter()
1403                .map(|v| v.as_str().unwrap())
1404                .collect();
1405
1406            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();
1407
1408            let result = gen_mixed_code_v0(&codes, bits)
1409                .unwrap_or_else(|e| panic!("gen_mixed_code_v0 failed for {tc_name}: {e}"));
1410            assert_eq!(
1411                result.iscc, expected_iscc,
1412                "ISCC mismatch in test case {tc_name}"
1413            );
1414
1415            // Verify parts from struct match expected
1416            let result_parts: Vec<&str> = result.parts.iter().map(|s| s.as_str()).collect();
1417            assert_eq!(
1418                result_parts, expected_parts,
1419                "parts mismatch in test case {tc_name}"
1420            );
1421
1422            tested += 1;
1423        }
1424
1425        assert_eq!(tested, 2, "expected 2 conformance tests to run");
1426    }
1427
1428    #[test]
1429    fn test_gen_mixed_code_v0_too_few_codes() {
1430        assert!(matches!(
1431            gen_mixed_code_v0(&["EUA6GIKXN42IQV3S"], 64),
1432            Err(IsccError::InvalidInput(_))
1433        ));
1434    }
1435
1436    /// Build raw Content-Code bytes (header + body) for a given bit length.
1437    fn make_content_code_raw(stype: codec::SubType, bit_length: u32) -> Vec<u8> {
1438        let nbytes = (bit_length / 8) as usize;
1439        let body: Vec<u8> = (0..nbytes).map(|i| (i & 0xFF) as u8).collect();
1440        let base32 = codec::encode_component(
1441            codec::MainType::Content,
1442            stype,
1443            codec::Version::V0,
1444            bit_length,
1445            &body,
1446        )
1447        .unwrap();
1448        codec::decode_base32(&base32).unwrap()
1449    }
1450
1451    #[test]
1452    fn test_soft_hash_codes_v0_rejects_short_code() {
1453        // One code with 64 bits, one with only 32 bits — should reject when requesting 64
1454        let code_64 = make_content_code_raw(codec::SubType::None, 64);
1455        let code_32 = make_content_code_raw(codec::SubType::Image, 32);
1456        let result = soft_hash_codes_v0(&[code_64, code_32], 64);
1457        assert!(
1458            matches!(&result, Err(IsccError::InvalidInput(msg)) if msg.contains("too short")),
1459            "expected InvalidInput with 'too short', got {result:?}"
1460        );
1461    }
1462
1463    #[test]
1464    fn test_soft_hash_codes_v0_accepts_exact_length() {
1465        // Two codes with exactly 64 bits each — should succeed when requesting 64
1466        let code_a = make_content_code_raw(codec::SubType::None, 64);
1467        let code_b = make_content_code_raw(codec::SubType::Image, 64);
1468        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1469        assert!(result.is_ok(), "expected Ok, got {result:?}");
1470    }
1471
1472    #[test]
1473    fn test_soft_hash_codes_v0_accepts_longer_codes() {
1474        // Two codes with 128 bits each — should succeed when requesting 64
1475        let code_a = make_content_code_raw(codec::SubType::None, 128);
1476        let code_b = make_content_code_raw(codec::SubType::Audio, 128);
1477        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1478        assert!(result.is_ok(), "expected Ok, got {result:?}");
1479    }
1480
1481    #[test]
1482    fn test_gen_data_code_v0_conformance() {
1483        let json_str = include_str!("../tests/data.json");
1484        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1485        let section = &data["gen_data_code_v0"];
1486        let cases = section.as_object().unwrap();
1487
1488        let mut tested = 0;
1489
1490        for (tc_name, tc) in cases {
1491            let inputs = tc["inputs"].as_array().unwrap();
1492            let stream_str = inputs[0].as_str().unwrap();
1493            let bits = inputs[1].as_u64().unwrap() as u32;
1494            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1495
1496            // Parse "stream:" prefix — remainder is hex-encoded bytes
1497            let hex_data = stream_str
1498                .strip_prefix("stream:")
1499                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {tc_name}"));
1500            let input_bytes = hex::decode(hex_data)
1501                .unwrap_or_else(|e| panic!("invalid hex in test case {tc_name}: {e}"));
1502
1503            let result = gen_data_code_v0(&input_bytes, bits)
1504                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {tc_name}: {e}"));
1505            assert_eq!(
1506                result.iscc, expected_iscc,
1507                "ISCC mismatch in test case {tc_name}"
1508            );
1509
1510            tested += 1;
1511        }
1512
1513        assert_eq!(tested, 4, "expected 4 conformance tests to run");
1514    }
1515
1516    #[test]
1517    fn test_gen_instance_code_v0_empty() {
1518        let result = gen_instance_code_v0(b"", 64).unwrap();
1519        assert_eq!(result.iscc, "ISCC:IAA26E2JXH27TING");
1520        assert_eq!(result.filesize, 0);
1521        assert_eq!(
1522            result.datahash,
1523            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
1524        );
1525    }
1526
1527    #[test]
1528    fn test_gen_instance_code_v0_conformance() {
1529        let json_str = include_str!("../tests/data.json");
1530        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1531        let section = &data["gen_instance_code_v0"];
1532        let cases = section.as_object().unwrap();
1533
1534        for (name, tc) in cases {
1535            let inputs = tc["inputs"].as_array().unwrap();
1536            let stream_str = inputs[0].as_str().unwrap();
1537            let bits = inputs[1].as_u64().unwrap() as u32;
1538            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1539
1540            // Parse "stream:" prefix — remainder is hex-encoded bytes
1541            let hex_data = stream_str
1542                .strip_prefix("stream:")
1543                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
1544            let input_bytes = hex::decode(hex_data)
1545                .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));
1546
1547            let result = gen_instance_code_v0(&input_bytes, bits)
1548                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));
1549            assert_eq!(
1550                result.iscc, expected_iscc,
1551                "ISCC mismatch in test case {name}"
1552            );
1553
1554            // Verify datahash from struct
1555            if let Some(expected_datahash) = tc["outputs"].get("datahash") {
1556                let expected_datahash = expected_datahash.as_str().unwrap();
1557                assert_eq!(
1558                    result.datahash, expected_datahash,
1559                    "datahash mismatch in test case {name}"
1560                );
1561            }
1562
1563            // Verify filesize from struct
1564            if let Some(expected_filesize) = tc["outputs"].get("filesize") {
1565                let expected_filesize = expected_filesize.as_u64().unwrap();
1566                assert_eq!(
1567                    result.filesize, expected_filesize,
1568                    "filesize mismatch in test case {name}"
1569                );
1570            }
1571
1572            // Also verify filesize matches input data length
1573            assert_eq!(
1574                result.filesize,
1575                input_bytes.len() as u64,
1576                "filesize should match input length in test case {name}"
1577            );
1578        }
1579    }
1580
1581    #[test]
1582    fn test_gen_iscc_code_v0_conformance() {
1583        let json_str = include_str!("../tests/data.json");
1584        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1585        let section = &data["gen_iscc_code_v0"];
1586        let cases = section.as_object().unwrap();
1587
1588        let mut tested = 0;
1589
1590        for (tc_name, tc) in cases {
1591            let inputs = tc["inputs"].as_array().unwrap();
1592            let codes_json = inputs[0].as_array().unwrap();
1593            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1594
1595            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();
1596
1597            let result = gen_iscc_code_v0(&codes, false)
1598                .unwrap_or_else(|e| panic!("gen_iscc_code_v0 failed for {tc_name}: {e}"));
1599            assert_eq!(
1600                result.iscc, expected_iscc,
1601                "ISCC mismatch in test case {tc_name}"
1602            );
1603
1604            tested += 1;
1605        }
1606
1607        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1608    }
1609
1610    #[test]
1611    fn test_gen_iscc_code_v0_too_few_codes() {
1612        assert!(matches!(
1613            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG"], false),
1614            Err(IsccError::InvalidInput(_))
1615        ));
1616    }
1617
1618    #[test]
1619    fn test_gen_iscc_code_v0_missing_instance() {
1620        // Two Meta codes — missing Data and Instance
1621        assert!(matches!(
1622            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG", "AAAWKLHFPV6OPKDG"], false),
1623            Err(IsccError::InvalidInput(_))
1624        ));
1625    }
1626
1627    #[test]
1628    fn test_gen_iscc_code_v0_short_code() {
1629        // Code too short (< 16 chars)
1630        assert!(matches!(
1631            gen_iscc_code_v0(&["AAAWKLHFPV6", "AAAWKLHFPV6OPKDG"], false),
1632            Err(IsccError::InvalidInput(_))
1633        ));
1634    }
1635
1636    /// Verify that a Data-URL with empty base64 payload enters the meta bytes path.
1637    ///
1638    /// Python reference: `if meta:` is truthy for `"data:application/json;base64,"` (non-empty
1639    /// string), so it enters the meta branch with `payload = b""`. The result must have
1640    /// `meta = Some(...)` containing the original Data-URL and `metahash` equal to
1641    /// `multi_hash_blake3(&[])` (BLAKE3 of empty bytes).
1642    #[test]
1643    fn test_gen_meta_code_empty_data_url_enters_meta_branch() {
1644        let result =
1645            gen_meta_code_v0("Test", None, Some("data:application/json;base64,"), 64).unwrap();
1646
1647        // Result should be Ok
1648        assert_eq!(result.name, "Test");
1649
1650        // meta should contain the original Data-URL string (not None)
1651        assert_eq!(
1652            result.meta,
1653            Some("data:application/json;base64,".to_string()),
1654            "empty Data-URL payload should still enter meta branch"
1655        );
1656
1657        // metahash should be BLAKE3 of empty bytes
1658        let expected_metahash = utils::multi_hash_blake3(&[]);
1659        assert_eq!(
1660            result.metahash, expected_metahash,
1661            "metahash should be BLAKE3 of empty bytes"
1662        );
1663    }
1664
1665    /// Verify that `soft_hash_meta_v0_with_bytes` with empty bytes produces the same
1666    /// digest as `soft_hash_meta_v0` with no extra text.
1667    ///
1668    /// Python reference (`code_meta.py:142`): `if extra in {None, "", b""}:` returns
1669    /// name-only simhash without interleaving for all empty-like values.
1670    #[test]
1671    fn test_soft_hash_meta_v0_with_bytes_empty_equals_name_only() {
1672        let name_only = soft_hash_meta_v0("test", None);
1673        let empty_bytes = soft_hash_meta_v0_with_bytes("test", &[]);
1674        assert_eq!(
1675            name_only, empty_bytes,
1676            "empty bytes should produce same digest as name-only (no interleaving)"
1677        );
1678    }
1679
1680    // ---- Algorithm constants tests ----
1681
1682    #[test]
1683    fn test_meta_trim_name_value() {
1684        assert_eq!(META_TRIM_NAME, 128);
1685    }
1686
1687    #[test]
1688    fn test_meta_trim_description_value() {
1689        assert_eq!(META_TRIM_DESCRIPTION, 4096);
1690    }
1691
1692    #[test]
1693    fn test_io_read_size_value() {
1694        assert_eq!(IO_READ_SIZE, 4_194_304);
1695    }
1696
1697    #[test]
1698    fn test_text_ngram_size_value() {
1699        assert_eq!(TEXT_NGRAM_SIZE, 13);
1700    }
1701
1702    // ---- encode_component Tier 1 wrapper tests ----
1703
1704    /// Encode a known digest and verify the output matches the codec version.
1705    #[test]
1706    fn test_encode_component_matches_codec() {
1707        let digest = [0xABu8; 8];
1708        let tier1 = encode_component(3, 0, 0, 64, &digest).unwrap();
1709        let tier2 = codec::encode_component(
1710            codec::MainType::Data,
1711            codec::SubType::None,
1712            codec::Version::V0,
1713            64,
1714            &digest,
1715        )
1716        .unwrap();
1717        assert_eq!(tier1, tier2);
1718    }
1719
1720    /// Round-trip: encode a digest and verify the result is a valid ISCC unit.
1721    #[test]
1722    fn test_encode_component_round_trip() {
1723        let digest = [0x42u8; 32];
1724        let result = encode_component(0, 0, 0, 64, &digest).unwrap();
1725        // Meta-Code with 64-bit digest should start with "AA"
1726        assert!(!result.is_empty());
1727    }
1728
1729    /// Reject MainType::Iscc (value 5).
1730    #[test]
1731    fn test_encode_component_rejects_iscc() {
1732        let result = encode_component(5, 0, 0, 64, &[0u8; 8]);
1733        assert!(result.is_err());
1734    }
1735
1736    /// Reject digest shorter than bit_length / 8.
1737    #[test]
1738    fn test_encode_component_rejects_short_digest() {
1739        let result = encode_component(0, 0, 0, 64, &[0u8; 4]);
1740        assert!(result.is_err());
1741        let err = result.unwrap_err().to_string();
1742        assert!(
1743            err.contains("digest length 4 < bit_length/8 (8)"),
1744            "unexpected error: {err}"
1745        );
1746    }
1747
1748    /// Reject invalid MainType value.
1749    #[test]
1750    fn test_encode_component_rejects_invalid_mtype() {
1751        let result = encode_component(99, 0, 0, 64, &[0u8; 8]);
1752        assert!(result.is_err());
1753    }
1754
1755    /// Reject invalid SubType value.
1756    #[test]
1757    fn test_encode_component_rejects_invalid_stype() {
1758        let result = encode_component(0, 99, 0, 64, &[0u8; 8]);
1759        assert!(result.is_err());
1760    }
1761
1762    /// Reject invalid Version value.
1763    #[test]
1764    fn test_encode_component_rejects_invalid_version() {
1765        let result = encode_component(0, 0, 99, 64, &[0u8; 8]);
1766        assert!(result.is_err());
1767    }
1768
1769    // ---- iscc_decode tests ----
1770
1771    /// Round-trip: encode a Meta-Code digest, decode back, verify all fields match.
1772    #[test]
1773    fn test_iscc_decode_round_trip_meta() {
1774        let digest = [0xaa_u8; 8];
1775        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1776        let (mt, st, vs, li, decoded_digest) = iscc_decode(&encoded).unwrap();
1777        assert_eq!(mt, 0, "MainType::Meta");
1778        assert_eq!(st, 0, "SubType::None");
1779        assert_eq!(vs, 0, "Version::V0");
1780        // encode_length(Meta, 64) → 64/32 - 1 = 1
1781        assert_eq!(li, 1, "length_index");
1782        assert_eq!(decoded_digest, digest.to_vec());
1783    }
1784
1785    /// Round-trip with Content-Code (MainType=2, SubType::TEXT=0).
1786    #[test]
1787    fn test_iscc_decode_round_trip_content() {
1788        let digest = [0xbb_u8; 8];
1789        let encoded = encode_component(2, 0, 0, 64, &digest).unwrap();
1790        let (mt, st, vs, _li, decoded_digest) = iscc_decode(&encoded).unwrap();
1791        assert_eq!(mt, 2, "MainType::Content");
1792        assert_eq!(st, 0, "SubType::TEXT");
1793        assert_eq!(vs, 0, "Version::V0");
1794        assert_eq!(decoded_digest, digest.to_vec());
1795    }
1796
1797    /// Round-trip with Data-Code (MainType=3).
1798    #[test]
1799    fn test_iscc_decode_round_trip_data() {
1800        let digest = [0xcc_u8; 8];
1801        let encoded = encode_component(3, 0, 0, 64, &digest).unwrap();
1802        let (mt, _st, _vs, _li, decoded_digest) = iscc_decode(&encoded).unwrap();
1803        assert_eq!(mt, 3, "MainType::Data");
1804        assert_eq!(decoded_digest, digest.to_vec());
1805    }
1806
1807    /// Round-trip with Instance-Code (MainType=4).
1808    #[test]
1809    fn test_iscc_decode_round_trip_instance() {
1810        let digest = [0xdd_u8; 8];
1811        let encoded = encode_component(4, 0, 0, 64, &digest).unwrap();
1812        let (mt, _st, _vs, _li, decoded_digest) = iscc_decode(&encoded).unwrap();
1813        assert_eq!(mt, 4, "MainType::Instance");
1814        assert_eq!(decoded_digest, digest.to_vec());
1815    }
1816
1817    /// Decode with "ISCC:" prefix produces the same result.
1818    #[test]
1819    fn test_iscc_decode_with_prefix() {
1820        let digest = [0xaa_u8; 8];
1821        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1822        let with_prefix = format!("ISCC:{encoded}");
1823        let (mt, st, vs, li, decoded_digest) = iscc_decode(&with_prefix).unwrap();
1824        assert_eq!(mt, 0);
1825        assert_eq!(st, 0);
1826        assert_eq!(vs, 0);
1827        assert_eq!(li, 1);
1828        assert_eq!(decoded_digest, digest.to_vec());
1829    }
1830
1831    /// Decode with dashes inserted in the string.
1832    #[test]
1833    fn test_iscc_decode_with_dashes() {
1834        let digest = [0xaa_u8; 8];
1835        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1836        // Insert dashes at arbitrary positions
1837        let with_dashes = format!("{}-{}-{}", &encoded[..4], &encoded[4..8], &encoded[8..]);
1838        let (mt, st, vs, li, decoded_digest) = iscc_decode(&with_dashes).unwrap();
1839        assert_eq!(mt, 0);
1840        assert_eq!(st, 0);
1841        assert_eq!(vs, 0);
1842        assert_eq!(li, 1);
1843        assert_eq!(decoded_digest, digest.to_vec());
1844    }
1845
1846    /// Error on invalid base32 characters.
1847    #[test]
1848    fn test_iscc_decode_invalid_base32() {
1849        let result = iscc_decode("!!!INVALID!!!");
1850        assert!(result.is_err());
1851        let err = result.unwrap_err().to_string();
1852        assert!(err.contains("base32"), "expected base32 error: {err}");
1853    }
1854
1855    /// Known value from conformance vectors: Meta-Code "ISCC:AAAZXZ6OU74YAZIM".
1856    /// MainType=Meta(0), SubType=None(0), Version=V0(0), 64-bit digest.
1857    #[test]
1858    fn test_iscc_decode_known_meta_code() {
1859        let (mt, st, vs, li, digest) = iscc_decode("ISCC:AAAZXZ6OU74YAZIM").unwrap();
1860        assert_eq!(mt, 0, "MainType::Meta");
1861        assert_eq!(st, 0, "SubType::None");
1862        assert_eq!(vs, 0, "Version::V0");
1863        assert_eq!(li, 1, "length_index for 64-bit");
1864        assert_eq!(digest.len(), 8, "64-bit = 8 bytes");
1865    }
1866
1867    /// Known value from conformance vectors: Instance-Code "ISCC:IAA26E2JXH27TING".
1868    /// MainType=Instance(4), SubType=None(0), Version=V0(0), 64-bit digest.
1869    #[test]
1870    fn test_iscc_decode_known_instance_code() {
1871        let (mt, st, vs, li, digest) = iscc_decode("ISCC:IAA26E2JXH27TING").unwrap();
1872        assert_eq!(mt, 4, "MainType::Instance");
1873        assert_eq!(st, 0, "SubType::None");
1874        assert_eq!(vs, 0, "Version::V0");
1875        assert_eq!(li, 1, "length_index for 64-bit");
1876        assert_eq!(digest.len(), 8, "64-bit = 8 bytes");
1877    }
1878
1879    /// Known value: Data-Code "ISCC:GAAXL2XYM5BQIAZ3".
1880    /// MainType=Data(3), SubType=None(0), Version=V0(0), 64-bit digest.
1881    #[test]
1882    fn test_iscc_decode_known_data_code() {
1883        let (mt, st, vs, _li, digest) = iscc_decode("ISCC:GAAXL2XYM5BQIAZ3").unwrap();
1884        assert_eq!(mt, 3, "MainType::Data");
1885        assert_eq!(st, 0, "SubType::None");
1886        assert_eq!(vs, 0, "Version::V0");
1887        assert_eq!(digest.len(), 8, "64-bit = 8 bytes");
1888    }
1889
1890    /// Verification criterion: round-trip with specific known values.
1891    /// encode_component(0, 0, 0, 64, &[0xaa;8]) → iscc_decode → (0, 0, 0, 1, vec![0xaa;8])
1892    #[test]
1893    fn test_iscc_decode_verification_round_trip() {
1894        let digest = [0xaa_u8; 8];
1895        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1896        let result = iscc_decode(&encoded).unwrap();
1897        assert_eq!(result, (0, 0, 0, 1, vec![0xaa; 8]));
1898    }
1899
1900    /// Error on truncated input where body is shorter than expected digest length.
1901    #[test]
1902    fn test_iscc_decode_truncated_input() {
1903        // Encode a valid 256-bit Meta-Code, then truncate the base32 string
1904        let digest = [0xff_u8; 32];
1905        let encoded = encode_component(0, 0, 0, 256, &digest).unwrap();
1906        // Truncate to just the header portion (first few chars)
1907        let truncated = &encoded[..6];
1908        let result = iscc_decode(truncated);
1909        assert!(result.is_err(), "should fail on truncated input");
1910    }
1911
1912    // --- json_to_data_url tests ---
1913
1914    /// Basic JSON object produces a data URL with application/json media type.
1915    #[test]
1916    fn test_json_to_data_url_basic() {
1917        let url = json_to_data_url(r#"{"key": "value"}"#).unwrap();
1918        assert!(
1919            url.starts_with("data:application/json;base64,"),
1920            "expected application/json prefix, got: {url}"
1921        );
1922    }
1923
1924    /// JSON with `@context` key uses application/ld+json media type.
1925    #[test]
1926    fn test_json_to_data_url_ld_json() {
1927        let url = json_to_data_url(r#"{"@context": "https://schema.org"}"#).unwrap();
1928        assert!(
1929            url.starts_with("data:application/ld+json;base64,"),
1930            "expected application/ld+json prefix, got: {url}"
1931        );
1932    }
1933
1934    /// JCS canonicalization reorders keys alphabetically.
1935    #[test]
1936    fn test_json_to_data_url_jcs_ordering() {
1937        let url = json_to_data_url(r#"{"b":1,"a":2}"#).unwrap();
1938        // Extract and decode the base64 payload
1939        let b64 = url.split_once(',').unwrap().1;
1940        let decoded = data_encoding::BASE64.decode(b64.as_bytes()).unwrap();
1941        let canonical = std::str::from_utf8(&decoded).unwrap();
1942        assert_eq!(canonical, r#"{"a":2,"b":1}"#, "JCS should sort keys");
1943    }
1944
1945    /// Round-trip: json_to_data_url output fed into decode_data_url recovers
1946    /// the JCS-canonical bytes.
1947    #[test]
1948    fn test_json_to_data_url_round_trip() {
1949        let input = r#"{"hello": "world", "num": 42}"#;
1950        let url = json_to_data_url(input).unwrap();
1951        let decoded_bytes = decode_data_url(&url).unwrap();
1952        // The decoded bytes should be JCS-canonical JSON
1953        let canonical: serde_json::Value =
1954            serde_json::from_slice(&decoded_bytes).expect("decoded bytes should be valid JSON");
1955        let original: serde_json::Value = serde_json::from_str(input).unwrap();
1956        assert_eq!(canonical, original, "round-trip preserves JSON semantics");
1957    }
1958
1959    /// Invalid JSON string returns an error.
1960    #[test]
1961    fn test_json_to_data_url_invalid_json() {
1962        let result = json_to_data_url("not json");
1963        assert!(result.is_err(), "should reject invalid JSON");
1964        let err = result.unwrap_err().to_string();
1965        assert!(
1966            err.contains("invalid JSON"),
1967            "expected 'invalid JSON' in error: {err}"
1968        );
1969    }
1970
1971    /// Compatibility with conformance vector test_0016_meta_data_url.
1972    ///
1973    /// The conformance vector's meta field is:
1974    ///   data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9
1975    /// which encodes `{"some": "object"}` (with space after colon).
1976    ///
1977    /// Our function differs in two ways:
1978    /// 1. No `charset=utf-8` parameter (matching Python's DataURL.from_byte_data)
1979    /// 2. JCS canonicalization removes whitespace: `{"some":"object"}` (no space)
1980    ///
1981    /// We verify: (a) correct media type prefix, and (b) decoded payload equals
1982    /// JCS-canonical form of the same JSON input.
1983    #[test]
1984    fn test_json_to_data_url_conformance_0016() {
1985        let url = json_to_data_url(r#"{"some": "object"}"#).unwrap();
1986        // (a) Correct media type prefix (no charset, no @context → application/json)
1987        assert!(
1988            url.starts_with("data:application/json;base64,"),
1989            "expected application/json prefix"
1990        );
1991        // (b) Decoded payload is JCS-canonical (no whitespace)
1992        let b64 = url.split_once(',').unwrap().1;
1993        let decoded = data_encoding::BASE64.decode(b64.as_bytes()).unwrap();
1994        let canonical = std::str::from_utf8(&decoded).unwrap();
1995        assert_eq!(
1996            canonical, r#"{"some":"object"}"#,
1997            "JCS removes whitespace from JSON"
1998        );
1999    }
2000}