Skip to main content

iscc_lib/
lib.rs

1//! High-performance Rust implementation of ISO 24138:2024 (ISCC).
2//!
3//! This crate provides the core ISCC algorithm implementations. All 10 `gen_*_v0`
4//! functions are the public Tier 1 API surface, designed to be compatible with
5//! the `iscc-core` Python reference implementation.
6
7pub mod cdc;
8pub mod codec;
9pub mod conformance;
10pub(crate) mod dct;
11pub mod minhash;
12pub mod simhash;
13pub mod streaming;
14pub mod types;
15pub mod utils;
16pub(crate) mod wtahash;
17
18pub use cdc::alg_cdc_chunks;
19pub use codec::encode_base64;
20pub use codec::iscc_decompose;
21pub use conformance::conformance_selftest;
22pub use minhash::alg_minhash_256;
23pub use simhash::{alg_simhash, sliding_window};
24pub use streaming::{DataHasher, InstanceHasher};
25pub use types::*;
26pub use utils::{text_clean, text_collapse, text_remove_newlines, text_trim};
27
/// Max UTF-8 byte length for name metadata trimming.
///
/// `gen_meta_code_v0` passes this limit to `text_trim` when normalizing `name`.
pub const META_TRIM_NAME: usize = 128;

/// Max UTF-8 byte length for description metadata trimming.
///
/// `gen_meta_code_v0` passes this limit to `text_trim` when normalizing
/// `description`.
pub const META_TRIM_DESCRIPTION: usize = 4096;

/// Max decoded payload size in bytes for the meta element.
///
/// `gen_meta_code_v0` rejects a decoded `meta` payload larger than this with
/// `IsccError::InvalidInput`, and derives its pre-decode string-length limit
/// from this value.
pub const META_TRIM_META: usize = 128_000;

/// Buffer size in bytes for streaming file reads (4 MiB = 4 × 1024 × 1024).
pub const IO_READ_SIZE: usize = 4_194_304;

/// Character n-gram width for text content features.
///
/// Used as the sliding-window width in `soft_hash_text_v0`.
pub const TEXT_NGRAM_SIZE: usize = 13;
42
/// Error type for ISCC operations.
///
/// The fallible functions in this file return it through [`IsccResult`].
#[derive(Debug, thiserror::Error)]
pub enum IsccError {
    /// Input data is invalid.
    ///
    /// Carries a human-readable explanation, rendered by `Display` as
    /// `invalid input: <message>`.
    #[error("invalid input: {0}")]
    InvalidInput(String),
}

/// Result type alias for ISCC operations: `Result<T, IsccError>`.
pub type IsccResult<T> = Result<T, IsccError>;
53
/// Weave the leading halves of two SimHash digests into one 32-byte digest.
///
/// Only the first 16 bytes of `a` and of `b` are read. They are consumed in
/// 4-byte groups and written alternately — a group from `a`, then the matching
/// group from `b` — so the output holds 8 groups in the order
/// `a0 b0 a1 b1 a2 b2 a3 b3`.
///
/// # Panics
///
/// Panics if either input is shorter than 16 bytes.
fn interleave_digests(a: &[u8], b: &[u8]) -> Vec<u8> {
    let mut woven = Vec::with_capacity(32);
    for round in 0..4 {
        let window = round * 4..round * 4 + 4;
        woven.extend_from_slice(&a[window.clone()]);
        woven.extend_from_slice(&b[window]);
    }
    woven
}
70
71/// Compute a SimHash digest from the name text for meta hashing.
72///
73/// Applies `text_collapse`, generates width-3 sliding window n-grams,
74/// hashes each with BLAKE3, and produces a SimHash.
75fn meta_name_simhash(name: &str) -> Vec<u8> {
76    let collapsed_name = utils::text_collapse(name);
77    let name_ngrams = simhash::sliding_window_strs(&collapsed_name, 3);
78    let name_hashes: Vec<[u8; 32]> = name_ngrams
79        .iter()
80        .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
81        .collect();
82    simhash::alg_simhash_inner(&name_hashes)
83}
84
85/// Compute a similarity-preserving 256-bit hash from metadata text.
86///
87/// Produces a SimHash digest from `name` n-grams. When `extra` is provided,
88/// interleaves the name and extra SimHash digests in 4-byte chunks.
89fn soft_hash_meta_v0(name: &str, extra: Option<&str>) -> Vec<u8> {
90    let name_simhash = meta_name_simhash(name);
91
92    match extra {
93        None | Some("") => name_simhash,
94        Some(extra_str) => {
95            let collapsed_extra = utils::text_collapse(extra_str);
96            let extra_ngrams = simhash::sliding_window_strs(&collapsed_extra, 3);
97            let extra_hashes: Vec<[u8; 32]> = extra_ngrams
98                .iter()
99                .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
100                .collect();
101            let extra_simhash = simhash::alg_simhash_inner(&extra_hashes);
102
103            interleave_digests(&name_simhash, &extra_simhash)
104        }
105    }
106}
107
108/// Compute a similarity-preserving 256-bit hash from name text and raw bytes.
109///
110/// Like `soft_hash_meta_v0` but the extra data is raw bytes instead of text.
111/// Uses width-4 byte n-grams (no `text_collapse`) for the bytes path,
112/// and interleaves name/bytes SimHash digests in 4-byte chunks.
113fn soft_hash_meta_v0_with_bytes(name: &str, extra: &[u8]) -> Vec<u8> {
114    let name_simhash = meta_name_simhash(name);
115
116    if extra.is_empty() {
117        return name_simhash;
118    }
119
120    let byte_ngrams = simhash::sliding_window_bytes(extra, 4);
121    let byte_hashes: Vec<[u8; 32]> = byte_ngrams
122        .iter()
123        .map(|ng| *blake3::hash(ng).as_bytes())
124        .collect();
125    let byte_simhash = simhash::alg_simhash_inner(&byte_hashes);
126
127    interleave_digests(&name_simhash, &byte_simhash)
128}
129
130/// Decode a Data-URL's base64 payload.
131///
132/// Expects a string starting with `"data:"`. Splits on the first `,` and
133/// decodes the remainder as standard base64. Returns `InvalidInput` on
134/// missing comma or invalid base64.
135fn decode_data_url(data_url: &str) -> IsccResult<Vec<u8>> {
136    let payload_b64 = data_url
137        .split_once(',')
138        .map(|(_, b64)| b64)
139        .ok_or_else(|| IsccError::InvalidInput("Data-URL missing comma separator".into()))?;
140    data_encoding::BASE64
141        .decode(payload_b64.as_bytes())
142        .map_err(|e| IsccError::InvalidInput(format!("invalid base64 in Data-URL: {e}")))
143}
144
145/// Parse a meta string as JSON and re-serialize to RFC 8785 (JCS) canonical bytes.
146fn parse_meta_json(meta_str: &str) -> IsccResult<Vec<u8>> {
147    let parsed: serde_json::Value = serde_json::from_str(meta_str)
148        .map_err(|e| IsccError::InvalidInput(format!("invalid JSON in meta: {e}")))?;
149    let mut buf = Vec::new();
150    serde_json_canonicalizer::to_writer(&parsed, &mut buf)
151        .map_err(|e| IsccError::InvalidInput(format!("JSON canonicalization failed: {e}")))?;
152    Ok(buf)
153}
154
155/// Build a Data-URL from canonical JSON bytes.
156///
157/// Uses `application/ld+json` media type if the JSON has an `@context` key,
158/// otherwise `application/json`. Encodes payload as standard base64 with padding.
159fn build_meta_data_url(json_bytes: &[u8], json_value: &serde_json::Value) -> String {
160    let media_type = if json_value.get("@context").is_some() {
161        "application/ld+json"
162    } else {
163        "application/json"
164    };
165    let b64 = data_encoding::BASE64.encode(json_bytes);
166    format!("data:{media_type};base64,{b64}")
167}
168
169/// Encode a raw digest into an ISCC unit string.
170///
171/// Takes integer type identifiers (matching `MainType`, `SubType`, `Version` enum values)
172/// and a raw digest, returns a base32-encoded ISCC unit string.
173///
174/// # Errors
175///
176/// Returns `IsccError::InvalidInput` if enum values are out of range, if `mtype` is
177/// `MainType::Iscc` (5), or if `digest.len() < bit_length / 8`.
178pub fn encode_component(
179    mtype: u8,
180    stype: u8,
181    version: u8,
182    bit_length: u32,
183    digest: &[u8],
184) -> IsccResult<String> {
185    let mt = codec::MainType::try_from(mtype)?;
186    let st = codec::SubType::try_from(stype)?;
187    let vs = codec::Version::try_from(version)?;
188    let needed = (bit_length / 8) as usize;
189    if digest.len() < needed {
190        return Err(IsccError::InvalidInput(format!(
191            "digest length {} < bit_length/8 ({})",
192            digest.len(),
193            needed
194        )));
195    }
196    codec::encode_component(mt, st, vs, bit_length, digest)
197}
198
199/// Decode an ISCC unit string into its header components and raw digest.
200///
201/// Inverse of [`encode_component`]. Strips an optional `"ISCC:"` prefix and
202/// dashes, base32-decodes the string, parses the variable-length header, and
203/// returns the digest truncated to exactly the encoded bit-length.
204///
205/// Returns `(maintype, subtype, version, length_index, digest)` where the
206/// integer fields match [`codec::MainType`], [`codec::SubType`], and
207/// [`codec::Version`] enum values.
208///
209/// # Errors
210///
211/// Returns `IsccError::InvalidInput` on invalid base32 input, malformed
212/// header, or if the decoded body is shorter than the expected digest length.
213pub fn iscc_decode(iscc: &str) -> IsccResult<(u8, u8, u8, u8, Vec<u8>)> {
214    // Strip optional "ISCC:" prefix (case-sensitive, matching iscc_decompose)
215    let clean = iscc.strip_prefix("ISCC:").unwrap_or(iscc);
216    // Remove dashes (matching iscc_clean behavior for base32 input)
217    let clean = clean.replace('-', "");
218    let raw = codec::decode_base32(&clean)?;
219    let (mt, st, vs, length_index, tail) = codec::decode_header(&raw)?;
220    let bit_length = codec::decode_length(mt, length_index, st);
221    let nbytes = (bit_length / 8) as usize;
222    if tail.len() < nbytes {
223        return Err(IsccError::InvalidInput(format!(
224            "decoded body too short: expected {nbytes} digest bytes, got {}",
225            tail.len()
226        )));
227    }
228    Ok((
229        mt as u8,
230        st as u8,
231        vs as u8,
232        length_index as u8,
233        tail[..nbytes].to_vec(),
234    ))
235}
236
237/// Convert a JSON string into a `data:` URL with JCS canonicalization.
238///
239/// Parses the JSON, re-serializes to [RFC 8785 (JCS)](https://www.rfc-editor.org/rfc/rfc8785)
240/// canonical form, base64-encodes the result, and wraps it in a `data:` URL.
241/// Uses `application/ld+json` media type when the JSON contains an `@context`
242/// key, otherwise `application/json`.
243///
244/// This enables all language bindings to support dict/object meta parameters
245/// by serializing to JSON once (language-specific) then delegating encoding
246/// to Rust.
247///
248/// # Errors
249///
250/// Returns [`IsccError::InvalidInput`] if `json` is not valid JSON or if
251/// JCS canonicalization fails.
252///
253/// # Examples
254///
255/// ```
256/// # use iscc_lib::json_to_data_url;
257/// let url = json_to_data_url(r#"{"key": "value"}"#).unwrap();
258/// assert!(url.starts_with("data:application/json;base64,"));
259///
260/// let ld_url = json_to_data_url(r#"{"@context": "https://schema.org"}"#).unwrap();
261/// assert!(ld_url.starts_with("data:application/ld+json;base64,"));
262/// ```
263pub fn json_to_data_url(json: &str) -> IsccResult<String> {
264    let parsed: serde_json::Value = serde_json::from_str(json)
265        .map_err(|e| IsccError::InvalidInput(format!("invalid JSON: {e}")))?;
266    let mut canonical_bytes = Vec::new();
267    serde_json_canonicalizer::to_writer(&parsed, &mut canonical_bytes)
268        .map_err(|e| IsccError::InvalidInput(format!("JSON canonicalization failed: {e}")))?;
269    Ok(build_meta_data_url(&canonical_bytes, &parsed))
270}
271
272/// Generate a Meta-Code from name and optional metadata.
273///
274/// Produces an ISCC Meta-Code by hashing the provided name, description,
275/// and metadata fields using the SimHash algorithm. When `meta` is provided,
276/// it is treated as either a Data-URL (if starting with `"data:"`) or a JSON
277/// string, and the decoded/serialized bytes are used for similarity hashing
278/// and metahash computation.
279pub fn gen_meta_code_v0(
280    name: &str,
281    description: Option<&str>,
282    meta: Option<&str>,
283    bits: u32,
284) -> IsccResult<MetaCodeResult> {
285    // Normalize name: clean → remove newlines → trim to 128 bytes
286    let name = utils::text_clean(name);
287    let name = utils::text_remove_newlines(&name);
288    let name = utils::text_trim(&name, META_TRIM_NAME);
289
290    if name.is_empty() {
291        return Err(IsccError::InvalidInput(
292            "name is empty after normalization".into(),
293        ));
294    }
295
296    // Normalize description: clean → trim to 4096 bytes
297    let desc_str = description.unwrap_or("");
298    let desc_clean = utils::text_clean(desc_str);
299    let desc_clean = utils::text_trim(&desc_clean, META_TRIM_DESCRIPTION);
300
301    // Pre-decode fast check: reject obviously oversized meta strings
302    if let Some(meta_str) = meta {
303        const PRE_DECODE_LIMIT: usize = META_TRIM_META * 4 / 3 + 256;
304        if meta_str.len() > PRE_DECODE_LIMIT {
305            return Err(IsccError::InvalidInput(format!(
306                "meta string exceeds size limit ({} > {PRE_DECODE_LIMIT} bytes)",
307                meta_str.len()
308            )));
309        }
310    }
311
312    // Resolve meta payload bytes (if meta is provided)
313    let meta_payload: Option<Vec<u8>> = match meta {
314        Some(meta_str) if meta_str.starts_with("data:") => Some(decode_data_url(meta_str)?),
315        Some(meta_str) => Some(parse_meta_json(meta_str)?),
316        None => None,
317    };
318
319    // Post-decode check: reject payloads exceeding META_TRIM_META
320    if let Some(ref payload) = meta_payload {
321        if payload.len() > META_TRIM_META {
322            return Err(IsccError::InvalidInput(format!(
323                "decoded meta payload exceeds size limit ({} > {META_TRIM_META} bytes)",
324                payload.len()
325            )));
326        }
327    }
328
329    // Branch: meta bytes path vs. description text path
330    if let Some(ref payload) = meta_payload {
331        let meta_code_digest = soft_hash_meta_v0_with_bytes(&name, payload);
332        let metahash = utils::multi_hash_blake3(payload);
333
334        let meta_code = codec::encode_component(
335            codec::MainType::Meta,
336            codec::SubType::None,
337            codec::Version::V0,
338            bits,
339            &meta_code_digest,
340        )?;
341
342        // Build the meta Data-URL for the result
343        let meta_value = match meta {
344            Some(meta_str) if meta_str.starts_with("data:") => meta_str.to_string(),
345            Some(meta_str) => {
346                let parsed: serde_json::Value = serde_json::from_str(meta_str)
347                    .map_err(|e| IsccError::InvalidInput(format!("invalid JSON: {e}")))?;
348                build_meta_data_url(payload, &parsed)
349            }
350            None => unreachable!(),
351        };
352
353        Ok(MetaCodeResult {
354            iscc: format!("ISCC:{meta_code}"),
355            name: name.clone(),
356            description: if desc_clean.is_empty() {
357                None
358            } else {
359                Some(desc_clean)
360            },
361            meta: Some(meta_value),
362            metahash,
363        })
364    } else {
365        // Compute metahash from normalized text payload
366        let payload = if desc_clean.is_empty() {
367            name.clone()
368        } else {
369            format!("{name} {desc_clean}")
370        };
371        let payload = payload.trim().to_string();
372        let metahash = utils::multi_hash_blake3(payload.as_bytes());
373
374        // Compute similarity digest
375        let extra = if desc_clean.is_empty() {
376            None
377        } else {
378            Some(desc_clean.as_str())
379        };
380        let meta_code_digest = soft_hash_meta_v0(&name, extra);
381
382        let meta_code = codec::encode_component(
383            codec::MainType::Meta,
384            codec::SubType::None,
385            codec::Version::V0,
386            bits,
387            &meta_code_digest,
388        )?;
389
390        Ok(MetaCodeResult {
391            iscc: format!("ISCC:{meta_code}"),
392            name: name.clone(),
393            description: if desc_clean.is_empty() {
394                None
395            } else {
396                Some(desc_clean)
397            },
398            meta: None,
399            metahash,
400        })
401    }
402}
403
404/// Compute a 256-bit similarity-preserving hash from collapsed text.
405///
406/// Generates character n-grams with a sliding window of width 13,
407/// hashes each with xxh32, then applies MinHash to produce a 32-byte digest.
408fn soft_hash_text_v0(text: &str) -> Vec<u8> {
409    let ngrams = simhash::sliding_window_strs(text, TEXT_NGRAM_SIZE);
410    let features: Vec<u32> = ngrams
411        .iter()
412        .map(|ng| xxhash_rust::xxh32::xxh32(ng.as_bytes(), 0))
413        .collect();
414    minhash::alg_minhash_256(&features)
415}
416
417/// Generate a Text-Code from plain text content.
418///
419/// Produces an ISCC Content-Code for text by collapsing the input,
420/// extracting character n-gram features, and applying MinHash to
421/// create a similarity-preserving fingerprint.
422pub fn gen_text_code_v0(text: &str, bits: u32) -> IsccResult<TextCodeResult> {
423    let collapsed = utils::text_collapse(text);
424    let characters = collapsed.chars().count();
425    let hash_digest = soft_hash_text_v0(&collapsed);
426    let component = codec::encode_component(
427        codec::MainType::Content,
428        codec::SubType::TEXT,
429        codec::Version::V0,
430        bits,
431        &hash_digest,
432    )?;
433    Ok(TextCodeResult {
434        iscc: format!("ISCC:{component}"),
435        characters,
436    })
437}
438
/// Transpose a row-major matrix stored as a slice of rows.
///
/// The output has as many rows as the first input row has columns. An empty
/// input yields an empty output. Rows shorter than the first leave `0.0` in
/// the missing cells.
///
/// # Panics
///
/// Panics if any row is longer than the first row.
fn transpose_matrix(matrix: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let width = match matrix.first() {
        Some(first_row) => first_row.len(),
        None => return Vec::new(),
    };
    let height = matrix.len();

    let mut transposed: Vec<Vec<f64>> = (0..width).map(|_| vec![0.0; height]).collect();
    for (row_idx, row) in matrix.iter().enumerate() {
        for (col_idx, value) in row.iter().copied().enumerate() {
            transposed[col_idx][row_idx] = value;
        }
    }
    transposed
}
454
/// Copy an 8×8 window out of a matrix as a flat, row-major vector.
///
/// The window's top-left cell is `matrix[row][col]`. Up to 8 rows and up to
/// 8 columns are taken; a window that runs past the matrix edge is clipped,
/// so the result holds fewer than 64 values in that case.
fn flatten_8x8(matrix: &[Vec<f64>], col: usize, row: usize) -> Vec<f64> {
    matrix
        .iter()
        .skip(row)
        .take(8)
        .flat_map(|line| line.iter().skip(col).take(8).copied())
        .collect()
}
468
/// Median of a slice of `f64` values.
///
/// Works on a sorted copy. For an odd count the middle value is returned; for
/// an even count the mean of the two middle values (matching Python
/// `statistics.median` behavior).
///
/// # Panics
///
/// Panics if `values` is empty or contains a value that cannot be ordered
/// (NaN).
fn compute_median(values: &[f64]) -> f64 {
    let mut ordered = values.to_vec();
    ordered.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());

    let mid = ordered.len() / 2;
    match ordered.len() % 2 {
        1 => ordered[mid],
        _ => (ordered[mid - 1] + ordered[mid]) / 2.0,
    }
}
483
/// Pack booleans into bytes, most significant bit first.
///
/// Every group of 8 bools becomes one byte; the first bool of a group maps to
/// bit 7. A trailing group with fewer than 8 bools is padded with zero bits on
/// the low end.
fn bits_to_bytes(bits: &[bool]) -> Vec<u8> {
    let mut packed = Vec::with_capacity((bits.len() + 7) / 8);
    for group in bits.chunks(8) {
        let byte = group.iter().enumerate().fold(0u8, |acc, (pos, &set)| {
            if set {
                acc | (0x80u8 >> pos)
            } else {
                acc
            }
        });
        packed.push(byte);
    }
    packed
}
498
499/// Compute a DCT-based perceptual hash from 32×32 grayscale pixels.
500///
501/// Applies a 2D DCT to the pixel matrix, extracts four 8×8 low-frequency
502/// blocks, and generates a bitstring by comparing each coefficient against
503/// the block median. Returns up to `bits` bits as a byte vector.
504fn soft_hash_image_v0(pixels: &[u8], bits: u32) -> IsccResult<Vec<u8>> {
505    if pixels.len() != 1024 {
506        return Err(IsccError::InvalidInput(format!(
507            "expected 1024 pixels, got {}",
508            pixels.len()
509        )));
510    }
511    if bits > 256 {
512        return Err(IsccError::InvalidInput(format!(
513            "bits must be <= 256, got {bits}"
514        )));
515    }
516
517    // Step 1: Row-wise DCT (32 rows of 32 pixels)
518    let rows: Vec<Vec<f64>> = pixels
519        .chunks(32)
520        .map(|row| {
521            let row_f64: Vec<f64> = row.iter().map(|&p| p as f64).collect();
522            dct::alg_dct(&row_f64)
523        })
524        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
525
526    // Step 2: Transpose
527    let transposed = transpose_matrix(&rows);
528
529    // Step 3: Column-wise DCT
530    let dct_cols: Vec<Vec<f64>> = transposed
531        .iter()
532        .map(|col| dct::alg_dct(col))
533        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
534
535    // Step 4: Transpose back → dct_matrix
536    let dct_matrix = transpose_matrix(&dct_cols);
537
538    // Step 5: Extract 8×8 blocks at positions (0,0), (1,0), (0,1), (1,1)
539    let positions = [(0, 0), (1, 0), (0, 1), (1, 1)];
540    let mut bitstring = Vec::<bool>::with_capacity(256);
541
542    for (col, row) in positions {
543        let flat = flatten_8x8(&dct_matrix, col, row);
544        let median = compute_median(&flat);
545        for val in &flat {
546            bitstring.push(*val > median);
547        }
548        if bitstring.len() >= bits as usize {
549            break;
550        }
551    }
552
553    // Step 6: Convert first `bits` bools to bytes
554    Ok(bits_to_bytes(&bitstring[..bits as usize]))
555}
556
557/// Generate an Image-Code from pixel data.
558///
559/// Produces an ISCC Content-Code for images from a sequence of 1024
560/// grayscale pixel values (32×32, values 0-255) using a DCT-based
561/// perceptual hash.
562pub fn gen_image_code_v0(pixels: &[u8], bits: u32) -> IsccResult<ImageCodeResult> {
563    let hash_digest = soft_hash_image_v0(pixels, bits)?;
564    let component = codec::encode_component(
565        codec::MainType::Content,
566        codec::SubType::Image,
567        codec::Version::V0,
568        bits,
569        &hash_digest,
570    )?;
571    Ok(ImageCodeResult {
572        iscc: format!("ISCC:{component}"),
573    })
574}
575
/// Divide a slice into `n` consecutive parts of near-equal size.
///
/// Equivalent to `numpy.array_split` / `more_itertools.divide`: every part
/// receives `len / n` elements and the first `len % n` parts receive one
/// more. When `n` exceeds the slice length the surplus parts are empty.
/// `n == 0` yields an empty vector.
fn array_split<T>(slice: &[T], n: usize) -> Vec<&[T]> {
    if n == 0 {
        return Vec::new();
    }
    let (quot, rem) = (slice.len() / n, slice.len() % n);

    // Peel one part at a time off the front of the remaining slice.
    let mut rest = slice;
    (0..n)
        .map(|idx| {
            let take = if idx < rem { quot + 1 } else { quot };
            let (head, tail) = rest.split_at(take);
            rest = tail;
            head
        })
        .collect()
}
597
598/// Compute a multi-stage SimHash digest from Chromaprint features.
599///
600/// Builds a 32-byte digest by concatenating 4-byte SimHash chunks:
601/// - Stage 1: overall SimHash of all features (4 bytes)
602/// - Stage 2: SimHash of each quarter of features (4 × 4 = 16 bytes)
603/// - Stage 3: SimHash of each third of sorted features (3 × 4 = 12 bytes)
604fn soft_hash_audio_v0(cv: &[i32]) -> Vec<u8> {
605    // Convert each i32 to 4-byte big-endian digest
606    let digests: Vec<[u8; 4]> = cv.iter().map(|&v| v.to_be_bytes()).collect();
607
608    if digests.is_empty() {
609        return vec![0u8; 32];
610    }
611
612    // Stage 1: overall SimHash (4 bytes)
613    let mut parts: Vec<u8> = simhash::alg_simhash_inner(&digests);
614
615    // Stage 2: quarter-based SimHashes (4 × 4 = 16 bytes)
616    let quarters = array_split(&digests, 4);
617    for quarter in &quarters {
618        if quarter.is_empty() {
619            parts.extend_from_slice(&[0u8; 4]);
620        } else {
621            parts.extend_from_slice(&simhash::alg_simhash_inner(quarter));
622        }
623    }
624
625    // Stage 3: sorted-third-based SimHashes (3 × 4 = 12 bytes)
626    let mut sorted_values: Vec<i32> = cv.to_vec();
627    sorted_values.sort();
628    let sorted_digests: Vec<[u8; 4]> = sorted_values.iter().map(|&v| v.to_be_bytes()).collect();
629    let thirds = array_split(&sorted_digests, 3);
630    for third in &thirds {
631        if third.is_empty() {
632            parts.extend_from_slice(&[0u8; 4]);
633        } else {
634            parts.extend_from_slice(&simhash::alg_simhash_inner(third));
635        }
636    }
637
638    parts
639}
640
641/// Generate an Audio-Code from a Chromaprint feature vector.
642///
643/// Produces an ISCC Content-Code for audio from a Chromaprint signed
644/// integer fingerprint vector using multi-stage SimHash.
645pub fn gen_audio_code_v0(cv: &[i32], bits: u32) -> IsccResult<AudioCodeResult> {
646    let hash_digest = soft_hash_audio_v0(cv);
647    let component = codec::encode_component(
648        codec::MainType::Content,
649        codec::SubType::Audio,
650        codec::Version::V0,
651        bits,
652        &hash_digest,
653    )?;
654    Ok(AudioCodeResult {
655        iscc: format!("ISCC:{component}"),
656    })
657}
658
659/// Compute a similarity-preserving hash from video frame signatures.
660///
661/// Deduplicates frame signatures, computes column-wise sums across all
662/// unique frames, then applies WTA-Hash to produce a digest of `bits/8` bytes.
663pub fn soft_hash_video_v0<S: AsRef<[i32]> + Ord>(
664    frame_sigs: &[S],
665    bits: u32,
666) -> IsccResult<Vec<u8>> {
667    if frame_sigs.is_empty() {
668        return Err(IsccError::InvalidInput(
669            "frame_sigs must not be empty".into(),
670        ));
671    }
672
673    // Deduplicate using BTreeSet (S: Ord)
674    let unique: std::collections::BTreeSet<&S> = frame_sigs.iter().collect();
675
676    // Column-wise sum into i64 to avoid overflow
677    let cols = frame_sigs[0].as_ref().len();
678    let mut vecsum = vec![0i64; cols];
679    for sig in &unique {
680        for (c, &val) in sig.as_ref().iter().enumerate() {
681            vecsum[c] += val as i64;
682        }
683    }
684
685    wtahash::alg_wtahash(&vecsum, bits)
686}
687
688/// Generate a Video-Code from frame signature data.
689///
690/// Produces an ISCC Content-Code for video from a sequence of MPEG-7 frame
691/// signatures. Each frame signature is a 380-element integer vector.
692pub fn gen_video_code_v0<S: AsRef<[i32]> + Ord>(
693    frame_sigs: &[S],
694    bits: u32,
695) -> IsccResult<VideoCodeResult> {
696    let digest = soft_hash_video_v0(frame_sigs, bits)?;
697    let component = codec::encode_component(
698        codec::MainType::Content,
699        codec::SubType::Video,
700        codec::Version::V0,
701        bits,
702        &digest,
703    )?;
704    Ok(VideoCodeResult {
705        iscc: format!("ISCC:{component}"),
706    })
707}
708
709/// Combine multiple Content-Code digests into a single similarity hash.
710///
711/// Takes raw decoded ISCC bytes (header + body) for each Content-Code and
712/// produces a SimHash digest. Each input is trimmed to `bits/8` bytes by
713/// keeping the first header byte (encodes type info) plus `nbytes-1` body bytes.
714/// Requires at least 2 codes, all of MainType::Content.
715fn soft_hash_codes_v0(cc_digests: &[Vec<u8>], bits: u32) -> IsccResult<Vec<u8>> {
716    if cc_digests.len() < 2 {
717        return Err(IsccError::InvalidInput(
718            "at least 2 Content-Codes required for mixing".into(),
719        ));
720    }
721
722    let nbytes = (bits / 8) as usize;
723    let mut prepared: Vec<Vec<u8>> = Vec::with_capacity(cc_digests.len());
724
725    for raw in cc_digests {
726        let (mtype, stype, _ver, blen, body) = codec::decode_header(raw)?;
727        if mtype != codec::MainType::Content {
728            return Err(IsccError::InvalidInput(
729                "all codes must be Content-Codes".into(),
730            ));
731        }
732        let unit_bits = codec::decode_length(mtype, blen, stype);
733        if unit_bits < bits {
734            return Err(IsccError::InvalidInput(format!(
735                "Content-Code too short for {bits}-bit length (has {unit_bits} bits)"
736            )));
737        }
738        let mut entry = Vec::with_capacity(nbytes);
739        entry.push(raw[0]); // first byte preserves type info
740        let take = std::cmp::min(nbytes - 1, body.len());
741        entry.extend_from_slice(&body[..take]);
742        // Pad with zeros if body is shorter than nbytes-1
743        while entry.len() < nbytes {
744            entry.push(0);
745        }
746        prepared.push(entry);
747    }
748
749    Ok(simhash::alg_simhash_inner(&prepared))
750}
751
752/// Generate a Mixed-Code from multiple Content-Code strings.
753///
754/// Produces a Mixed Content-Code by combining multiple ISCC Content-Codes
755/// of different types (text, image, audio, video) using SimHash. Input codes
756/// may optionally include the "ISCC:" prefix.
757pub fn gen_mixed_code_v0(codes: &[&str], bits: u32) -> IsccResult<MixedCodeResult> {
758    let decoded: Vec<Vec<u8>> = codes
759        .iter()
760        .map(|code| {
761            let clean = code.strip_prefix("ISCC:").unwrap_or(code);
762            codec::decode_base32(clean)
763        })
764        .collect::<IsccResult<Vec<Vec<u8>>>>()?;
765
766    let digest = soft_hash_codes_v0(&decoded, bits)?;
767
768    let component = codec::encode_component(
769        codec::MainType::Content,
770        codec::SubType::Mixed,
771        codec::Version::V0,
772        bits,
773        &digest,
774    )?;
775
776    Ok(MixedCodeResult {
777        iscc: format!("ISCC:{component}"),
778        parts: codes.iter().map(|s| s.to_string()).collect(),
779    })
780}
781
782/// Generate a Data-Code from raw byte data.
783///
784/// Produces an ISCC Data-Code by splitting data into content-defined chunks,
785/// hashing each chunk with xxh32, and applying MinHash to create a
786/// similarity-preserving fingerprint.
787pub fn gen_data_code_v0(data: &[u8], bits: u32) -> IsccResult<DataCodeResult> {
788    let chunks = cdc::alg_cdc_chunks(data, false, cdc::DATA_AVG_CHUNK_SIZE);
789    let mut features: Vec<u32> = chunks
790        .iter()
791        .map(|chunk| xxhash_rust::xxh32::xxh32(chunk, 0))
792        .collect();
793
794    // Defensive: ensure at least one feature (alg_cdc_chunks guarantees >= 1 chunk)
795    if features.is_empty() {
796        features.push(xxhash_rust::xxh32::xxh32(b"", 0));
797    }
798
799    let digest = minhash::alg_minhash_256(&features);
800    let component = codec::encode_component(
801        codec::MainType::Data,
802        codec::SubType::None,
803        codec::Version::V0,
804        bits,
805        &digest,
806    )?;
807
808    Ok(DataCodeResult {
809        iscc: format!("ISCC:{component}"),
810    })
811}
812
813/// Generate an Instance-Code from raw byte data.
814///
815/// Produces an ISCC Instance-Code by hashing the complete byte stream
816/// with BLAKE3. Captures the exact binary identity of the data.
817pub fn gen_instance_code_v0(data: &[u8], bits: u32) -> IsccResult<InstanceCodeResult> {
818    let digest = blake3::hash(data);
819    let datahash = utils::multi_hash_blake3(data);
820    let filesize = data.len() as u64;
821    let component = codec::encode_component(
822        codec::MainType::Instance,
823        codec::SubType::None,
824        codec::Version::V0,
825        bits,
826        digest.as_bytes(),
827    )?;
828    Ok(InstanceCodeResult {
829        iscc: format!("ISCC:{component}"),
830        datahash,
831        filesize,
832    })
833}
834
835/// Generate a composite ISCC-CODE from individual ISCC unit codes.
836///
837/// Combines multiple ISCC unit codes (Meta-Code, Content-Code, Data-Code,
838/// Instance-Code) into a single composite ISCC-CODE. Input codes may
839/// optionally include the "ISCC:" prefix. At least Data-Code and
840/// Instance-Code are required. When `wide` is true and exactly two
841/// 128-bit+ codes (Data + Instance) are provided, produces a 256-bit
842/// wide-mode code.
843pub fn gen_iscc_code_v0(codes: &[&str], wide: bool) -> IsccResult<IsccCodeResult> {
844    // Step 1: Clean inputs — strip "ISCC:" prefix
845    let cleaned: Vec<&str> = codes
846        .iter()
847        .map(|c| c.strip_prefix("ISCC:").unwrap_or(c))
848        .collect();
849
850    // Step 2: Validate minimum count
851    if cleaned.len() < 2 {
852        return Err(IsccError::InvalidInput(
853            "at least 2 ISCC unit codes required".into(),
854        ));
855    }
856
857    // Step 3: Validate minimum length (16 base32 chars = 64-bit minimum)
858    for code in &cleaned {
859        if code.len() < 16 {
860            return Err(IsccError::InvalidInput(format!(
861                "ISCC unit code too short (min 16 chars): {code}"
862            )));
863        }
864    }
865
866    // Step 4: Decode each code
867    let mut decoded: Vec<(
868        codec::MainType,
869        codec::SubType,
870        codec::Version,
871        u32,
872        Vec<u8>,
873    )> = Vec::with_capacity(cleaned.len());
874    for code in &cleaned {
875        let raw = codec::decode_base32(code)?;
876        let header = codec::decode_header(&raw)?;
877        decoded.push(header);
878    }
879
880    // Step 5: Sort by MainType (ascending)
881    decoded.sort_by_key(|&(mt, ..)| mt);
882
883    // Step 6: Extract main_types
884    let main_types: Vec<codec::MainType> = decoded.iter().map(|&(mt, ..)| mt).collect();
885
886    // Step 7: Validate last two are Data + Instance (mandatory)
887    let n = main_types.len();
888    if main_types[n - 2] != codec::MainType::Data || main_types[n - 1] != codec::MainType::Instance
889    {
890        return Err(IsccError::InvalidInput(
891            "Data-Code and Instance-Code are mandatory".into(),
892        ));
893    }
894
895    // Step 8: Determine wide composite
896    let is_wide = wide
897        && decoded.len() == 2
898        && main_types == [codec::MainType::Data, codec::MainType::Instance]
899        && decoded
900            .iter()
901            .all(|&(mt, st, _, len, _)| codec::decode_length(mt, len, st) >= 128);
902
903    // Step 9: Determine SubType
904    let st = if is_wide {
905        codec::SubType::Wide
906    } else {
907        // Collect SubTypes of Semantic/Content units
908        let sc_subtypes: Vec<codec::SubType> = decoded
909            .iter()
910            .filter(|&&(mt, ..)| mt == codec::MainType::Semantic || mt == codec::MainType::Content)
911            .map(|&(_, st, ..)| st)
912            .collect();
913
914        if !sc_subtypes.is_empty() {
915            // All must be the same
916            let first = sc_subtypes[0];
917            if sc_subtypes.iter().all(|&s| s == first) {
918                first
919            } else {
920                return Err(IsccError::InvalidInput(
921                    "mixed SubTypes among Content/Semantic units".into(),
922                ));
923            }
924        } else if decoded.len() == 2 {
925            codec::SubType::Sum
926        } else {
927            codec::SubType::IsccNone
928        }
929    };
930
931    // Step 10–11: Get optional MainTypes and encode
932    let optional_types = &main_types[..n - 2];
933    let encoded_length = codec::encode_units(optional_types)?;
934
935    // Step 12: Build digest body
936    let bytes_per_unit = if is_wide { 16 } else { 8 };
937    let mut digest = Vec::with_capacity(decoded.len() * bytes_per_unit);
938    for (_, _, _, _, tail) in &decoded {
939        let take = bytes_per_unit.min(tail.len());
940        digest.extend_from_slice(&tail[..take]);
941    }
942
943    // Step 13–14: Encode header + digest as base32
944    let header = codec::encode_header(
945        codec::MainType::Iscc,
946        st,
947        codec::Version::V0,
948        encoded_length,
949    )?;
950    let mut code_bytes = header;
951    code_bytes.extend_from_slice(&digest);
952    let code = codec::encode_base32(&code_bytes);
953
954    // Step 15: Return with prefix
955    Ok(IsccCodeResult {
956        iscc: format!("ISCC:{code}"),
957    })
958}
959
960/// Generate a composite ISCC-CODE from a file in a single pass.
961///
962/// Opens the file at `path`, reads it with an optimal buffer size, and feeds
963/// both `DataHasher` (CDC/MinHash) and `InstanceHasher` (BLAKE3) from the
964/// same read buffer. Composes the final ISCC-CODE from the Data-Code and
965/// Instance-Code internally. This avoids multiple passes over the file and
966/// eliminates per-chunk FFI overhead in language bindings.
967pub fn gen_sum_code_v0(path: &std::path::Path, bits: u32, wide: bool) -> IsccResult<SumCodeResult> {
968    use std::io::Read;
969
970    let mut file = std::fs::File::open(path)
971        .map_err(|e| IsccError::InvalidInput(format!("Cannot open file: {e}")))?;
972
973    let mut data_hasher = streaming::DataHasher::new();
974    let mut instance_hasher = streaming::InstanceHasher::new();
975
976    let mut buf = vec![0u8; IO_READ_SIZE];
977    loop {
978        let n = file
979            .read(&mut buf)
980            .map_err(|e| IsccError::InvalidInput(format!("Cannot read file: {e}")))?;
981        if n == 0 {
982            break;
983        }
984        data_hasher.update(&buf[..n]);
985        instance_hasher.update(&buf[..n]);
986    }
987
988    let data_result = data_hasher.finalize(bits)?;
989    let instance_result = instance_hasher.finalize(bits)?;
990
991    let iscc_result = gen_iscc_code_v0(&[&data_result.iscc, &instance_result.iscc], wide)?;
992
993    Ok(SumCodeResult {
994        iscc: iscc_result.iscc,
995        datahash: instance_result.datahash,
996        filesize: instance_result.filesize,
997    })
998}
999
1000#[cfg(test)]
1001mod tests {
1002    use super::*;
1003
1004    #[test]
1005    fn test_gen_meta_code_v0_title_only() {
1006        let result = gen_meta_code_v0("Die Unendliche Geschichte", None, None, 64).unwrap();
1007        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU74YAZIM");
1008        assert_eq!(result.name, "Die Unendliche Geschichte");
1009        assert_eq!(result.description, None);
1010        assert_eq!(result.meta, None);
1011    }
1012
1013    #[test]
1014    fn test_gen_meta_code_v0_title_description() {
1015        let result = gen_meta_code_v0(
1016            "Die Unendliche Geschichte",
1017            Some("Von Michael Ende"),
1018            None,
1019            64,
1020        )
1021        .unwrap();
1022        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU4E45RB5");
1023        assert_eq!(result.name, "Die Unendliche Geschichte");
1024        assert_eq!(result.description, Some("Von Michael Ende".to_string()));
1025        assert_eq!(result.meta, None);
1026    }
1027
1028    #[test]
1029    fn test_gen_meta_code_v0_json_meta() {
1030        let result = gen_meta_code_v0("Hello", None, Some(r#"{"some":"object"}"#), 64).unwrap();
1031        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN63LHL2");
1032        assert!(result.meta.is_some());
1033        assert!(
1034            result
1035                .meta
1036                .unwrap()
1037                .starts_with("data:application/json;base64,")
1038        );
1039    }
1040
1041    #[test]
1042    fn test_gen_meta_code_v0_data_url_meta() {
1043        let result = gen_meta_code_v0(
1044            "Hello",
1045            None,
1046            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9"),
1047            64,
1048        )
1049        .unwrap();
1050        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN43ICP2");
1051        // Data-URL is passed through as-is
1052        assert_eq!(
1053            result.meta,
1054            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9".to_string())
1055        );
1056    }
1057
1058    /// Verify that JSON metadata with float values is canonicalized per RFC 8785 (JCS).
1059    ///
1060    /// JCS serializes `1.0` as `1` (integer form), while `serde_json` preserves `1.0`.
1061    /// This causes different canonical bytes, different metahash, and different ISCC codes.
1062    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1.0})`.
1063    #[test]
1064    fn test_gen_meta_code_v0_jcs_float_canonicalization() {
1065        // JCS canonicalizes {"value": 1.0} → {"value":1} (integer form)
1066        // serde_json produces {"value":1.0} (preserves float notation)
1067        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1.0}"#), 64).unwrap();
1068
1069        // Expected values from iscc-core (Python) using jcs.canonicalize()
1070        assert_eq!(
1071            result.iscc, "ISCC:AAAX4GX3RZH2I6QZ",
1072            "ISCC mismatch: parse_meta_json must use RFC 8785 (JCS) canonicalization"
1073        );
1074        assert_eq!(
1075            result.meta,
1076            Some("data:application/json;base64,eyJ2YWx1ZSI6MX0=".to_string()),
1077            "meta Data-URL mismatch: JCS should serialize 1.0 as 1"
1078        );
1079        assert_eq!(
1080            result.metahash, "1e2010b291d392b6999ffe4aa4661fb343fc371fca3bfb5bb4e8d8226fdf85743232",
1081            "metahash mismatch: canonical bytes differ between JCS and serde_json"
1082        );
1083    }
1084
1085    /// Verify JCS number formatting for large floats (scientific notation edge case).
1086    ///
1087    /// JCS serializes `1e20` as `100000000000000000000` (expanded integer form).
1088    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1e20})`.
1089    #[test]
1090    fn test_gen_meta_code_v0_jcs_large_float_canonicalization() {
1091        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1e20}"#), 64).unwrap();
1092
1093        assert_eq!(
1094            result.iscc, "ISCC:AAAX4GX3R32YH5P7",
1095            "ISCC mismatch: JCS should expand 1e20 to 100000000000000000000"
1096        );
1097        assert_eq!(
1098            result.meta,
1099            Some(
1100                "data:application/json;base64,eyJ2YWx1ZSI6MTAwMDAwMDAwMDAwMDAwMDAwMDAwfQ=="
1101                    .to_string()
1102            ),
1103            "meta Data-URL mismatch: JCS should expand large float to integer form"
1104        );
1105        assert_eq!(
1106            result.metahash, "1e201ff83c1822c348717658a0b4713739646da7c59832691b337a457416ddd1c73d",
1107            "metahash mismatch: canonical bytes differ for large float"
1108        );
1109    }
1110
1111    #[test]
1112    fn test_gen_meta_code_v0_invalid_json() {
1113        assert!(matches!(
1114            gen_meta_code_v0("test", None, Some("not json"), 64),
1115            Err(IsccError::InvalidInput(_))
1116        ));
1117    }
1118
1119    #[test]
1120    fn test_gen_meta_code_v0_invalid_data_url() {
1121        assert!(matches!(
1122            gen_meta_code_v0("test", None, Some("data:no-comma-here"), 64),
1123            Err(IsccError::InvalidInput(_))
1124        ));
1125    }
1126
1127    #[test]
1128    fn test_gen_meta_code_v0_conformance() {
1129        let json_str = include_str!("../tests/data.json");
1130        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1131        let section = &data["gen_meta_code_v0"];
1132        let cases = section.as_object().unwrap();
1133
1134        let mut tested = 0;
1135
1136        for (tc_name, tc) in cases {
1137            let inputs = tc["inputs"].as_array().unwrap();
1138            let input_name = inputs[0].as_str().unwrap();
1139            let input_desc = inputs[1].as_str().unwrap();
1140            let meta_val = &inputs[2];
1141            let bits = inputs[3].as_u64().unwrap() as u32;
1142
1143            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1144            let expected_metahash = tc["outputs"]["metahash"].as_str().unwrap();
1145
1146            // Dispatch meta parameter based on JSON value type
1147            let meta_arg: Option<String> = match meta_val {
1148                serde_json::Value::Null => None,
1149                serde_json::Value::String(s) => Some(s.clone()),
1150                serde_json::Value::Object(_) => Some(serde_json::to_string(meta_val).unwrap()),
1151                other => panic!("unexpected meta type in {tc_name}: {other:?}"),
1152            };
1153
1154            let desc = if input_desc.is_empty() {
1155                None
1156            } else {
1157                Some(input_desc)
1158            };
1159
1160            // Verify ISCC output from struct
1161            let result = gen_meta_code_v0(input_name, desc, meta_arg.as_deref(), bits)
1162                .unwrap_or_else(|e| panic!("gen_meta_code_v0 failed for {tc_name}: {e}"));
1163            assert_eq!(
1164                result.iscc, expected_iscc,
1165                "ISCC mismatch in test case {tc_name}"
1166            );
1167
1168            // Verify metahash from struct
1169            assert_eq!(
1170                result.metahash, expected_metahash,
1171                "metahash mismatch in test case {tc_name}"
1172            );
1173
1174            // Verify name from struct
1175            if let Some(expected_name) = tc["outputs"].get("name") {
1176                let expected_name = expected_name.as_str().unwrap();
1177                assert_eq!(
1178                    result.name, expected_name,
1179                    "name mismatch in test case {tc_name}"
1180                );
1181            }
1182
1183            // Verify description from struct
1184            if let Some(expected_desc) = tc["outputs"].get("description") {
1185                let expected_desc = expected_desc.as_str().unwrap();
1186                assert_eq!(
1187                    result.description.as_deref(),
1188                    Some(expected_desc),
1189                    "description mismatch in test case {tc_name}"
1190                );
1191            }
1192
1193            // Verify meta from struct
1194            if meta_arg.is_some() {
1195                assert!(
1196                    result.meta.is_some(),
1197                    "meta should be present in test case {tc_name}"
1198                );
1199            } else {
1200                assert!(
1201                    result.meta.is_none(),
1202                    "meta should be absent in test case {tc_name}"
1203                );
1204            }
1205
1206            tested += 1;
1207        }
1208
1209        assert_eq!(tested, 16, "expected 16 conformance tests to run");
1210    }
1211
1212    #[test]
1213    fn test_gen_text_code_v0_empty() {
1214        let result = gen_text_code_v0("", 64).unwrap();
1215        assert_eq!(result.iscc, "ISCC:EAASL4F2WZY7KBXB");
1216        assert_eq!(result.characters, 0);
1217    }
1218
1219    #[test]
1220    fn test_gen_text_code_v0_hello_world() {
1221        let result = gen_text_code_v0("Hello World", 64).unwrap();
1222        assert_eq!(result.iscc, "ISCC:EAASKDNZNYGUUF5A");
1223        assert_eq!(result.characters, 10); // "helloworld" after collapse
1224    }
1225
1226    #[test]
1227    fn test_gen_text_code_v0_conformance() {
1228        let json_str = include_str!("../tests/data.json");
1229        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1230        let section = &data["gen_text_code_v0"];
1231        let cases = section.as_object().unwrap();
1232
1233        let mut tested = 0;
1234
1235        for (tc_name, tc) in cases {
1236            let inputs = tc["inputs"].as_array().unwrap();
1237            let input_text = inputs[0].as_str().unwrap();
1238            let bits = inputs[1].as_u64().unwrap() as u32;
1239
1240            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1241            let expected_chars = tc["outputs"]["characters"].as_u64().unwrap() as usize;
1242
1243            // Verify ISCC output from struct
1244            let result = gen_text_code_v0(input_text, bits)
1245                .unwrap_or_else(|e| panic!("gen_text_code_v0 failed for {tc_name}: {e}"));
1246            assert_eq!(
1247                result.iscc, expected_iscc,
1248                "ISCC mismatch in test case {tc_name}"
1249            );
1250
1251            // Verify character count from struct
1252            assert_eq!(
1253                result.characters, expected_chars,
1254                "character count mismatch in test case {tc_name}"
1255            );
1256
1257            tested += 1;
1258        }
1259
1260        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1261    }
1262
1263    #[test]
1264    fn test_gen_image_code_v0_all_black() {
1265        let pixels = vec![0u8; 1024];
1266        let result = gen_image_code_v0(&pixels, 64).unwrap();
1267        assert_eq!(result.iscc, "ISCC:EEAQAAAAAAAAAAAA");
1268    }
1269
1270    #[test]
1271    fn test_gen_image_code_v0_all_white() {
1272        let pixels = vec![255u8; 1024];
1273        let result = gen_image_code_v0(&pixels, 128).unwrap();
1274        assert_eq!(result.iscc, "ISCC:EEBYAAAAAAAAAAAAAAAAAAAAAAAAA");
1275    }
1276
1277    #[test]
1278    fn test_gen_image_code_v0_invalid_pixel_count() {
1279        assert!(gen_image_code_v0(&[0u8; 100], 64).is_err());
1280    }
1281
1282    #[test]
1283    fn test_gen_image_code_v0_conformance() {
1284        let json_str = include_str!("../tests/data.json");
1285        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1286        let section = &data["gen_image_code_v0"];
1287        let cases = section.as_object().unwrap();
1288
1289        let mut tested = 0;
1290
1291        for (tc_name, tc) in cases {
1292            let inputs = tc["inputs"].as_array().unwrap();
1293            let pixels_json = inputs[0].as_array().unwrap();
1294            let bits = inputs[1].as_u64().unwrap() as u32;
1295            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1296
1297            let pixels: Vec<u8> = pixels_json
1298                .iter()
1299                .map(|v| v.as_u64().unwrap() as u8)
1300                .collect();
1301
1302            let result = gen_image_code_v0(&pixels, bits)
1303                .unwrap_or_else(|e| panic!("gen_image_code_v0 failed for {tc_name}: {e}"));
1304            assert_eq!(
1305                result.iscc, expected_iscc,
1306                "ISCC mismatch in test case {tc_name}"
1307            );
1308
1309            tested += 1;
1310        }
1311
1312        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1313    }
1314
1315    #[test]
1316    fn test_gen_audio_code_v0_empty() {
1317        let result = gen_audio_code_v0(&[], 64).unwrap();
1318        assert_eq!(result.iscc, "ISCC:EIAQAAAAAAAAAAAA");
1319    }
1320
1321    #[test]
1322    fn test_gen_audio_code_v0_single() {
1323        let result = gen_audio_code_v0(&[1], 128).unwrap();
1324        assert_eq!(result.iscc, "ISCC:EIBQAAAAAEAAAAABAAAAAAAAAAAAA");
1325    }
1326
1327    #[test]
1328    fn test_gen_audio_code_v0_negative() {
1329        let result = gen_audio_code_v0(&[-1, 0, 1], 256).unwrap();
1330        assert_eq!(
1331            result.iscc,
1332            "ISCC:EIDQAAAAAH777777AAAAAAAAAAAACAAAAAAP777774AAAAAAAAAAAAI"
1333        );
1334    }
1335
1336    #[test]
1337    fn test_gen_audio_code_v0_conformance() {
1338        let json_str = include_str!("../tests/data.json");
1339        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1340        let section = &data["gen_audio_code_v0"];
1341        let cases = section.as_object().unwrap();
1342
1343        let mut tested = 0;
1344
1345        for (tc_name, tc) in cases {
1346            let inputs = tc["inputs"].as_array().unwrap();
1347            let cv_json = inputs[0].as_array().unwrap();
1348            let bits = inputs[1].as_u64().unwrap() as u32;
1349            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1350
1351            let cv: Vec<i32> = cv_json.iter().map(|v| v.as_i64().unwrap() as i32).collect();
1352
1353            let result = gen_audio_code_v0(&cv, bits)
1354                .unwrap_or_else(|e| panic!("gen_audio_code_v0 failed for {tc_name}: {e}"));
1355            assert_eq!(
1356                result.iscc, expected_iscc,
1357                "ISCC mismatch in test case {tc_name}"
1358            );
1359
1360            tested += 1;
1361        }
1362
1363        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1364    }
1365
1366    #[test]
1367    fn test_array_split_even() {
1368        let data = vec![1, 2, 3, 4];
1369        let parts = array_split(&data, 4);
1370        assert_eq!(parts, vec![&[1][..], &[2][..], &[3][..], &[4][..]]);
1371    }
1372
1373    #[test]
1374    fn test_array_split_remainder() {
1375        let data = vec![1, 2, 3, 4, 5];
1376        let parts = array_split(&data, 3);
1377        assert_eq!(parts, vec![&[1, 2][..], &[3, 4][..], &[5][..]]);
1378    }
1379
1380    #[test]
1381    fn test_array_split_more_parts_than_elements() {
1382        let data = vec![1, 2];
1383        let parts = array_split(&data, 4);
1384        assert_eq!(
1385            parts,
1386            vec![&[1][..], &[2][..], &[][..] as &[i32], &[][..] as &[i32]]
1387        );
1388    }
1389
1390    #[test]
1391    fn test_array_split_empty() {
1392        let data: Vec<i32> = vec![];
1393        let parts = array_split(&data, 3);
1394        assert_eq!(
1395            parts,
1396            vec![&[][..] as &[i32], &[][..] as &[i32], &[][..] as &[i32]]
1397        );
1398    }
1399
1400    #[test]
1401    fn test_gen_video_code_v0_empty_frames() {
1402        let frames: Vec<Vec<i32>> = vec![];
1403        assert!(matches!(
1404            gen_video_code_v0(&frames, 64),
1405            Err(IsccError::InvalidInput(_))
1406        ));
1407    }
1408
1409    #[test]
1410    fn test_gen_video_code_v0_conformance() {
1411        let json_str = include_str!("../tests/data.json");
1412        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1413        let section = &data["gen_video_code_v0"];
1414        let cases = section.as_object().unwrap();
1415
1416        let mut tested = 0;
1417
1418        for (tc_name, tc) in cases {
1419            let inputs = tc["inputs"].as_array().unwrap();
1420            let frames_json = inputs[0].as_array().unwrap();
1421            let bits = inputs[1].as_u64().unwrap() as u32;
1422            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1423
1424            let frame_sigs: Vec<Vec<i32>> = frames_json
1425                .iter()
1426                .map(|frame| {
1427                    frame
1428                        .as_array()
1429                        .unwrap()
1430                        .iter()
1431                        .map(|v| v.as_i64().unwrap() as i32)
1432                        .collect()
1433                })
1434                .collect();
1435
1436            let result = gen_video_code_v0(&frame_sigs, bits)
1437                .unwrap_or_else(|e| panic!("gen_video_code_v0 failed for {tc_name}: {e}"));
1438            assert_eq!(
1439                result.iscc, expected_iscc,
1440                "ISCC mismatch in test case {tc_name}"
1441            );
1442
1443            tested += 1;
1444        }
1445
1446        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1447    }
1448
1449    #[test]
1450    fn test_gen_mixed_code_v0_conformance() {
1451        let json_str = include_str!("../tests/data.json");
1452        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1453        let section = &data["gen_mixed_code_v0"];
1454        let cases = section.as_object().unwrap();
1455
1456        let mut tested = 0;
1457
1458        for (tc_name, tc) in cases {
1459            let inputs = tc["inputs"].as_array().unwrap();
1460            let codes_json = inputs[0].as_array().unwrap();
1461            let bits = inputs[1].as_u64().unwrap() as u32;
1462            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1463            let expected_parts: Vec<&str> = tc["outputs"]["parts"]
1464                .as_array()
1465                .unwrap()
1466                .iter()
1467                .map(|v| v.as_str().unwrap())
1468                .collect();
1469
1470            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();
1471
1472            let result = gen_mixed_code_v0(&codes, bits)
1473                .unwrap_or_else(|e| panic!("gen_mixed_code_v0 failed for {tc_name}: {e}"));
1474            assert_eq!(
1475                result.iscc, expected_iscc,
1476                "ISCC mismatch in test case {tc_name}"
1477            );
1478
1479            // Verify parts from struct match expected
1480            let result_parts: Vec<&str> = result.parts.iter().map(|s| s.as_str()).collect();
1481            assert_eq!(
1482                result_parts, expected_parts,
1483                "parts mismatch in test case {tc_name}"
1484            );
1485
1486            tested += 1;
1487        }
1488
1489        assert_eq!(tested, 2, "expected 2 conformance tests to run");
1490    }
1491
1492    #[test]
1493    fn test_gen_mixed_code_v0_too_few_codes() {
1494        assert!(matches!(
1495            gen_mixed_code_v0(&["EUA6GIKXN42IQV3S"], 64),
1496            Err(IsccError::InvalidInput(_))
1497        ));
1498    }
1499
1500    /// Build raw Content-Code bytes (header + body) for a given bit length.
1501    fn make_content_code_raw(stype: codec::SubType, bit_length: u32) -> Vec<u8> {
1502        let nbytes = (bit_length / 8) as usize;
1503        let body: Vec<u8> = (0..nbytes).map(|i| (i & 0xFF) as u8).collect();
1504        let base32 = codec::encode_component(
1505            codec::MainType::Content,
1506            stype,
1507            codec::Version::V0,
1508            bit_length,
1509            &body,
1510        )
1511        .unwrap();
1512        codec::decode_base32(&base32).unwrap()
1513    }
1514
1515    #[test]
1516    fn test_soft_hash_codes_v0_rejects_short_code() {
1517        // One code with 64 bits, one with only 32 bits — should reject when requesting 64
1518        let code_64 = make_content_code_raw(codec::SubType::None, 64);
1519        let code_32 = make_content_code_raw(codec::SubType::Image, 32);
1520        let result = soft_hash_codes_v0(&[code_64, code_32], 64);
1521        assert!(
1522            matches!(&result, Err(IsccError::InvalidInput(msg)) if msg.contains("too short")),
1523            "expected InvalidInput with 'too short', got {result:?}"
1524        );
1525    }
1526
1527    #[test]
1528    fn test_soft_hash_codes_v0_accepts_exact_length() {
1529        // Two codes with exactly 64 bits each — should succeed when requesting 64
1530        let code_a = make_content_code_raw(codec::SubType::None, 64);
1531        let code_b = make_content_code_raw(codec::SubType::Image, 64);
1532        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1533        assert!(result.is_ok(), "expected Ok, got {result:?}");
1534    }
1535
1536    #[test]
1537    fn test_soft_hash_codes_v0_accepts_longer_codes() {
1538        // Two codes with 128 bits each — should succeed when requesting 64
1539        let code_a = make_content_code_raw(codec::SubType::None, 128);
1540        let code_b = make_content_code_raw(codec::SubType::Audio, 128);
1541        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1542        assert!(result.is_ok(), "expected Ok, got {result:?}");
1543    }
1544
1545    #[test]
1546    fn test_gen_data_code_v0_conformance() {
1547        let json_str = include_str!("../tests/data.json");
1548        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1549        let section = &data["gen_data_code_v0"];
1550        let cases = section.as_object().unwrap();
1551
1552        let mut tested = 0;
1553
1554        for (tc_name, tc) in cases {
1555            let inputs = tc["inputs"].as_array().unwrap();
1556            let stream_str = inputs[0].as_str().unwrap();
1557            let bits = inputs[1].as_u64().unwrap() as u32;
1558            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1559
1560            // Parse "stream:" prefix — remainder is hex-encoded bytes
1561            let hex_data = stream_str
1562                .strip_prefix("stream:")
1563                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {tc_name}"));
1564            let input_bytes = hex::decode(hex_data)
1565                .unwrap_or_else(|e| panic!("invalid hex in test case {tc_name}: {e}"));
1566
1567            let result = gen_data_code_v0(&input_bytes, bits)
1568                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {tc_name}: {e}"));
1569            assert_eq!(
1570                result.iscc, expected_iscc,
1571                "ISCC mismatch in test case {tc_name}"
1572            );
1573
1574            tested += 1;
1575        }
1576
1577        assert_eq!(tested, 4, "expected 4 conformance tests to run");
1578    }
1579
1580    #[test]
1581    fn test_gen_instance_code_v0_empty() {
1582        let result = gen_instance_code_v0(b"", 64).unwrap();
1583        assert_eq!(result.iscc, "ISCC:IAA26E2JXH27TING");
1584        assert_eq!(result.filesize, 0);
1585        assert_eq!(
1586            result.datahash,
1587            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
1588        );
1589    }
1590
1591    #[test]
1592    fn test_gen_instance_code_v0_conformance() {
1593        let json_str = include_str!("../tests/data.json");
1594        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1595        let section = &data["gen_instance_code_v0"];
1596        let cases = section.as_object().unwrap();
1597
1598        for (name, tc) in cases {
1599            let inputs = tc["inputs"].as_array().unwrap();
1600            let stream_str = inputs[0].as_str().unwrap();
1601            let bits = inputs[1].as_u64().unwrap() as u32;
1602            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1603
1604            // Parse "stream:" prefix — remainder is hex-encoded bytes
1605            let hex_data = stream_str
1606                .strip_prefix("stream:")
1607                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
1608            let input_bytes = hex::decode(hex_data)
1609                .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));
1610
1611            let result = gen_instance_code_v0(&input_bytes, bits)
1612                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));
1613            assert_eq!(
1614                result.iscc, expected_iscc,
1615                "ISCC mismatch in test case {name}"
1616            );
1617
1618            // Verify datahash from struct
1619            if let Some(expected_datahash) = tc["outputs"].get("datahash") {
1620                let expected_datahash = expected_datahash.as_str().unwrap();
1621                assert_eq!(
1622                    result.datahash, expected_datahash,
1623                    "datahash mismatch in test case {name}"
1624                );
1625            }
1626
1627            // Verify filesize from struct
1628            if let Some(expected_filesize) = tc["outputs"].get("filesize") {
1629                let expected_filesize = expected_filesize.as_u64().unwrap();
1630                assert_eq!(
1631                    result.filesize, expected_filesize,
1632                    "filesize mismatch in test case {name}"
1633                );
1634            }
1635
1636            // Also verify filesize matches input data length
1637            assert_eq!(
1638                result.filesize,
1639                input_bytes.len() as u64,
1640                "filesize should match input length in test case {name}"
1641            );
1642        }
1643    }
1644
1645    #[test]
1646    fn test_gen_iscc_code_v0_conformance() {
1647        let json_str = include_str!("../tests/data.json");
1648        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1649        let section = &data["gen_iscc_code_v0"];
1650        let cases = section.as_object().unwrap();
1651
1652        let mut tested = 0;
1653
1654        for (tc_name, tc) in cases {
1655            let inputs = tc["inputs"].as_array().unwrap();
1656            let codes_json = inputs[0].as_array().unwrap();
1657            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1658
1659            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();
1660
1661            let result = gen_iscc_code_v0(&codes, false)
1662                .unwrap_or_else(|e| panic!("gen_iscc_code_v0 failed for {tc_name}: {e}"));
1663            assert_eq!(
1664                result.iscc, expected_iscc,
1665                "ISCC mismatch in test case {tc_name}"
1666            );
1667
1668            tested += 1;
1669        }
1670
1671        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1672    }
1673
1674    #[test]
1675    fn test_gen_iscc_code_v0_too_few_codes() {
1676        assert!(matches!(
1677            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG"], false),
1678            Err(IsccError::InvalidInput(_))
1679        ));
1680    }
1681
1682    #[test]
1683    fn test_gen_iscc_code_v0_missing_instance() {
1684        // Two Meta codes — missing Data and Instance
1685        assert!(matches!(
1686            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG", "AAAWKLHFPV6OPKDG"], false),
1687            Err(IsccError::InvalidInput(_))
1688        ));
1689    }
1690
1691    #[test]
1692    fn test_gen_iscc_code_v0_short_code() {
1693        // Code too short (< 16 chars)
1694        assert!(matches!(
1695            gen_iscc_code_v0(&["AAAWKLHFPV6", "AAAWKLHFPV6OPKDG"], false),
1696            Err(IsccError::InvalidInput(_))
1697        ));
1698    }
1699
1700    /// Verify that a Data-URL with empty base64 payload enters the meta bytes path.
1701    ///
1702    /// Python reference: `if meta:` is truthy for `"data:application/json;base64,"` (non-empty
1703    /// string), so it enters the meta branch with `payload = b""`. The result must have
1704    /// `meta = Some(...)` containing the original Data-URL and `metahash` equal to
1705    /// `multi_hash_blake3(&[])` (BLAKE3 of empty bytes).
1706    #[test]
1707    fn test_gen_meta_code_empty_data_url_enters_meta_branch() {
1708        let result =
1709            gen_meta_code_v0("Test", None, Some("data:application/json;base64,"), 64).unwrap();
1710
1711        // Result should be Ok
1712        assert_eq!(result.name, "Test");
1713
1714        // meta should contain the original Data-URL string (not None)
1715        assert_eq!(
1716            result.meta,
1717            Some("data:application/json;base64,".to_string()),
1718            "empty Data-URL payload should still enter meta branch"
1719        );
1720
1721        // metahash should be BLAKE3 of empty bytes
1722        let expected_metahash = utils::multi_hash_blake3(&[]);
1723        assert_eq!(
1724            result.metahash, expected_metahash,
1725            "metahash should be BLAKE3 of empty bytes"
1726        );
1727    }
1728
1729    /// Verify that `soft_hash_meta_v0_with_bytes` with empty bytes produces the same
1730    /// digest as `soft_hash_meta_v0` with no extra text.
1731    ///
1732    /// Python reference (`code_meta.py:142`): `if extra in {None, "", b""}:` returns
1733    /// name-only simhash without interleaving for all empty-like values.
1734    #[test]
1735    fn test_soft_hash_meta_v0_with_bytes_empty_equals_name_only() {
1736        let name_only = soft_hash_meta_v0("test", None);
1737        let empty_bytes = soft_hash_meta_v0_with_bytes("test", &[]);
1738        assert_eq!(
1739            name_only, empty_bytes,
1740            "empty bytes should produce same digest as name-only (no interleaving)"
1741        );
1742    }
1743
1744    // ---- Algorithm constants tests ----
1745
1746    #[test]
1747    fn test_meta_trim_name_value() {
1748        assert_eq!(META_TRIM_NAME, 128);
1749    }
1750
1751    #[test]
1752    fn test_meta_trim_description_value() {
1753        assert_eq!(META_TRIM_DESCRIPTION, 4096);
1754    }
1755
1756    #[test]
1757    fn test_io_read_size_value() {
1758        assert_eq!(IO_READ_SIZE, 4_194_304);
1759    }
1760
1761    #[test]
1762    fn test_text_ngram_size_value() {
1763        assert_eq!(TEXT_NGRAM_SIZE, 13);
1764    }
1765
1766    // ---- encode_component Tier 1 wrapper tests ----
1767
1768    /// Encode a known digest and verify the output matches the codec version.
1769    #[test]
1770    fn test_encode_component_matches_codec() {
1771        let digest = [0xABu8; 8];
1772        let tier1 = encode_component(3, 0, 0, 64, &digest).unwrap();
1773        let tier2 = codec::encode_component(
1774            codec::MainType::Data,
1775            codec::SubType::None,
1776            codec::Version::V0,
1777            64,
1778            &digest,
1779        )
1780        .unwrap();
1781        assert_eq!(tier1, tier2);
1782    }
1783
1784    /// Round-trip: encode a digest and verify the result is a valid ISCC unit.
1785    #[test]
1786    fn test_encode_component_round_trip() {
1787        let digest = [0x42u8; 32];
1788        let result = encode_component(0, 0, 0, 64, &digest).unwrap();
1789        // Meta-Code with 64-bit digest should start with "AA"
1790        assert!(!result.is_empty());
1791    }
1792
1793    /// Reject MainType::Iscc (value 5).
1794    #[test]
1795    fn test_encode_component_rejects_iscc() {
1796        let result = encode_component(5, 0, 0, 64, &[0u8; 8]);
1797        assert!(result.is_err());
1798    }
1799
1800    /// Reject digest shorter than bit_length / 8.
1801    #[test]
1802    fn test_encode_component_rejects_short_digest() {
1803        let result = encode_component(0, 0, 0, 64, &[0u8; 4]);
1804        assert!(result.is_err());
1805        let err = result.unwrap_err().to_string();
1806        assert!(
1807            err.contains("digest length 4 < bit_length/8 (8)"),
1808            "unexpected error: {err}"
1809        );
1810    }
1811
1812    /// Reject invalid MainType value.
1813    #[test]
1814    fn test_encode_component_rejects_invalid_mtype() {
1815        let result = encode_component(99, 0, 0, 64, &[0u8; 8]);
1816        assert!(result.is_err());
1817    }
1818
1819    /// Reject invalid SubType value.
1820    #[test]
1821    fn test_encode_component_rejects_invalid_stype() {
1822        let result = encode_component(0, 99, 0, 64, &[0u8; 8]);
1823        assert!(result.is_err());
1824    }
1825
1826    /// Reject invalid Version value.
1827    #[test]
1828    fn test_encode_component_rejects_invalid_version() {
1829        let result = encode_component(0, 0, 99, 64, &[0u8; 8]);
1830        assert!(result.is_err());
1831    }
1832
1833    // ---- iscc_decode tests ----
1834
1835    /// Round-trip: encode a Meta-Code digest, decode back, verify all fields match.
1836    #[test]
1837    fn test_iscc_decode_round_trip_meta() {
1838        let digest = [0xaa_u8; 8];
1839        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1840        let (mt, st, vs, li, decoded_digest) = iscc_decode(&encoded).unwrap();
1841        assert_eq!(mt, 0, "MainType::Meta");
1842        assert_eq!(st, 0, "SubType::None");
1843        assert_eq!(vs, 0, "Version::V0");
1844        // encode_length(Meta, 64) → 64/32 - 1 = 1
1845        assert_eq!(li, 1, "length_index");
1846        assert_eq!(decoded_digest, digest.to_vec());
1847    }
1848
1849    /// Round-trip with Content-Code (MainType=2, SubType::TEXT=0).
1850    #[test]
1851    fn test_iscc_decode_round_trip_content() {
1852        let digest = [0xbb_u8; 8];
1853        let encoded = encode_component(2, 0, 0, 64, &digest).unwrap();
1854        let (mt, st, vs, _li, decoded_digest) = iscc_decode(&encoded).unwrap();
1855        assert_eq!(mt, 2, "MainType::Content");
1856        assert_eq!(st, 0, "SubType::TEXT");
1857        assert_eq!(vs, 0, "Version::V0");
1858        assert_eq!(decoded_digest, digest.to_vec());
1859    }
1860
1861    /// Round-trip with Data-Code (MainType=3).
1862    #[test]
1863    fn test_iscc_decode_round_trip_data() {
1864        let digest = [0xcc_u8; 8];
1865        let encoded = encode_component(3, 0, 0, 64, &digest).unwrap();
1866        let (mt, _st, _vs, _li, decoded_digest) = iscc_decode(&encoded).unwrap();
1867        assert_eq!(mt, 3, "MainType::Data");
1868        assert_eq!(decoded_digest, digest.to_vec());
1869    }
1870
1871    /// Round-trip with Instance-Code (MainType=4).
1872    #[test]
1873    fn test_iscc_decode_round_trip_instance() {
1874        let digest = [0xdd_u8; 8];
1875        let encoded = encode_component(4, 0, 0, 64, &digest).unwrap();
1876        let (mt, _st, _vs, _li, decoded_digest) = iscc_decode(&encoded).unwrap();
1877        assert_eq!(mt, 4, "MainType::Instance");
1878        assert_eq!(decoded_digest, digest.to_vec());
1879    }
1880
1881    /// Decode with "ISCC:" prefix produces the same result.
1882    #[test]
1883    fn test_iscc_decode_with_prefix() {
1884        let digest = [0xaa_u8; 8];
1885        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1886        let with_prefix = format!("ISCC:{encoded}");
1887        let (mt, st, vs, li, decoded_digest) = iscc_decode(&with_prefix).unwrap();
1888        assert_eq!(mt, 0);
1889        assert_eq!(st, 0);
1890        assert_eq!(vs, 0);
1891        assert_eq!(li, 1);
1892        assert_eq!(decoded_digest, digest.to_vec());
1893    }
1894
1895    /// Decode with dashes inserted in the string.
1896    #[test]
1897    fn test_iscc_decode_with_dashes() {
1898        let digest = [0xaa_u8; 8];
1899        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1900        // Insert dashes at arbitrary positions
1901        let with_dashes = format!("{}-{}-{}", &encoded[..4], &encoded[4..8], &encoded[8..]);
1902        let (mt, st, vs, li, decoded_digest) = iscc_decode(&with_dashes).unwrap();
1903        assert_eq!(mt, 0);
1904        assert_eq!(st, 0);
1905        assert_eq!(vs, 0);
1906        assert_eq!(li, 1);
1907        assert_eq!(decoded_digest, digest.to_vec());
1908    }
1909
1910    /// Error on invalid base32 characters.
1911    #[test]
1912    fn test_iscc_decode_invalid_base32() {
1913        let result = iscc_decode("!!!INVALID!!!");
1914        assert!(result.is_err());
1915        let err = result.unwrap_err().to_string();
1916        assert!(err.contains("base32"), "expected base32 error: {err}");
1917    }
1918
1919    /// Known value from conformance vectors: Meta-Code "ISCC:AAAZXZ6OU74YAZIM".
1920    /// MainType=Meta(0), SubType=None(0), Version=V0(0), 64-bit digest.
1921    #[test]
1922    fn test_iscc_decode_known_meta_code() {
1923        let (mt, st, vs, li, digest) = iscc_decode("ISCC:AAAZXZ6OU74YAZIM").unwrap();
1924        assert_eq!(mt, 0, "MainType::Meta");
1925        assert_eq!(st, 0, "SubType::None");
1926        assert_eq!(vs, 0, "Version::V0");
1927        assert_eq!(li, 1, "length_index for 64-bit");
1928        assert_eq!(digest.len(), 8, "64-bit = 8 bytes");
1929    }
1930
1931    /// Known value from conformance vectors: Instance-Code "ISCC:IAA26E2JXH27TING".
1932    /// MainType=Instance(4), SubType=None(0), Version=V0(0), 64-bit digest.
1933    #[test]
1934    fn test_iscc_decode_known_instance_code() {
1935        let (mt, st, vs, li, digest) = iscc_decode("ISCC:IAA26E2JXH27TING").unwrap();
1936        assert_eq!(mt, 4, "MainType::Instance");
1937        assert_eq!(st, 0, "SubType::None");
1938        assert_eq!(vs, 0, "Version::V0");
1939        assert_eq!(li, 1, "length_index for 64-bit");
1940        assert_eq!(digest.len(), 8, "64-bit = 8 bytes");
1941    }
1942
1943    /// Known value: Data-Code "ISCC:GAAXL2XYM5BQIAZ3".
1944    /// MainType=Data(3), SubType=None(0), Version=V0(0), 64-bit digest.
1945    #[test]
1946    fn test_iscc_decode_known_data_code() {
1947        let (mt, st, vs, _li, digest) = iscc_decode("ISCC:GAAXL2XYM5BQIAZ3").unwrap();
1948        assert_eq!(mt, 3, "MainType::Data");
1949        assert_eq!(st, 0, "SubType::None");
1950        assert_eq!(vs, 0, "Version::V0");
1951        assert_eq!(digest.len(), 8, "64-bit = 8 bytes");
1952    }
1953
1954    /// Verification criterion: round-trip with specific known values.
1955    /// encode_component(0, 0, 0, 64, &[0xaa;8]) → iscc_decode → (0, 0, 0, 1, vec![0xaa;8])
1956    #[test]
1957    fn test_iscc_decode_verification_round_trip() {
1958        let digest = [0xaa_u8; 8];
1959        let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();
1960        let result = iscc_decode(&encoded).unwrap();
1961        assert_eq!(result, (0, 0, 0, 1, vec![0xaa; 8]));
1962    }
1963
1964    /// Error on truncated input where body is shorter than expected digest length.
1965    #[test]
1966    fn test_iscc_decode_truncated_input() {
1967        // Encode a valid 256-bit Meta-Code, then truncate the base32 string
1968        let digest = [0xff_u8; 32];
1969        let encoded = encode_component(0, 0, 0, 256, &digest).unwrap();
1970        // Truncate to just the header portion (first few chars)
1971        let truncated = &encoded[..6];
1972        let result = iscc_decode(truncated);
1973        assert!(result.is_err(), "should fail on truncated input");
1974    }
1975
1976    // --- json_to_data_url tests ---
1977
1978    /// Basic JSON object produces a data URL with application/json media type.
1979    #[test]
1980    fn test_json_to_data_url_basic() {
1981        let url = json_to_data_url(r#"{"key": "value"}"#).unwrap();
1982        assert!(
1983            url.starts_with("data:application/json;base64,"),
1984            "expected application/json prefix, got: {url}"
1985        );
1986    }
1987
1988    /// JSON with `@context` key uses application/ld+json media type.
1989    #[test]
1990    fn test_json_to_data_url_ld_json() {
1991        let url = json_to_data_url(r#"{"@context": "https://schema.org"}"#).unwrap();
1992        assert!(
1993            url.starts_with("data:application/ld+json;base64,"),
1994            "expected application/ld+json prefix, got: {url}"
1995        );
1996    }
1997
1998    /// JCS canonicalization reorders keys alphabetically.
1999    #[test]
2000    fn test_json_to_data_url_jcs_ordering() {
2001        let url = json_to_data_url(r#"{"b":1,"a":2}"#).unwrap();
2002        // Extract and decode the base64 payload
2003        let b64 = url.split_once(',').unwrap().1;
2004        let decoded = data_encoding::BASE64.decode(b64.as_bytes()).unwrap();
2005        let canonical = std::str::from_utf8(&decoded).unwrap();
2006        assert_eq!(canonical, r#"{"a":2,"b":1}"#, "JCS should sort keys");
2007    }
2008
2009    /// Round-trip: json_to_data_url output fed into decode_data_url recovers
2010    /// the JCS-canonical bytes.
2011    #[test]
2012    fn test_json_to_data_url_round_trip() {
2013        let input = r#"{"hello": "world", "num": 42}"#;
2014        let url = json_to_data_url(input).unwrap();
2015        let decoded_bytes = decode_data_url(&url).unwrap();
2016        // The decoded bytes should be JCS-canonical JSON
2017        let canonical: serde_json::Value =
2018            serde_json::from_slice(&decoded_bytes).expect("decoded bytes should be valid JSON");
2019        let original: serde_json::Value = serde_json::from_str(input).unwrap();
2020        assert_eq!(canonical, original, "round-trip preserves JSON semantics");
2021    }
2022
2023    /// Invalid JSON string returns an error.
2024    #[test]
2025    fn test_json_to_data_url_invalid_json() {
2026        let result = json_to_data_url("not json");
2027        assert!(result.is_err(), "should reject invalid JSON");
2028        let err = result.unwrap_err().to_string();
2029        assert!(
2030            err.contains("invalid JSON"),
2031            "expected 'invalid JSON' in error: {err}"
2032        );
2033    }
2034
2035    /// Compatibility with conformance vector test_0016_meta_data_url.
2036    ///
2037    /// The conformance vector's meta field is:
2038    ///   data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9
2039    /// which encodes `{"some": "object"}` (with space after colon).
2040    ///
2041    /// Our function differs in two ways:
2042    /// 1. No `charset=utf-8` parameter (matching Python's DataURL.from_byte_data)
2043    /// 2. JCS canonicalization removes whitespace: `{"some":"object"}` (no space)
2044    ///
2045    /// We verify: (a) correct media type prefix, and (b) decoded payload equals
2046    /// JCS-canonical form of the same JSON input.
2047    #[test]
2048    fn test_json_to_data_url_conformance_0016() {
2049        let url = json_to_data_url(r#"{"some": "object"}"#).unwrap();
2050        // (a) Correct media type prefix (no charset, no @context → application/json)
2051        assert!(
2052            url.starts_with("data:application/json;base64,"),
2053            "expected application/json prefix"
2054        );
2055        // (b) Decoded payload is JCS-canonical (no whitespace)
2056        let b64 = url.split_once(',').unwrap().1;
2057        let decoded = data_encoding::BASE64.decode(b64.as_bytes()).unwrap();
2058        let canonical = std::str::from_utf8(&decoded).unwrap();
2059        assert_eq!(
2060            canonical, r#"{"some":"object"}"#,
2061            "JCS removes whitespace from JSON"
2062        );
2063    }
2064
2065    #[test]
2066    fn test_meta_trim_meta_value() {
2067        assert_eq!(META_TRIM_META, 128_000);
2068    }
2069
2070    #[test]
2071    fn test_gen_meta_code_v0_meta_at_limit() {
2072        // Create a JSON payload that decodes to exactly 128,000 bytes
2073        // JSON: {"x":"<padding>"} where padding fills to 128,000 bytes
2074        // The canonical JSON overhead is {"x":""} = 8 bytes, so padding = 127,992 bytes
2075        let padding = "a".repeat(128_000 - 8);
2076        let json_str = format!(r#"{{"x":"{padding}"}}"#);
2077        let result = gen_meta_code_v0("test", None, Some(&json_str), 64);
2078        assert!(
2079            result.is_ok(),
2080            "payload at exactly META_TRIM_META should succeed"
2081        );
2082    }
2083
2084    #[test]
2085    fn test_gen_meta_code_v0_meta_over_limit() {
2086        // Create a JSON payload that decodes to 128,001 bytes (one over limit)
2087        let padding = "a".repeat(128_000 - 8 + 1);
2088        let json_str = format!(r#"{{"x":"{padding}"}}"#);
2089        let result = gen_meta_code_v0("test", None, Some(&json_str), 64);
2090        assert!(
2091            matches!(result, Err(IsccError::InvalidInput(ref msg)) if msg.contains("size limit")),
2092            "payload exceeding META_TRIM_META should return InvalidInput"
2093        );
2094    }
2095
2096    #[test]
2097    fn test_gen_meta_code_v0_data_url_pre_decode_reject() {
2098        // Create a Data-URL string exceeding the pre-decode limit
2099        // PRE_DECODE_LIMIT = META_TRIM_META * 4 / 3 + 256 = 170,922
2100        let pre_decode_limit = META_TRIM_META * 4 / 3 + 256;
2101        let padding = "A".repeat(pre_decode_limit + 1);
2102        let data_url = format!("data:application/octet-stream;base64,{padding}");
2103        let result = gen_meta_code_v0("test", None, Some(&data_url), 64);
2104        assert!(
2105            matches!(result, Err(IsccError::InvalidInput(ref msg)) if msg.contains("size limit")),
2106            "oversized Data-URL should be rejected before decoding"
2107        );
2108    }
2109
2110    // ---- gen_sum_code_v0 tests ----
2111
2112    /// Helper: write data to a unique temp file and return the path.
2113    fn write_temp_file(name: &str, data: &[u8]) -> std::path::PathBuf {
2114        let path = std::env::temp_dir().join(format!("iscc_test_{name}"));
2115        std::fs::write(&path, data).expect("failed to write temp file");
2116        path
2117    }
2118
2119    #[test]
2120    fn test_gen_sum_code_v0_equivalence() {
2121        let data = b"Hello, ISCC World! This is a test of gen_sum_code_v0.";
2122        let path = write_temp_file("sum_equiv", data);
2123
2124        let sum_result = gen_sum_code_v0(&path, 64, false).unwrap();
2125
2126        // Compute the same result via separate functions
2127        let data_result = gen_data_code_v0(data, 64).unwrap();
2128        let instance_result = gen_instance_code_v0(data, 64).unwrap();
2129        let iscc_result =
2130            gen_iscc_code_v0(&[&data_result.iscc, &instance_result.iscc], false).unwrap();
2131
2132        assert_eq!(sum_result.iscc, iscc_result.iscc);
2133        assert_eq!(sum_result.datahash, instance_result.datahash);
2134        assert_eq!(sum_result.filesize, instance_result.filesize);
2135        assert_eq!(sum_result.filesize, data.len() as u64);
2136
2137        std::fs::remove_file(&path).ok();
2138    }
2139
2140    #[test]
2141    fn test_gen_sum_code_v0_empty_file() {
2142        let path = write_temp_file("sum_empty", b"");
2143
2144        let sum_result = gen_sum_code_v0(&path, 64, false).unwrap();
2145
2146        let data_result = gen_data_code_v0(b"", 64).unwrap();
2147        let instance_result = gen_instance_code_v0(b"", 64).unwrap();
2148        let iscc_result =
2149            gen_iscc_code_v0(&[&data_result.iscc, &instance_result.iscc], false).unwrap();
2150
2151        assert_eq!(sum_result.iscc, iscc_result.iscc);
2152        assert_eq!(sum_result.datahash, instance_result.datahash);
2153        assert_eq!(sum_result.filesize, 0);
2154
2155        std::fs::remove_file(&path).ok();
2156    }
2157
2158    #[test]
2159    fn test_gen_sum_code_v0_file_not_found() {
2160        let path = std::env::temp_dir().join("iscc_test_nonexistent_file_xyz");
2161        let result = gen_sum_code_v0(&path, 64, false);
2162        assert!(result.is_err());
2163        let err_msg = result.unwrap_err().to_string();
2164        assert!(
2165            err_msg.contains("Cannot open file"),
2166            "error message should mention file open failure: {err_msg}"
2167        );
2168    }
2169
2170    #[test]
2171    fn test_gen_sum_code_v0_wide_mode() {
2172        let data = b"Testing wide mode for gen_sum_code_v0 function.";
2173        let path = write_temp_file("sum_wide", data);
2174
2175        let narrow = gen_sum_code_v0(&path, 64, false).unwrap();
2176        let wide = gen_sum_code_v0(&path, 64, true).unwrap();
2177
2178        // Wide mode with 64-bit codes doesn't trigger (need 128+), so they should be equal
2179        assert_eq!(narrow.iscc, wide.iscc);
2180
2181        // With 128 bits, wide mode should produce a different (longer) ISCC
2182        let narrow_128 = gen_sum_code_v0(&path, 128, false).unwrap();
2183        let wide_128 = gen_sum_code_v0(&path, 128, true).unwrap();
2184        assert_ne!(narrow_128.iscc, wide_128.iscc);
2185
2186        // Both should have the same datahash and filesize
2187        assert_eq!(narrow_128.datahash, wide_128.datahash);
2188        assert_eq!(narrow_128.filesize, wide_128.filesize);
2189
2190        std::fs::remove_file(&path).ok();
2191    }
2192
2193    #[test]
2194    fn test_gen_sum_code_v0_bits_64() {
2195        let data = b"Testing 64-bit gen_sum_code_v0.";
2196        let path = write_temp_file("sum_bits64", data);
2197
2198        let sum_result = gen_sum_code_v0(&path, 64, false).unwrap();
2199
2200        let data_result = gen_data_code_v0(data, 64).unwrap();
2201        let instance_result = gen_instance_code_v0(data, 64).unwrap();
2202        let iscc_result =
2203            gen_iscc_code_v0(&[&data_result.iscc, &instance_result.iscc], false).unwrap();
2204
2205        assert_eq!(sum_result.iscc, iscc_result.iscc);
2206
2207        std::fs::remove_file(&path).ok();
2208    }
2209
2210    #[test]
2211    fn test_gen_sum_code_v0_bits_128() {
2212        let data = b"Testing 128-bit gen_sum_code_v0.";
2213        let path = write_temp_file("sum_bits128", data);
2214
2215        let sum_result = gen_sum_code_v0(&path, 128, false).unwrap();
2216
2217        let data_result = gen_data_code_v0(data, 128).unwrap();
2218        let instance_result = gen_instance_code_v0(data, 128).unwrap();
2219        let iscc_result =
2220            gen_iscc_code_v0(&[&data_result.iscc, &instance_result.iscc], false).unwrap();
2221
2222        assert_eq!(sum_result.iscc, iscc_result.iscc);
2223        assert_eq!(sum_result.datahash, instance_result.datahash);
2224        assert_eq!(sum_result.filesize, data.len() as u64);
2225
2226        std::fs::remove_file(&path).ok();
2227    }
2228
2229    #[test]
2230    fn test_gen_sum_code_v0_large_data() {
2231        // Generate data large enough to produce multiple CDC chunks
2232        let data: Vec<u8> = (0..50_000).map(|i| (i % 256) as u8).collect();
2233        let path = write_temp_file("sum_large", &data);
2234
2235        let sum_result = gen_sum_code_v0(&path, 64, false).unwrap();
2236
2237        let data_result = gen_data_code_v0(&data, 64).unwrap();
2238        let instance_result = gen_instance_code_v0(&data, 64).unwrap();
2239        let iscc_result =
2240            gen_iscc_code_v0(&[&data_result.iscc, &instance_result.iscc], false).unwrap();
2241
2242        assert_eq!(sum_result.iscc, iscc_result.iscc);
2243        assert_eq!(sum_result.datahash, instance_result.datahash);
2244        assert_eq!(sum_result.filesize, data.len() as u64);
2245
2246        std::fs::remove_file(&path).ok();
2247    }
2248}