Skip to main content

iscc_lib/
lib.rs

1//! High-performance Rust implementation of ISO 24138:2024 (ISCC).
2//!
3//! This crate provides the core ISCC algorithm implementations. All 9 `gen_*_v0`
4//! functions are the public Tier 1 API surface, designed to be compatible with
5//! the `iscc-core` Python reference implementation.
6
7pub mod cdc;
8pub mod codec;
9pub mod conformance;
10pub(crate) mod dct;
11pub mod minhash;
12pub mod simhash;
13pub mod streaming;
14pub mod types;
15pub mod utils;
16pub(crate) mod wtahash;
17
18pub use cdc::alg_cdc_chunks;
19pub use codec::encode_base64;
20pub use codec::iscc_decompose;
21pub use conformance::conformance_selftest;
22pub use minhash::alg_minhash_256;
23pub use simhash::{alg_simhash, sliding_window};
24pub use streaming::{DataHasher, InstanceHasher};
25pub use types::*;
26pub use utils::{text_clean, text_collapse, text_remove_newlines, text_trim};
27
28/// Error type for ISCC operations.
29#[derive(Debug, thiserror::Error)]
30pub enum IsccError {
31    /// Input data is invalid.
32    #[error("invalid input: {0}")]
33    InvalidInput(String),
34}
35
36/// Result type alias for ISCC operations.
37pub type IsccResult<T> = Result<T, IsccError>;
38
/// Interleave the leading halves of two digests in 4-byte groups.
///
/// Only the first 16 bytes of `a` and of `b` are read. The 32-byte output
/// is `a[0..4]`, `b[0..4]`, `a[4..8]`, `b[4..8]`, and so on for four rounds.
///
/// # Panics
///
/// Panics if either input is shorter than 16 bytes.
fn interleave_digests(a: &[u8], b: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(32);
    for (left, right) in a[..16].chunks_exact(4).zip(b[..16].chunks_exact(4)) {
        out.extend_from_slice(left);
        out.extend_from_slice(right);
    }
    out
}
55
56/// Compute a SimHash digest from the name text for meta hashing.
57///
58/// Applies `text_collapse`, generates width-3 sliding window n-grams,
59/// hashes each with BLAKE3, and produces a SimHash.
60fn meta_name_simhash(name: &str) -> Vec<u8> {
61    let collapsed_name = utils::text_collapse(name);
62    let name_ngrams = simhash::sliding_window_strs(&collapsed_name, 3);
63    let name_hashes: Vec<[u8; 32]> = name_ngrams
64        .iter()
65        .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
66        .collect();
67    simhash::alg_simhash_inner(&name_hashes)
68}
69
70/// Compute a similarity-preserving 256-bit hash from metadata text.
71///
72/// Produces a SimHash digest from `name` n-grams. When `extra` is provided,
73/// interleaves the name and extra SimHash digests in 4-byte chunks.
74fn soft_hash_meta_v0(name: &str, extra: Option<&str>) -> Vec<u8> {
75    let name_simhash = meta_name_simhash(name);
76
77    match extra {
78        None | Some("") => name_simhash,
79        Some(extra_str) => {
80            let collapsed_extra = utils::text_collapse(extra_str);
81            let extra_ngrams = simhash::sliding_window_strs(&collapsed_extra, 3);
82            let extra_hashes: Vec<[u8; 32]> = extra_ngrams
83                .iter()
84                .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
85                .collect();
86            let extra_simhash = simhash::alg_simhash_inner(&extra_hashes);
87
88            interleave_digests(&name_simhash, &extra_simhash)
89        }
90    }
91}
92
93/// Compute a similarity-preserving 256-bit hash from name text and raw bytes.
94///
95/// Like `soft_hash_meta_v0` but the extra data is raw bytes instead of text.
96/// Uses width-4 byte n-grams (no `text_collapse`) for the bytes path,
97/// and interleaves name/bytes SimHash digests in 4-byte chunks.
98fn soft_hash_meta_v0_with_bytes(name: &str, extra: &[u8]) -> Vec<u8> {
99    let name_simhash = meta_name_simhash(name);
100
101    if extra.is_empty() {
102        return name_simhash;
103    }
104
105    let byte_ngrams = simhash::sliding_window_bytes(extra, 4);
106    let byte_hashes: Vec<[u8; 32]> = byte_ngrams
107        .iter()
108        .map(|ng| *blake3::hash(ng).as_bytes())
109        .collect();
110    let byte_simhash = simhash::alg_simhash_inner(&byte_hashes);
111
112    interleave_digests(&name_simhash, &byte_simhash)
113}
114
115/// Decode a Data-URL's base64 payload.
116///
117/// Expects a string starting with `"data:"`. Splits on the first `,` and
118/// decodes the remainder as standard base64. Returns `InvalidInput` on
119/// missing comma or invalid base64.
120fn decode_data_url(data_url: &str) -> IsccResult<Vec<u8>> {
121    let payload_b64 = data_url
122        .split_once(',')
123        .map(|(_, b64)| b64)
124        .ok_or_else(|| IsccError::InvalidInput("Data-URL missing comma separator".into()))?;
125    data_encoding::BASE64
126        .decode(payload_b64.as_bytes())
127        .map_err(|e| IsccError::InvalidInput(format!("invalid base64 in Data-URL: {e}")))
128}
129
130/// Parse a meta string as JSON and re-serialize to RFC 8785 (JCS) canonical bytes.
131fn parse_meta_json(meta_str: &str) -> IsccResult<Vec<u8>> {
132    let parsed: serde_json::Value = serde_json::from_str(meta_str)
133        .map_err(|e| IsccError::InvalidInput(format!("invalid JSON in meta: {e}")))?;
134    let mut buf = Vec::new();
135    serde_json_canonicalizer::to_writer(&parsed, &mut buf)
136        .map_err(|e| IsccError::InvalidInput(format!("JSON canonicalization failed: {e}")))?;
137    Ok(buf)
138}
139
140/// Build a Data-URL from canonical JSON bytes.
141///
142/// Uses `application/ld+json` media type if the JSON has an `@context` key,
143/// otherwise `application/json`. Encodes payload as standard base64 with padding.
144fn build_meta_data_url(json_bytes: &[u8], json_value: &serde_json::Value) -> String {
145    let media_type = if json_value.get("@context").is_some() {
146        "application/ld+json"
147    } else {
148        "application/json"
149    };
150    let b64 = data_encoding::BASE64.encode(json_bytes);
151    format!("data:{media_type};base64,{b64}")
152}
153
154/// Generate a Meta-Code from name and optional metadata.
155///
156/// Produces an ISCC Meta-Code by hashing the provided name, description,
157/// and metadata fields using the SimHash algorithm. When `meta` is provided,
158/// it is treated as either a Data-URL (if starting with `"data:"`) or a JSON
159/// string, and the decoded/serialized bytes are used for similarity hashing
160/// and metahash computation.
161pub fn gen_meta_code_v0(
162    name: &str,
163    description: Option<&str>,
164    meta: Option<&str>,
165    bits: u32,
166) -> IsccResult<MetaCodeResult> {
167    // Normalize name: clean → remove newlines → trim to 128 bytes
168    let name = utils::text_clean(name);
169    let name = utils::text_remove_newlines(&name);
170    let name = utils::text_trim(&name, 128);
171
172    if name.is_empty() {
173        return Err(IsccError::InvalidInput(
174            "name is empty after normalization".into(),
175        ));
176    }
177
178    // Normalize description: clean → trim to 4096 bytes
179    let desc_str = description.unwrap_or("");
180    let desc_clean = utils::text_clean(desc_str);
181    let desc_clean = utils::text_trim(&desc_clean, 4096);
182
183    // Resolve meta payload bytes (if meta is provided)
184    let meta_payload: Option<Vec<u8>> = match meta {
185        Some(meta_str) if meta_str.starts_with("data:") => Some(decode_data_url(meta_str)?),
186        Some(meta_str) => Some(parse_meta_json(meta_str)?),
187        None => None,
188    };
189
190    // Branch: meta bytes path vs. description text path
191    if let Some(ref payload) = meta_payload {
192        let meta_code_digest = soft_hash_meta_v0_with_bytes(&name, payload);
193        let metahash = utils::multi_hash_blake3(payload);
194
195        let meta_code = codec::encode_component(
196            codec::MainType::Meta,
197            codec::SubType::None,
198            codec::Version::V0,
199            bits,
200            &meta_code_digest,
201        )?;
202
203        // Build the meta Data-URL for the result
204        let meta_value = match meta {
205            Some(meta_str) if meta_str.starts_with("data:") => meta_str.to_string(),
206            Some(meta_str) => {
207                let parsed: serde_json::Value = serde_json::from_str(meta_str)
208                    .map_err(|e| IsccError::InvalidInput(format!("invalid JSON: {e}")))?;
209                build_meta_data_url(payload, &parsed)
210            }
211            None => unreachable!(),
212        };
213
214        Ok(MetaCodeResult {
215            iscc: format!("ISCC:{meta_code}"),
216            name: name.clone(),
217            description: if desc_clean.is_empty() {
218                None
219            } else {
220                Some(desc_clean)
221            },
222            meta: Some(meta_value),
223            metahash,
224        })
225    } else {
226        // Compute metahash from normalized text payload
227        let payload = if desc_clean.is_empty() {
228            name.clone()
229        } else {
230            format!("{name} {desc_clean}")
231        };
232        let payload = payload.trim().to_string();
233        let metahash = utils::multi_hash_blake3(payload.as_bytes());
234
235        // Compute similarity digest
236        let extra = if desc_clean.is_empty() {
237            None
238        } else {
239            Some(desc_clean.as_str())
240        };
241        let meta_code_digest = soft_hash_meta_v0(&name, extra);
242
243        let meta_code = codec::encode_component(
244            codec::MainType::Meta,
245            codec::SubType::None,
246            codec::Version::V0,
247            bits,
248            &meta_code_digest,
249        )?;
250
251        Ok(MetaCodeResult {
252            iscc: format!("ISCC:{meta_code}"),
253            name: name.clone(),
254            description: if desc_clean.is_empty() {
255                None
256            } else {
257                Some(desc_clean)
258            },
259            meta: None,
260            metahash,
261        })
262    }
263}
264
265/// Compute a 256-bit similarity-preserving hash from collapsed text.
266///
267/// Generates character n-grams with a sliding window of width 13,
268/// hashes each with xxh32, then applies MinHash to produce a 32-byte digest.
269fn soft_hash_text_v0(text: &str) -> Vec<u8> {
270    let ngrams = simhash::sliding_window_strs(text, 13);
271    let features: Vec<u32> = ngrams
272        .iter()
273        .map(|ng| xxhash_rust::xxh32::xxh32(ng.as_bytes(), 0))
274        .collect();
275    minhash::alg_minhash_256(&features)
276}
277
278/// Generate a Text-Code from plain text content.
279///
280/// Produces an ISCC Content-Code for text by collapsing the input,
281/// extracting character n-gram features, and applying MinHash to
282/// create a similarity-preserving fingerprint.
283pub fn gen_text_code_v0(text: &str, bits: u32) -> IsccResult<TextCodeResult> {
284    let collapsed = utils::text_collapse(text);
285    let characters = collapsed.chars().count();
286    let hash_digest = soft_hash_text_v0(&collapsed);
287    let component = codec::encode_component(
288        codec::MainType::Content,
289        codec::SubType::TEXT,
290        codec::Version::V0,
291        bits,
292        &hash_digest,
293    )?;
294    Ok(TextCodeResult {
295        iscc: format!("ISCC:{component}"),
296        characters,
297    })
298}
299
/// Transpose a row-major matrix stored as a slice of rows.
///
/// The output has as many rows as the first input row has columns, and as
/// many columns as the input has rows. An empty input gives an empty
/// output. Rows are expected to share the length of the first row: a
/// shorter row leaves zeros in the output, a longer one panics.
fn transpose_matrix(matrix: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let width = match matrix.first() {
        Some(first_row) => first_row.len(),
        None => return Vec::new(),
    };
    let height = matrix.len();

    let mut transposed = vec![vec![0.0f64; height]; width];
    for (row_idx, row) in matrix.iter().enumerate() {
        for (col_idx, &value) in row.iter().enumerate() {
            transposed[col_idx][row_idx] = value;
        }
    }
    transposed
}
315
/// Copy an 8×8 window out of a matrix into a flat, row-major vector.
///
/// The window's top-left corner is `matrix[row][col]`. Up to 8 rows and up
/// to 8 values per row are taken; if the matrix ends before the window
/// does, the result simply holds fewer than 64 values.
fn flatten_8x8(matrix: &[Vec<f64>], col: usize, row: usize) -> Vec<f64> {
    matrix
        .iter()
        .skip(row)
        .take(8)
        .flat_map(|line| line.iter().skip(col).take(8).copied())
        .collect()
}
329
/// Compute the median of a slice of f64 values.
///
/// The values are copied and sorted; for an odd count the middle element
/// is returned, for an even count the mean of the two middle elements
/// (the same rule as Python's `statistics.median`).
///
/// Sorting uses `f64::total_cmp`, so a NaN in the input no longer causes a
/// panic; NaN values are simply ordered by the IEEE 754 total order.
///
/// # Panics
///
/// Panics if `values` is empty.
fn compute_median(values: &[f64]) -> f64 {
    let mut sorted: Vec<f64> = values.to_vec();
    // `partial_cmp(..).unwrap()` would panic on NaN; the total order cannot.
    sorted.sort_by(f64::total_cmp);
    let mid = sorted.len() / 2;
    if sorted.len() % 2 == 1 {
        sorted[mid]
    } else {
        (sorted[mid - 1] + sorted[mid]) / 2.0
    }
}
344
/// Pack booleans into bytes, most significant bit first.
///
/// Every group of up to 8 booleans becomes one byte; the first boolean of
/// a group maps to bit 7. A trailing group with fewer than 8 entries
/// leaves its low bits at zero.
fn bits_to_bytes(bits: &[bool]) -> Vec<u8> {
    bits.chunks(8)
        .map(|group| {
            group
                .iter()
                .enumerate()
                .fold(0u8, |byte, (pos, &set)| byte | (u8::from(set) << (7 - pos)))
        })
        .collect()
}
359
360/// Compute a DCT-based perceptual hash from 32×32 grayscale pixels.
361///
362/// Applies a 2D DCT to the pixel matrix, extracts four 8×8 low-frequency
363/// blocks, and generates a bitstring by comparing each coefficient against
364/// the block median. Returns up to `bits` bits as a byte vector.
365fn soft_hash_image_v0(pixels: &[u8], bits: u32) -> IsccResult<Vec<u8>> {
366    if pixels.len() != 1024 {
367        return Err(IsccError::InvalidInput(format!(
368            "expected 1024 pixels, got {}",
369            pixels.len()
370        )));
371    }
372    if bits > 256 {
373        return Err(IsccError::InvalidInput(format!(
374            "bits must be <= 256, got {bits}"
375        )));
376    }
377
378    // Step 1: Row-wise DCT (32 rows of 32 pixels)
379    let rows: Vec<Vec<f64>> = pixels
380        .chunks(32)
381        .map(|row| {
382            let row_f64: Vec<f64> = row.iter().map(|&p| p as f64).collect();
383            dct::alg_dct(&row_f64)
384        })
385        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
386
387    // Step 2: Transpose
388    let transposed = transpose_matrix(&rows);
389
390    // Step 3: Column-wise DCT
391    let dct_cols: Vec<Vec<f64>> = transposed
392        .iter()
393        .map(|col| dct::alg_dct(col))
394        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
395
396    // Step 4: Transpose back → dct_matrix
397    let dct_matrix = transpose_matrix(&dct_cols);
398
399    // Step 5: Extract 8×8 blocks at positions (0,0), (1,0), (0,1), (1,1)
400    let positions = [(0, 0), (1, 0), (0, 1), (1, 1)];
401    let mut bitstring = Vec::<bool>::with_capacity(256);
402
403    for (col, row) in positions {
404        let flat = flatten_8x8(&dct_matrix, col, row);
405        let median = compute_median(&flat);
406        for val in &flat {
407            bitstring.push(*val > median);
408        }
409        if bitstring.len() >= bits as usize {
410            break;
411        }
412    }
413
414    // Step 6: Convert first `bits` bools to bytes
415    Ok(bits_to_bytes(&bitstring[..bits as usize]))
416}
417
418/// Generate an Image-Code from pixel data.
419///
420/// Produces an ISCC Content-Code for images from a sequence of 1024
421/// grayscale pixel values (32×32, values 0-255) using a DCT-based
422/// perceptual hash.
423pub fn gen_image_code_v0(pixels: &[u8], bits: u32) -> IsccResult<ImageCodeResult> {
424    let hash_digest = soft_hash_image_v0(pixels, bits)?;
425    let component = codec::encode_component(
426        codec::MainType::Content,
427        codec::SubType::Image,
428        codec::Version::V0,
429        bits,
430        &hash_digest,
431    )?;
432    Ok(ImageCodeResult {
433        iscc: format!("ISCC:{component}"),
434    })
435}
436
/// Split a slice into `n` consecutive parts of near-equal size.
///
/// Every part holds `len / n` elements and the first `len % n` parts hold
/// one more, in the manner of `numpy.array_split`. When `n` exceeds the
/// slice length the surplus parts are empty. `n == 0` yields no parts.
fn array_split<T>(slice: &[T], n: usize) -> Vec<&[T]> {
    if n == 0 {
        return Vec::new();
    }
    let base = slice.len() / n;
    let extra = slice.len() % n;

    let mut rest = slice;
    (0..n)
        .map(|index| {
            let size = base + usize::from(index < extra);
            let (part, tail) = rest.split_at(size);
            rest = tail;
            part
        })
        .collect()
}
458
459/// Compute a multi-stage SimHash digest from Chromaprint features.
460///
461/// Builds a 32-byte digest by concatenating 4-byte SimHash chunks:
462/// - Stage 1: overall SimHash of all features (4 bytes)
463/// - Stage 2: SimHash of each quarter of features (4 × 4 = 16 bytes)
464/// - Stage 3: SimHash of each third of sorted features (3 × 4 = 12 bytes)
465fn soft_hash_audio_v0(cv: &[i32]) -> Vec<u8> {
466    // Convert each i32 to 4-byte big-endian digest
467    let digests: Vec<[u8; 4]> = cv.iter().map(|&v| v.to_be_bytes()).collect();
468
469    if digests.is_empty() {
470        return vec![0u8; 32];
471    }
472
473    // Stage 1: overall SimHash (4 bytes)
474    let mut parts: Vec<u8> = simhash::alg_simhash_inner(&digests);
475
476    // Stage 2: quarter-based SimHashes (4 × 4 = 16 bytes)
477    let quarters = array_split(&digests, 4);
478    for quarter in &quarters {
479        if quarter.is_empty() {
480            parts.extend_from_slice(&[0u8; 4]);
481        } else {
482            parts.extend_from_slice(&simhash::alg_simhash_inner(quarter));
483        }
484    }
485
486    // Stage 3: sorted-third-based SimHashes (3 × 4 = 12 bytes)
487    let mut sorted_values: Vec<i32> = cv.to_vec();
488    sorted_values.sort();
489    let sorted_digests: Vec<[u8; 4]> = sorted_values.iter().map(|&v| v.to_be_bytes()).collect();
490    let thirds = array_split(&sorted_digests, 3);
491    for third in &thirds {
492        if third.is_empty() {
493            parts.extend_from_slice(&[0u8; 4]);
494        } else {
495            parts.extend_from_slice(&simhash::alg_simhash_inner(third));
496        }
497    }
498
499    parts
500}
501
502/// Generate an Audio-Code from a Chromaprint feature vector.
503///
504/// Produces an ISCC Content-Code for audio from a Chromaprint signed
505/// integer fingerprint vector using multi-stage SimHash.
506pub fn gen_audio_code_v0(cv: &[i32], bits: u32) -> IsccResult<AudioCodeResult> {
507    let hash_digest = soft_hash_audio_v0(cv);
508    let component = codec::encode_component(
509        codec::MainType::Content,
510        codec::SubType::Audio,
511        codec::Version::V0,
512        bits,
513        &hash_digest,
514    )?;
515    Ok(AudioCodeResult {
516        iscc: format!("ISCC:{component}"),
517    })
518}
519
520/// Compute a similarity-preserving hash from video frame signatures.
521///
522/// Deduplicates frame signatures, computes column-wise sums across all
523/// unique frames, then applies WTA-Hash to produce a digest of `bits/8` bytes.
524pub fn soft_hash_video_v0<S: AsRef<[i32]> + Ord>(
525    frame_sigs: &[S],
526    bits: u32,
527) -> IsccResult<Vec<u8>> {
528    if frame_sigs.is_empty() {
529        return Err(IsccError::InvalidInput(
530            "frame_sigs must not be empty".into(),
531        ));
532    }
533
534    // Deduplicate using BTreeSet (S: Ord)
535    let unique: std::collections::BTreeSet<&S> = frame_sigs.iter().collect();
536
537    // Column-wise sum into i64 to avoid overflow
538    let cols = frame_sigs[0].as_ref().len();
539    let mut vecsum = vec![0i64; cols];
540    for sig in &unique {
541        for (c, &val) in sig.as_ref().iter().enumerate() {
542            vecsum[c] += val as i64;
543        }
544    }
545
546    wtahash::alg_wtahash(&vecsum, bits)
547}
548
549/// Generate a Video-Code from frame signature data.
550///
551/// Produces an ISCC Content-Code for video from a sequence of MPEG-7 frame
552/// signatures. Each frame signature is a 380-element integer vector.
553pub fn gen_video_code_v0<S: AsRef<[i32]> + Ord>(
554    frame_sigs: &[S],
555    bits: u32,
556) -> IsccResult<VideoCodeResult> {
557    let digest = soft_hash_video_v0(frame_sigs, bits)?;
558    let component = codec::encode_component(
559        codec::MainType::Content,
560        codec::SubType::Video,
561        codec::Version::V0,
562        bits,
563        &digest,
564    )?;
565    Ok(VideoCodeResult {
566        iscc: format!("ISCC:{component}"),
567    })
568}
569
570/// Combine multiple Content-Code digests into a single similarity hash.
571///
572/// Takes raw decoded ISCC bytes (header + body) for each Content-Code and
573/// produces a SimHash digest. Each input is trimmed to `bits/8` bytes by
574/// keeping the first header byte (encodes type info) plus `nbytes-1` body bytes.
575/// Requires at least 2 codes, all of MainType::Content.
576fn soft_hash_codes_v0(cc_digests: &[Vec<u8>], bits: u32) -> IsccResult<Vec<u8>> {
577    if cc_digests.len() < 2 {
578        return Err(IsccError::InvalidInput(
579            "at least 2 Content-Codes required for mixing".into(),
580        ));
581    }
582
583    let nbytes = (bits / 8) as usize;
584    let mut prepared: Vec<Vec<u8>> = Vec::with_capacity(cc_digests.len());
585
586    for raw in cc_digests {
587        let (mtype, stype, _ver, blen, body) = codec::decode_header(raw)?;
588        if mtype != codec::MainType::Content {
589            return Err(IsccError::InvalidInput(
590                "all codes must be Content-Codes".into(),
591            ));
592        }
593        let unit_bits = codec::decode_length(mtype, blen, stype);
594        if unit_bits < bits {
595            return Err(IsccError::InvalidInput(format!(
596                "Content-Code too short for {bits}-bit length (has {unit_bits} bits)"
597            )));
598        }
599        let mut entry = Vec::with_capacity(nbytes);
600        entry.push(raw[0]); // first byte preserves type info
601        let take = std::cmp::min(nbytes - 1, body.len());
602        entry.extend_from_slice(&body[..take]);
603        // Pad with zeros if body is shorter than nbytes-1
604        while entry.len() < nbytes {
605            entry.push(0);
606        }
607        prepared.push(entry);
608    }
609
610    Ok(simhash::alg_simhash_inner(&prepared))
611}
612
613/// Generate a Mixed-Code from multiple Content-Code strings.
614///
615/// Produces a Mixed Content-Code by combining multiple ISCC Content-Codes
616/// of different types (text, image, audio, video) using SimHash. Input codes
617/// may optionally include the "ISCC:" prefix.
618pub fn gen_mixed_code_v0(codes: &[&str], bits: u32) -> IsccResult<MixedCodeResult> {
619    let decoded: Vec<Vec<u8>> = codes
620        .iter()
621        .map(|code| {
622            let clean = code.strip_prefix("ISCC:").unwrap_or(code);
623            codec::decode_base32(clean)
624        })
625        .collect::<IsccResult<Vec<Vec<u8>>>>()?;
626
627    let digest = soft_hash_codes_v0(&decoded, bits)?;
628
629    let component = codec::encode_component(
630        codec::MainType::Content,
631        codec::SubType::Mixed,
632        codec::Version::V0,
633        bits,
634        &digest,
635    )?;
636
637    Ok(MixedCodeResult {
638        iscc: format!("ISCC:{component}"),
639        parts: codes.iter().map(|s| s.to_string()).collect(),
640    })
641}
642
643/// Generate a Data-Code from raw byte data.
644///
645/// Produces an ISCC Data-Code by splitting data into content-defined chunks,
646/// hashing each chunk with xxh32, and applying MinHash to create a
647/// similarity-preserving fingerprint.
648pub fn gen_data_code_v0(data: &[u8], bits: u32) -> IsccResult<DataCodeResult> {
649    let chunks = cdc::alg_cdc_chunks(data, false, cdc::DATA_AVG_CHUNK_SIZE);
650    let mut features: Vec<u32> = chunks
651        .iter()
652        .map(|chunk| xxhash_rust::xxh32::xxh32(chunk, 0))
653        .collect();
654
655    // Defensive: ensure at least one feature (alg_cdc_chunks guarantees >= 1 chunk)
656    if features.is_empty() {
657        features.push(xxhash_rust::xxh32::xxh32(b"", 0));
658    }
659
660    let digest = minhash::alg_minhash_256(&features);
661    let component = codec::encode_component(
662        codec::MainType::Data,
663        codec::SubType::None,
664        codec::Version::V0,
665        bits,
666        &digest,
667    )?;
668
669    Ok(DataCodeResult {
670        iscc: format!("ISCC:{component}"),
671    })
672}
673
674/// Generate an Instance-Code from raw byte data.
675///
676/// Produces an ISCC Instance-Code by hashing the complete byte stream
677/// with BLAKE3. Captures the exact binary identity of the data.
678pub fn gen_instance_code_v0(data: &[u8], bits: u32) -> IsccResult<InstanceCodeResult> {
679    let digest = blake3::hash(data);
680    let datahash = utils::multi_hash_blake3(data);
681    let filesize = data.len() as u64;
682    let component = codec::encode_component(
683        codec::MainType::Instance,
684        codec::SubType::None,
685        codec::Version::V0,
686        bits,
687        digest.as_bytes(),
688    )?;
689    Ok(InstanceCodeResult {
690        iscc: format!("ISCC:{component}"),
691        datahash,
692        filesize,
693    })
694}
695
696/// Generate a composite ISCC-CODE from individual ISCC unit codes.
697///
698/// Combines multiple ISCC unit codes (Meta-Code, Content-Code, Data-Code,
699/// Instance-Code) into a single composite ISCC-CODE. Input codes may
700/// optionally include the "ISCC:" prefix. At least Data-Code and
701/// Instance-Code are required. When `wide` is true and exactly two
702/// 128-bit+ codes (Data + Instance) are provided, produces a 256-bit
703/// wide-mode code.
704pub fn gen_iscc_code_v0(codes: &[&str], wide: bool) -> IsccResult<IsccCodeResult> {
705    // Step 1: Clean inputs — strip "ISCC:" prefix
706    let cleaned: Vec<&str> = codes
707        .iter()
708        .map(|c| c.strip_prefix("ISCC:").unwrap_or(c))
709        .collect();
710
711    // Step 2: Validate minimum count
712    if cleaned.len() < 2 {
713        return Err(IsccError::InvalidInput(
714            "at least 2 ISCC unit codes required".into(),
715        ));
716    }
717
718    // Step 3: Validate minimum length (16 base32 chars = 64-bit minimum)
719    for code in &cleaned {
720        if code.len() < 16 {
721            return Err(IsccError::InvalidInput(format!(
722                "ISCC unit code too short (min 16 chars): {code}"
723            )));
724        }
725    }
726
727    // Step 4: Decode each code
728    let mut decoded: Vec<(
729        codec::MainType,
730        codec::SubType,
731        codec::Version,
732        u32,
733        Vec<u8>,
734    )> = Vec::with_capacity(cleaned.len());
735    for code in &cleaned {
736        let raw = codec::decode_base32(code)?;
737        let header = codec::decode_header(&raw)?;
738        decoded.push(header);
739    }
740
741    // Step 5: Sort by MainType (ascending)
742    decoded.sort_by_key(|&(mt, ..)| mt);
743
744    // Step 6: Extract main_types
745    let main_types: Vec<codec::MainType> = decoded.iter().map(|&(mt, ..)| mt).collect();
746
747    // Step 7: Validate last two are Data + Instance (mandatory)
748    let n = main_types.len();
749    if main_types[n - 2] != codec::MainType::Data || main_types[n - 1] != codec::MainType::Instance
750    {
751        return Err(IsccError::InvalidInput(
752            "Data-Code and Instance-Code are mandatory".into(),
753        ));
754    }
755
756    // Step 8: Determine wide composite
757    let is_wide = wide
758        && decoded.len() == 2
759        && main_types == [codec::MainType::Data, codec::MainType::Instance]
760        && decoded
761            .iter()
762            .all(|&(mt, st, _, len, _)| codec::decode_length(mt, len, st) >= 128);
763
764    // Step 9: Determine SubType
765    let st = if is_wide {
766        codec::SubType::Wide
767    } else {
768        // Collect SubTypes of Semantic/Content units
769        let sc_subtypes: Vec<codec::SubType> = decoded
770            .iter()
771            .filter(|&&(mt, ..)| mt == codec::MainType::Semantic || mt == codec::MainType::Content)
772            .map(|&(_, st, ..)| st)
773            .collect();
774
775        if !sc_subtypes.is_empty() {
776            // All must be the same
777            let first = sc_subtypes[0];
778            if sc_subtypes.iter().all(|&s| s == first) {
779                first
780            } else {
781                return Err(IsccError::InvalidInput(
782                    "mixed SubTypes among Content/Semantic units".into(),
783                ));
784            }
785        } else if decoded.len() == 2 {
786            codec::SubType::Sum
787        } else {
788            codec::SubType::IsccNone
789        }
790    };
791
792    // Step 10–11: Get optional MainTypes and encode
793    let optional_types = &main_types[..n - 2];
794    let encoded_length = codec::encode_units(optional_types)?;
795
796    // Step 12: Build digest body
797    let bytes_per_unit = if is_wide { 16 } else { 8 };
798    let mut digest = Vec::with_capacity(decoded.len() * bytes_per_unit);
799    for (_, _, _, _, tail) in &decoded {
800        let take = bytes_per_unit.min(tail.len());
801        digest.extend_from_slice(&tail[..take]);
802    }
803
804    // Step 13–14: Encode header + digest as base32
805    let header = codec::encode_header(
806        codec::MainType::Iscc,
807        st,
808        codec::Version::V0,
809        encoded_length,
810    )?;
811    let mut code_bytes = header;
812    code_bytes.extend_from_slice(&digest);
813    let code = codec::encode_base32(&code_bytes);
814
815    // Step 15: Return with prefix
816    Ok(IsccCodeResult {
817        iscc: format!("ISCC:{code}"),
818    })
819}
820
821#[cfg(test)]
822mod tests {
823    use super::*;
824
825    #[test]
826    fn test_gen_meta_code_v0_title_only() {
827        let result = gen_meta_code_v0("Die Unendliche Geschichte", None, None, 64).unwrap();
828        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU74YAZIM");
829        assert_eq!(result.name, "Die Unendliche Geschichte");
830        assert_eq!(result.description, None);
831        assert_eq!(result.meta, None);
832    }
833
834    #[test]
835    fn test_gen_meta_code_v0_title_description() {
836        let result = gen_meta_code_v0(
837            "Die Unendliche Geschichte",
838            Some("Von Michael Ende"),
839            None,
840            64,
841        )
842        .unwrap();
843        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU4E45RB5");
844        assert_eq!(result.name, "Die Unendliche Geschichte");
845        assert_eq!(result.description, Some("Von Michael Ende".to_string()));
846        assert_eq!(result.meta, None);
847    }
848
849    #[test]
850    fn test_gen_meta_code_v0_json_meta() {
851        let result = gen_meta_code_v0("Hello", None, Some(r#"{"some":"object"}"#), 64).unwrap();
852        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN63LHL2");
853        assert!(result.meta.is_some());
854        assert!(
855            result
856                .meta
857                .unwrap()
858                .starts_with("data:application/json;base64,")
859        );
860    }
861
862    #[test]
863    fn test_gen_meta_code_v0_data_url_meta() {
864        let result = gen_meta_code_v0(
865            "Hello",
866            None,
867            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9"),
868            64,
869        )
870        .unwrap();
871        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN43ICP2");
872        // Data-URL is passed through as-is
873        assert_eq!(
874            result.meta,
875            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9".to_string())
876        );
877    }
878
879    /// Verify that JSON metadata with float values is canonicalized per RFC 8785 (JCS).
880    ///
881    /// JCS serializes `1.0` as `1` (integer form), while `serde_json` preserves `1.0`.
882    /// This causes different canonical bytes, different metahash, and different ISCC codes.
883    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1.0})`.
884    #[test]
885    fn test_gen_meta_code_v0_jcs_float_canonicalization() {
886        // JCS canonicalizes {"value": 1.0} → {"value":1} (integer form)
887        // serde_json produces {"value":1.0} (preserves float notation)
888        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1.0}"#), 64).unwrap();
889
890        // Expected values from iscc-core (Python) using jcs.canonicalize()
891        assert_eq!(
892            result.iscc, "ISCC:AAAX4GX3RZH2I6QZ",
893            "ISCC mismatch: parse_meta_json must use RFC 8785 (JCS) canonicalization"
894        );
895        assert_eq!(
896            result.meta,
897            Some("data:application/json;base64,eyJ2YWx1ZSI6MX0=".to_string()),
898            "meta Data-URL mismatch: JCS should serialize 1.0 as 1"
899        );
900        assert_eq!(
901            result.metahash, "1e2010b291d392b6999ffe4aa4661fb343fc371fca3bfb5bb4e8d8226fdf85743232",
902            "metahash mismatch: canonical bytes differ between JCS and serde_json"
903        );
904    }
905
906    /// Verify JCS number formatting for large floats (scientific notation edge case).
907    ///
908    /// JCS serializes `1e20` as `100000000000000000000` (expanded integer form).
909    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1e20})`.
910    #[test]
911    fn test_gen_meta_code_v0_jcs_large_float_canonicalization() {
912        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1e20}"#), 64).unwrap();
913
914        assert_eq!(
915            result.iscc, "ISCC:AAAX4GX3R32YH5P7",
916            "ISCC mismatch: JCS should expand 1e20 to 100000000000000000000"
917        );
918        assert_eq!(
919            result.meta,
920            Some(
921                "data:application/json;base64,eyJ2YWx1ZSI6MTAwMDAwMDAwMDAwMDAwMDAwMDAwfQ=="
922                    .to_string()
923            ),
924            "meta Data-URL mismatch: JCS should expand large float to integer form"
925        );
926        assert_eq!(
927            result.metahash, "1e201ff83c1822c348717658a0b4713739646da7c59832691b337a457416ddd1c73d",
928            "metahash mismatch: canonical bytes differ for large float"
929        );
930    }
931
932    #[test]
933    fn test_gen_meta_code_v0_invalid_json() {
934        assert!(matches!(
935            gen_meta_code_v0("test", None, Some("not json"), 64),
936            Err(IsccError::InvalidInput(_))
937        ));
938    }
939
940    #[test]
941    fn test_gen_meta_code_v0_invalid_data_url() {
942        assert!(matches!(
943            gen_meta_code_v0("test", None, Some("data:no-comma-here"), 64),
944            Err(IsccError::InvalidInput(_))
945        ));
946    }
947
948    #[test]
949    fn test_gen_meta_code_v0_conformance() {
950        let json_str = include_str!("../tests/data.json");
951        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
952        let section = &data["gen_meta_code_v0"];
953        let cases = section.as_object().unwrap();
954
955        let mut tested = 0;
956
957        for (tc_name, tc) in cases {
958            let inputs = tc["inputs"].as_array().unwrap();
959            let input_name = inputs[0].as_str().unwrap();
960            let input_desc = inputs[1].as_str().unwrap();
961            let meta_val = &inputs[2];
962            let bits = inputs[3].as_u64().unwrap() as u32;
963
964            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
965            let expected_metahash = tc["outputs"]["metahash"].as_str().unwrap();
966
967            // Dispatch meta parameter based on JSON value type
968            let meta_arg: Option<String> = match meta_val {
969                serde_json::Value::Null => None,
970                serde_json::Value::String(s) => Some(s.clone()),
971                serde_json::Value::Object(_) => Some(serde_json::to_string(meta_val).unwrap()),
972                other => panic!("unexpected meta type in {tc_name}: {other:?}"),
973            };
974
975            let desc = if input_desc.is_empty() {
976                None
977            } else {
978                Some(input_desc)
979            };
980
981            // Verify ISCC output from struct
982            let result = gen_meta_code_v0(input_name, desc, meta_arg.as_deref(), bits)
983                .unwrap_or_else(|e| panic!("gen_meta_code_v0 failed for {tc_name}: {e}"));
984            assert_eq!(
985                result.iscc, expected_iscc,
986                "ISCC mismatch in test case {tc_name}"
987            );
988
989            // Verify metahash from struct
990            assert_eq!(
991                result.metahash, expected_metahash,
992                "metahash mismatch in test case {tc_name}"
993            );
994
995            // Verify name from struct
996            if let Some(expected_name) = tc["outputs"].get("name") {
997                let expected_name = expected_name.as_str().unwrap();
998                assert_eq!(
999                    result.name, expected_name,
1000                    "name mismatch in test case {tc_name}"
1001                );
1002            }
1003
1004            // Verify description from struct
1005            if let Some(expected_desc) = tc["outputs"].get("description") {
1006                let expected_desc = expected_desc.as_str().unwrap();
1007                assert_eq!(
1008                    result.description.as_deref(),
1009                    Some(expected_desc),
1010                    "description mismatch in test case {tc_name}"
1011                );
1012            }
1013
1014            // Verify meta from struct
1015            if meta_arg.is_some() {
1016                assert!(
1017                    result.meta.is_some(),
1018                    "meta should be present in test case {tc_name}"
1019                );
1020            } else {
1021                assert!(
1022                    result.meta.is_none(),
1023                    "meta should be absent in test case {tc_name}"
1024                );
1025            }
1026
1027            tested += 1;
1028        }
1029
1030        assert_eq!(tested, 16, "expected 16 conformance tests to run");
1031    }
1032
1033    #[test]
1034    fn test_gen_text_code_v0_empty() {
1035        let result = gen_text_code_v0("", 64).unwrap();
1036        assert_eq!(result.iscc, "ISCC:EAASL4F2WZY7KBXB");
1037        assert_eq!(result.characters, 0);
1038    }
1039
1040    #[test]
1041    fn test_gen_text_code_v0_hello_world() {
1042        let result = gen_text_code_v0("Hello World", 64).unwrap();
1043        assert_eq!(result.iscc, "ISCC:EAASKDNZNYGUUF5A");
1044        assert_eq!(result.characters, 10); // "helloworld" after collapse
1045    }
1046
1047    #[test]
1048    fn test_gen_text_code_v0_conformance() {
1049        let json_str = include_str!("../tests/data.json");
1050        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1051        let section = &data["gen_text_code_v0"];
1052        let cases = section.as_object().unwrap();
1053
1054        let mut tested = 0;
1055
1056        for (tc_name, tc) in cases {
1057            let inputs = tc["inputs"].as_array().unwrap();
1058            let input_text = inputs[0].as_str().unwrap();
1059            let bits = inputs[1].as_u64().unwrap() as u32;
1060
1061            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1062            let expected_chars = tc["outputs"]["characters"].as_u64().unwrap() as usize;
1063
1064            // Verify ISCC output from struct
1065            let result = gen_text_code_v0(input_text, bits)
1066                .unwrap_or_else(|e| panic!("gen_text_code_v0 failed for {tc_name}: {e}"));
1067            assert_eq!(
1068                result.iscc, expected_iscc,
1069                "ISCC mismatch in test case {tc_name}"
1070            );
1071
1072            // Verify character count from struct
1073            assert_eq!(
1074                result.characters, expected_chars,
1075                "character count mismatch in test case {tc_name}"
1076            );
1077
1078            tested += 1;
1079        }
1080
1081        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1082    }
1083
1084    #[test]
1085    fn test_gen_image_code_v0_all_black() {
1086        let pixels = vec![0u8; 1024];
1087        let result = gen_image_code_v0(&pixels, 64).unwrap();
1088        assert_eq!(result.iscc, "ISCC:EEAQAAAAAAAAAAAA");
1089    }
1090
1091    #[test]
1092    fn test_gen_image_code_v0_all_white() {
1093        let pixels = vec![255u8; 1024];
1094        let result = gen_image_code_v0(&pixels, 128).unwrap();
1095        assert_eq!(result.iscc, "ISCC:EEBYAAAAAAAAAAAAAAAAAAAAAAAAA");
1096    }
1097
1098    #[test]
1099    fn test_gen_image_code_v0_invalid_pixel_count() {
1100        assert!(gen_image_code_v0(&[0u8; 100], 64).is_err());
1101    }
1102
1103    #[test]
1104    fn test_gen_image_code_v0_conformance() {
1105        let json_str = include_str!("../tests/data.json");
1106        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1107        let section = &data["gen_image_code_v0"];
1108        let cases = section.as_object().unwrap();
1109
1110        let mut tested = 0;
1111
1112        for (tc_name, tc) in cases {
1113            let inputs = tc["inputs"].as_array().unwrap();
1114            let pixels_json = inputs[0].as_array().unwrap();
1115            let bits = inputs[1].as_u64().unwrap() as u32;
1116            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1117
1118            let pixels: Vec<u8> = pixels_json
1119                .iter()
1120                .map(|v| v.as_u64().unwrap() as u8)
1121                .collect();
1122
1123            let result = gen_image_code_v0(&pixels, bits)
1124                .unwrap_or_else(|e| panic!("gen_image_code_v0 failed for {tc_name}: {e}"));
1125            assert_eq!(
1126                result.iscc, expected_iscc,
1127                "ISCC mismatch in test case {tc_name}"
1128            );
1129
1130            tested += 1;
1131        }
1132
1133        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1134    }
1135
1136    #[test]
1137    fn test_gen_audio_code_v0_empty() {
1138        let result = gen_audio_code_v0(&[], 64).unwrap();
1139        assert_eq!(result.iscc, "ISCC:EIAQAAAAAAAAAAAA");
1140    }
1141
1142    #[test]
1143    fn test_gen_audio_code_v0_single() {
1144        let result = gen_audio_code_v0(&[1], 128).unwrap();
1145        assert_eq!(result.iscc, "ISCC:EIBQAAAAAEAAAAABAAAAAAAAAAAAA");
1146    }
1147
1148    #[test]
1149    fn test_gen_audio_code_v0_negative() {
1150        let result = gen_audio_code_v0(&[-1, 0, 1], 256).unwrap();
1151        assert_eq!(
1152            result.iscc,
1153            "ISCC:EIDQAAAAAH777777AAAAAAAAAAAACAAAAAAP777774AAAAAAAAAAAAI"
1154        );
1155    }
1156
1157    #[test]
1158    fn test_gen_audio_code_v0_conformance() {
1159        let json_str = include_str!("../tests/data.json");
1160        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1161        let section = &data["gen_audio_code_v0"];
1162        let cases = section.as_object().unwrap();
1163
1164        let mut tested = 0;
1165
1166        for (tc_name, tc) in cases {
1167            let inputs = tc["inputs"].as_array().unwrap();
1168            let cv_json = inputs[0].as_array().unwrap();
1169            let bits = inputs[1].as_u64().unwrap() as u32;
1170            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1171
1172            let cv: Vec<i32> = cv_json.iter().map(|v| v.as_i64().unwrap() as i32).collect();
1173
1174            let result = gen_audio_code_v0(&cv, bits)
1175                .unwrap_or_else(|e| panic!("gen_audio_code_v0 failed for {tc_name}: {e}"));
1176            assert_eq!(
1177                result.iscc, expected_iscc,
1178                "ISCC mismatch in test case {tc_name}"
1179            );
1180
1181            tested += 1;
1182        }
1183
1184        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1185    }
1186
1187    #[test]
1188    fn test_array_split_even() {
1189        let data = vec![1, 2, 3, 4];
1190        let parts = array_split(&data, 4);
1191        assert_eq!(parts, vec![&[1][..], &[2][..], &[3][..], &[4][..]]);
1192    }
1193
1194    #[test]
1195    fn test_array_split_remainder() {
1196        let data = vec![1, 2, 3, 4, 5];
1197        let parts = array_split(&data, 3);
1198        assert_eq!(parts, vec![&[1, 2][..], &[3, 4][..], &[5][..]]);
1199    }
1200
1201    #[test]
1202    fn test_array_split_more_parts_than_elements() {
1203        let data = vec![1, 2];
1204        let parts = array_split(&data, 4);
1205        assert_eq!(
1206            parts,
1207            vec![&[1][..], &[2][..], &[][..] as &[i32], &[][..] as &[i32]]
1208        );
1209    }
1210
1211    #[test]
1212    fn test_array_split_empty() {
1213        let data: Vec<i32> = vec![];
1214        let parts = array_split(&data, 3);
1215        assert_eq!(
1216            parts,
1217            vec![&[][..] as &[i32], &[][..] as &[i32], &[][..] as &[i32]]
1218        );
1219    }
1220
1221    #[test]
1222    fn test_gen_video_code_v0_empty_frames() {
1223        let frames: Vec<Vec<i32>> = vec![];
1224        assert!(matches!(
1225            gen_video_code_v0(&frames, 64),
1226            Err(IsccError::InvalidInput(_))
1227        ));
1228    }
1229
1230    #[test]
1231    fn test_gen_video_code_v0_conformance() {
1232        let json_str = include_str!("../tests/data.json");
1233        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1234        let section = &data["gen_video_code_v0"];
1235        let cases = section.as_object().unwrap();
1236
1237        let mut tested = 0;
1238
1239        for (tc_name, tc) in cases {
1240            let inputs = tc["inputs"].as_array().unwrap();
1241            let frames_json = inputs[0].as_array().unwrap();
1242            let bits = inputs[1].as_u64().unwrap() as u32;
1243            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1244
1245            let frame_sigs: Vec<Vec<i32>> = frames_json
1246                .iter()
1247                .map(|frame| {
1248                    frame
1249                        .as_array()
1250                        .unwrap()
1251                        .iter()
1252                        .map(|v| v.as_i64().unwrap() as i32)
1253                        .collect()
1254                })
1255                .collect();
1256
1257            let result = gen_video_code_v0(&frame_sigs, bits)
1258                .unwrap_or_else(|e| panic!("gen_video_code_v0 failed for {tc_name}: {e}"));
1259            assert_eq!(
1260                result.iscc, expected_iscc,
1261                "ISCC mismatch in test case {tc_name}"
1262            );
1263
1264            tested += 1;
1265        }
1266
1267        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1268    }
1269
1270    #[test]
1271    fn test_gen_mixed_code_v0_conformance() {
1272        let json_str = include_str!("../tests/data.json");
1273        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1274        let section = &data["gen_mixed_code_v0"];
1275        let cases = section.as_object().unwrap();
1276
1277        let mut tested = 0;
1278
1279        for (tc_name, tc) in cases {
1280            let inputs = tc["inputs"].as_array().unwrap();
1281            let codes_json = inputs[0].as_array().unwrap();
1282            let bits = inputs[1].as_u64().unwrap() as u32;
1283            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1284            let expected_parts: Vec<&str> = tc["outputs"]["parts"]
1285                .as_array()
1286                .unwrap()
1287                .iter()
1288                .map(|v| v.as_str().unwrap())
1289                .collect();
1290
1291            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();
1292
1293            let result = gen_mixed_code_v0(&codes, bits)
1294                .unwrap_or_else(|e| panic!("gen_mixed_code_v0 failed for {tc_name}: {e}"));
1295            assert_eq!(
1296                result.iscc, expected_iscc,
1297                "ISCC mismatch in test case {tc_name}"
1298            );
1299
1300            // Verify parts from struct match expected
1301            let result_parts: Vec<&str> = result.parts.iter().map(|s| s.as_str()).collect();
1302            assert_eq!(
1303                result_parts, expected_parts,
1304                "parts mismatch in test case {tc_name}"
1305            );
1306
1307            tested += 1;
1308        }
1309
1310        assert_eq!(tested, 2, "expected 2 conformance tests to run");
1311    }
1312
1313    #[test]
1314    fn test_gen_mixed_code_v0_too_few_codes() {
1315        assert!(matches!(
1316            gen_mixed_code_v0(&["EUA6GIKXN42IQV3S"], 64),
1317            Err(IsccError::InvalidInput(_))
1318        ));
1319    }
1320
1321    /// Build raw Content-Code bytes (header + body) for a given bit length.
1322    fn make_content_code_raw(stype: codec::SubType, bit_length: u32) -> Vec<u8> {
1323        let nbytes = (bit_length / 8) as usize;
1324        let body: Vec<u8> = (0..nbytes).map(|i| (i & 0xFF) as u8).collect();
1325        let base32 = codec::encode_component(
1326            codec::MainType::Content,
1327            stype,
1328            codec::Version::V0,
1329            bit_length,
1330            &body,
1331        )
1332        .unwrap();
1333        codec::decode_base32(&base32).unwrap()
1334    }
1335
1336    #[test]
1337    fn test_soft_hash_codes_v0_rejects_short_code() {
1338        // One code with 64 bits, one with only 32 bits — should reject when requesting 64
1339        let code_64 = make_content_code_raw(codec::SubType::None, 64);
1340        let code_32 = make_content_code_raw(codec::SubType::Image, 32);
1341        let result = soft_hash_codes_v0(&[code_64, code_32], 64);
1342        assert!(
1343            matches!(&result, Err(IsccError::InvalidInput(msg)) if msg.contains("too short")),
1344            "expected InvalidInput with 'too short', got {result:?}"
1345        );
1346    }
1347
1348    #[test]
1349    fn test_soft_hash_codes_v0_accepts_exact_length() {
1350        // Two codes with exactly 64 bits each — should succeed when requesting 64
1351        let code_a = make_content_code_raw(codec::SubType::None, 64);
1352        let code_b = make_content_code_raw(codec::SubType::Image, 64);
1353        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1354        assert!(result.is_ok(), "expected Ok, got {result:?}");
1355    }
1356
1357    #[test]
1358    fn test_soft_hash_codes_v0_accepts_longer_codes() {
1359        // Two codes with 128 bits each — should succeed when requesting 64
1360        let code_a = make_content_code_raw(codec::SubType::None, 128);
1361        let code_b = make_content_code_raw(codec::SubType::Audio, 128);
1362        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1363        assert!(result.is_ok(), "expected Ok, got {result:?}");
1364    }
1365
1366    #[test]
1367    fn test_gen_data_code_v0_conformance() {
1368        let json_str = include_str!("../tests/data.json");
1369        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1370        let section = &data["gen_data_code_v0"];
1371        let cases = section.as_object().unwrap();
1372
1373        let mut tested = 0;
1374
1375        for (tc_name, tc) in cases {
1376            let inputs = tc["inputs"].as_array().unwrap();
1377            let stream_str = inputs[0].as_str().unwrap();
1378            let bits = inputs[1].as_u64().unwrap() as u32;
1379            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1380
1381            // Parse "stream:" prefix — remainder is hex-encoded bytes
1382            let hex_data = stream_str
1383                .strip_prefix("stream:")
1384                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {tc_name}"));
1385            let input_bytes = hex::decode(hex_data)
1386                .unwrap_or_else(|e| panic!("invalid hex in test case {tc_name}: {e}"));
1387
1388            let result = gen_data_code_v0(&input_bytes, bits)
1389                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {tc_name}: {e}"));
1390            assert_eq!(
1391                result.iscc, expected_iscc,
1392                "ISCC mismatch in test case {tc_name}"
1393            );
1394
1395            tested += 1;
1396        }
1397
1398        assert_eq!(tested, 4, "expected 4 conformance tests to run");
1399    }
1400
1401    #[test]
1402    fn test_gen_instance_code_v0_empty() {
1403        let result = gen_instance_code_v0(b"", 64).unwrap();
1404        assert_eq!(result.iscc, "ISCC:IAA26E2JXH27TING");
1405        assert_eq!(result.filesize, 0);
1406        assert_eq!(
1407            result.datahash,
1408            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
1409        );
1410    }
1411
1412    #[test]
1413    fn test_gen_instance_code_v0_conformance() {
1414        let json_str = include_str!("../tests/data.json");
1415        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1416        let section = &data["gen_instance_code_v0"];
1417        let cases = section.as_object().unwrap();
1418
1419        for (name, tc) in cases {
1420            let inputs = tc["inputs"].as_array().unwrap();
1421            let stream_str = inputs[0].as_str().unwrap();
1422            let bits = inputs[1].as_u64().unwrap() as u32;
1423            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1424
1425            // Parse "stream:" prefix — remainder is hex-encoded bytes
1426            let hex_data = stream_str
1427                .strip_prefix("stream:")
1428                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
1429            let input_bytes = hex::decode(hex_data)
1430                .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));
1431
1432            let result = gen_instance_code_v0(&input_bytes, bits)
1433                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));
1434            assert_eq!(
1435                result.iscc, expected_iscc,
1436                "ISCC mismatch in test case {name}"
1437            );
1438
1439            // Verify datahash from struct
1440            if let Some(expected_datahash) = tc["outputs"].get("datahash") {
1441                let expected_datahash = expected_datahash.as_str().unwrap();
1442                assert_eq!(
1443                    result.datahash, expected_datahash,
1444                    "datahash mismatch in test case {name}"
1445                );
1446            }
1447
1448            // Verify filesize from struct
1449            if let Some(expected_filesize) = tc["outputs"].get("filesize") {
1450                let expected_filesize = expected_filesize.as_u64().unwrap();
1451                assert_eq!(
1452                    result.filesize, expected_filesize,
1453                    "filesize mismatch in test case {name}"
1454                );
1455            }
1456
1457            // Also verify filesize matches input data length
1458            assert_eq!(
1459                result.filesize,
1460                input_bytes.len() as u64,
1461                "filesize should match input length in test case {name}"
1462            );
1463        }
1464    }
1465
1466    #[test]
1467    fn test_gen_iscc_code_v0_conformance() {
1468        let json_str = include_str!("../tests/data.json");
1469        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1470        let section = &data["gen_iscc_code_v0"];
1471        let cases = section.as_object().unwrap();
1472
1473        let mut tested = 0;
1474
1475        for (tc_name, tc) in cases {
1476            let inputs = tc["inputs"].as_array().unwrap();
1477            let codes_json = inputs[0].as_array().unwrap();
1478            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1479
1480            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();
1481
1482            let result = gen_iscc_code_v0(&codes, false)
1483                .unwrap_or_else(|e| panic!("gen_iscc_code_v0 failed for {tc_name}: {e}"));
1484            assert_eq!(
1485                result.iscc, expected_iscc,
1486                "ISCC mismatch in test case {tc_name}"
1487            );
1488
1489            tested += 1;
1490        }
1491
1492        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1493    }
1494
1495    #[test]
1496    fn test_gen_iscc_code_v0_too_few_codes() {
1497        assert!(matches!(
1498            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG"], false),
1499            Err(IsccError::InvalidInput(_))
1500        ));
1501    }
1502
1503    #[test]
1504    fn test_gen_iscc_code_v0_missing_instance() {
1505        // Two Meta codes — missing Data and Instance
1506        assert!(matches!(
1507            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG", "AAAWKLHFPV6OPKDG"], false),
1508            Err(IsccError::InvalidInput(_))
1509        ));
1510    }
1511
1512    #[test]
1513    fn test_gen_iscc_code_v0_short_code() {
1514        // Code too short (< 16 chars)
1515        assert!(matches!(
1516            gen_iscc_code_v0(&["AAAWKLHFPV6", "AAAWKLHFPV6OPKDG"], false),
1517            Err(IsccError::InvalidInput(_))
1518        ));
1519    }
1520
1521    /// Verify that a Data-URL with empty base64 payload enters the meta bytes path.
1522    ///
1523    /// Python reference: `if meta:` is truthy for `"data:application/json;base64,"` (non-empty
1524    /// string), so it enters the meta branch with `payload = b""`. The result must have
1525    /// `meta = Some(...)` containing the original Data-URL and `metahash` equal to
1526    /// `multi_hash_blake3(&[])` (BLAKE3 of empty bytes).
1527    #[test]
1528    fn test_gen_meta_code_empty_data_url_enters_meta_branch() {
1529        let result =
1530            gen_meta_code_v0("Test", None, Some("data:application/json;base64,"), 64).unwrap();
1531
1532        // Result should be Ok
1533        assert_eq!(result.name, "Test");
1534
1535        // meta should contain the original Data-URL string (not None)
1536        assert_eq!(
1537            result.meta,
1538            Some("data:application/json;base64,".to_string()),
1539            "empty Data-URL payload should still enter meta branch"
1540        );
1541
1542        // metahash should be BLAKE3 of empty bytes
1543        let expected_metahash = utils::multi_hash_blake3(&[]);
1544        assert_eq!(
1545            result.metahash, expected_metahash,
1546            "metahash should be BLAKE3 of empty bytes"
1547        );
1548    }
1549
1550    /// Verify that `soft_hash_meta_v0_with_bytes` with empty bytes produces the same
1551    /// digest as `soft_hash_meta_v0` with no extra text.
1552    ///
1553    /// Python reference (`code_meta.py:142`): `if extra in {None, "", b""}:` returns
1554    /// name-only simhash without interleaving for all empty-like values.
1555    #[test]
1556    fn test_soft_hash_meta_v0_with_bytes_empty_equals_name_only() {
1557        let name_only = soft_hash_meta_v0("test", None);
1558        let empty_bytes = soft_hash_meta_v0_with_bytes("test", &[]);
1559        assert_eq!(
1560            name_only, empty_bytes,
1561            "empty bytes should produce same digest as name-only (no interleaving)"
1562        );
1563    }
1564}