Skip to main content

iscc_lib/
lib.rs

1//! High-performance Rust implementation of ISO 24138:2024 (ISCC).
2//!
3//! This crate provides the core ISCC algorithm implementations. All 9 `gen_*_v0`
4//! functions are the public Tier 1 API surface, designed to be compatible with
5//! the `iscc-core` Python reference implementation.
6
7pub mod cdc;
8pub mod codec;
9pub mod conformance;
10pub(crate) mod dct;
11pub mod minhash;
12pub mod simhash;
13pub mod streaming;
14pub mod types;
15pub mod utils;
16pub(crate) mod wtahash;
17
18pub use cdc::alg_cdc_chunks;
19pub use codec::encode_base64;
20pub use codec::iscc_decompose;
21pub use conformance::conformance_selftest;
22pub use minhash::alg_minhash_256;
23pub use simhash::{alg_simhash, sliding_window};
24pub use streaming::{DataHasher, InstanceHasher};
25pub use types::*;
26pub use utils::{text_clean, text_collapse, text_remove_newlines, text_trim};
27
28/// Error type for ISCC operations.
29#[derive(Debug, thiserror::Error)]
30pub enum IsccError {
31    /// Input data is invalid.
32    #[error("invalid input: {0}")]
33    InvalidInput(String),
34}
35
36/// Result type alias for ISCC operations.
37pub type IsccResult<T> = Result<T, IsccError>;
38
/// Weave the leading halves of two digests together, four bytes at a time.
///
/// Only the first 16 bytes of `a` and of `b` are read. The 32-byte output is
/// laid out as `a[0..4] b[0..4] a[4..8] b[4..8] ...` for four rounds.
///
/// Panics if either input holds fewer than 16 bytes.
fn interleave_digests(a: &[u8], b: &[u8]) -> Vec<u8> {
    let mut interleaved = Vec::with_capacity(32);
    let words_a = a[..16].chunks_exact(4);
    let words_b = b[..16].chunks_exact(4);
    for (word_a, word_b) in words_a.zip(words_b) {
        interleaved.extend_from_slice(word_a);
        interleaved.extend_from_slice(word_b);
    }
    interleaved
}
55
56/// Compute a SimHash digest from the name text for meta hashing.
57///
58/// Applies `text_collapse`, generates width-3 sliding window n-grams,
59/// hashes each with BLAKE3, and produces a SimHash.
60fn meta_name_simhash(name: &str) -> Vec<u8> {
61    let collapsed_name = utils::text_collapse(name);
62    let name_ngrams = simhash::sliding_window_strs(&collapsed_name, 3);
63    let name_hashes: Vec<[u8; 32]> = name_ngrams
64        .iter()
65        .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
66        .collect();
67    simhash::alg_simhash_inner(&name_hashes)
68}
69
70/// Compute a similarity-preserving 256-bit hash from metadata text.
71///
72/// Produces a SimHash digest from `name` n-grams. When `extra` is provided,
73/// interleaves the name and extra SimHash digests in 4-byte chunks.
74fn soft_hash_meta_v0(name: &str, extra: Option<&str>) -> Vec<u8> {
75    let name_simhash = meta_name_simhash(name);
76
77    match extra {
78        None | Some("") => name_simhash,
79        Some(extra_str) => {
80            let collapsed_extra = utils::text_collapse(extra_str);
81            let extra_ngrams = simhash::sliding_window_strs(&collapsed_extra, 3);
82            let extra_hashes: Vec<[u8; 32]> = extra_ngrams
83                .iter()
84                .map(|ng| *blake3::hash(ng.as_bytes()).as_bytes())
85                .collect();
86            let extra_simhash = simhash::alg_simhash_inner(&extra_hashes);
87
88            interleave_digests(&name_simhash, &extra_simhash)
89        }
90    }
91}
92
93/// Compute a similarity-preserving 256-bit hash from name text and raw bytes.
94///
95/// Like `soft_hash_meta_v0` but the extra data is raw bytes instead of text.
96/// Uses width-4 byte n-grams (no `text_collapse`) for the bytes path,
97/// and interleaves name/bytes SimHash digests in 4-byte chunks.
98fn soft_hash_meta_v0_with_bytes(name: &str, extra: &[u8]) -> Vec<u8> {
99    let name_simhash = meta_name_simhash(name);
100
101    if extra.is_empty() {
102        return name_simhash;
103    }
104
105    let byte_ngrams = simhash::sliding_window_bytes(extra, 4);
106    let byte_hashes: Vec<[u8; 32]> = byte_ngrams
107        .iter()
108        .map(|ng| *blake3::hash(ng).as_bytes())
109        .collect();
110    let byte_simhash = simhash::alg_simhash_inner(&byte_hashes);
111
112    interleave_digests(&name_simhash, &byte_simhash)
113}
114
115/// Decode a Data-URL's base64 payload.
116///
117/// Expects a string starting with `"data:"`. Splits on the first `,` and
118/// decodes the remainder as standard base64. Returns `InvalidInput` on
119/// missing comma or invalid base64.
120fn decode_data_url(data_url: &str) -> IsccResult<Vec<u8>> {
121    let payload_b64 = data_url
122        .split_once(',')
123        .map(|(_, b64)| b64)
124        .ok_or_else(|| IsccError::InvalidInput("Data-URL missing comma separator".into()))?;
125    data_encoding::BASE64
126        .decode(payload_b64.as_bytes())
127        .map_err(|e| IsccError::InvalidInput(format!("invalid base64 in Data-URL: {e}")))
128}
129
130/// Parse a meta string as JSON and re-serialize to RFC 8785 (JCS) canonical bytes.
131fn parse_meta_json(meta_str: &str) -> IsccResult<Vec<u8>> {
132    let parsed: serde_json::Value = serde_json::from_str(meta_str)
133        .map_err(|e| IsccError::InvalidInput(format!("invalid JSON in meta: {e}")))?;
134    let mut buf = Vec::new();
135    serde_json_canonicalizer::to_writer(&parsed, &mut buf)
136        .map_err(|e| IsccError::InvalidInput(format!("JSON canonicalization failed: {e}")))?;
137    Ok(buf)
138}
139
140/// Build a Data-URL from canonical JSON bytes.
141///
142/// Uses `application/ld+json` media type if the JSON has an `@context` key,
143/// otherwise `application/json`. Encodes payload as standard base64 with padding.
144fn build_meta_data_url(json_bytes: &[u8], json_value: &serde_json::Value) -> String {
145    let media_type = if json_value.get("@context").is_some() {
146        "application/ld+json"
147    } else {
148        "application/json"
149    };
150    let b64 = data_encoding::BASE64.encode(json_bytes);
151    format!("data:{media_type};base64,{b64}")
152}
153
154/// Generate a Meta-Code from name and optional metadata.
155///
156/// Produces an ISCC Meta-Code by hashing the provided name, description,
157/// and metadata fields using the SimHash algorithm. When `meta` is provided,
158/// it is treated as either a Data-URL (if starting with `"data:"`) or a JSON
159/// string, and the decoded/serialized bytes are used for similarity hashing
160/// and metahash computation.
161pub fn gen_meta_code_v0(
162    name: &str,
163    description: Option<&str>,
164    meta: Option<&str>,
165    bits: u32,
166) -> IsccResult<MetaCodeResult> {
167    // Normalize name: clean → remove newlines → trim to 128 bytes
168    let name = utils::text_clean(name);
169    let name = utils::text_remove_newlines(&name);
170    let name = utils::text_trim(&name, 128);
171
172    if name.is_empty() {
173        return Err(IsccError::InvalidInput(
174            "name is empty after normalization".into(),
175        ));
176    }
177
178    // Normalize description: clean → trim to 4096 bytes
179    let desc_str = description.unwrap_or("");
180    let desc_clean = utils::text_clean(desc_str);
181    let desc_clean = utils::text_trim(&desc_clean, 4096);
182
183    // Resolve meta payload bytes (if meta is provided)
184    let meta_payload: Option<Vec<u8>> = match meta {
185        Some(meta_str) if meta_str.starts_with("data:") => Some(decode_data_url(meta_str)?),
186        Some(meta_str) => Some(parse_meta_json(meta_str)?),
187        None => None,
188    };
189
190    // Branch: meta bytes path vs. description text path
191    if let Some(ref payload) = meta_payload {
192        let meta_code_digest = soft_hash_meta_v0_with_bytes(&name, payload);
193        let metahash = utils::multi_hash_blake3(payload);
194
195        let meta_code = codec::encode_component(
196            codec::MainType::Meta,
197            codec::SubType::None,
198            codec::Version::V0,
199            bits,
200            &meta_code_digest,
201        )?;
202
203        // Build the meta Data-URL for the result
204        let meta_value = match meta {
205            Some(meta_str) if meta_str.starts_with("data:") => meta_str.to_string(),
206            Some(meta_str) => {
207                let parsed: serde_json::Value = serde_json::from_str(meta_str)
208                    .map_err(|e| IsccError::InvalidInput(format!("invalid JSON: {e}")))?;
209                build_meta_data_url(payload, &parsed)
210            }
211            None => unreachable!(),
212        };
213
214        Ok(MetaCodeResult {
215            iscc: format!("ISCC:{meta_code}"),
216            name: name.clone(),
217            description: if desc_clean.is_empty() {
218                None
219            } else {
220                Some(desc_clean)
221            },
222            meta: Some(meta_value),
223            metahash,
224        })
225    } else {
226        // Compute metahash from normalized text payload
227        let payload = if desc_clean.is_empty() {
228            name.clone()
229        } else {
230            format!("{} {}", name, desc_clean)
231        };
232        let payload = payload.trim().to_string();
233        let metahash = utils::multi_hash_blake3(payload.as_bytes());
234
235        // Compute similarity digest
236        let extra = if desc_clean.is_empty() {
237            None
238        } else {
239            Some(desc_clean.as_str())
240        };
241        let meta_code_digest = soft_hash_meta_v0(&name, extra);
242
243        let meta_code = codec::encode_component(
244            codec::MainType::Meta,
245            codec::SubType::None,
246            codec::Version::V0,
247            bits,
248            &meta_code_digest,
249        )?;
250
251        Ok(MetaCodeResult {
252            iscc: format!("ISCC:{meta_code}"),
253            name: name.clone(),
254            description: if desc_clean.is_empty() {
255                None
256            } else {
257                Some(desc_clean)
258            },
259            meta: None,
260            metahash,
261        })
262    }
263}
264
265/// Compute a 256-bit similarity-preserving hash from collapsed text.
266///
267/// Generates character n-grams with a sliding window of width 13,
268/// hashes each with xxh32, then applies MinHash to produce a 32-byte digest.
269fn soft_hash_text_v0(text: &str) -> Vec<u8> {
270    let ngrams = simhash::sliding_window_strs(text, 13);
271    let features: Vec<u32> = ngrams
272        .iter()
273        .map(|ng| xxhash_rust::xxh32::xxh32(ng.as_bytes(), 0))
274        .collect();
275    minhash::alg_minhash_256(&features)
276}
277
278/// Generate a Text-Code from plain text content.
279///
280/// Produces an ISCC Content-Code for text by collapsing the input,
281/// extracting character n-gram features, and applying MinHash to
282/// create a similarity-preserving fingerprint.
283pub fn gen_text_code_v0(text: &str, bits: u32) -> IsccResult<TextCodeResult> {
284    let collapsed = utils::text_collapse(text);
285    let characters = collapsed.chars().count();
286    let hash_digest = soft_hash_text_v0(&collapsed);
287    let component = codec::encode_component(
288        codec::MainType::Content,
289        codec::SubType::TEXT,
290        codec::Version::V0,
291        bits,
292        &hash_digest,
293    )?;
294    Ok(TextCodeResult {
295        iscc: format!("ISCC:{component}"),
296        characters,
297    })
298}
299
/// Swap rows and columns of a row-major matrix.
///
/// The output has as many rows as the first input row has columns, each of
/// length `matrix.len()`. An empty input gives an empty output. Rows shorter
/// than the first leave zeros in the missing cells; a row longer than the
/// first panics.
fn transpose_matrix(matrix: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let width = match matrix.first() {
        Some(first_row) => first_row.len(),
        None => return Vec::new(),
    };
    let height = matrix.len();

    let mut transposed = vec![vec![0.0f64; height]; width];
    for (row_idx, source_row) in matrix.iter().enumerate() {
        for (col_idx, &cell) in source_row.iter().enumerate() {
            transposed[col_idx][row_idx] = cell;
        }
    }
    transposed
}
315
/// Copy an 8×8 window out of a matrix into a flat, row-major vector.
///
/// The window's top-left cell is `matrix[row][col]`. Rows or columns that
/// fall outside the matrix are simply skipped, so the result holds at most
/// 64 values.
fn flatten_8x8(matrix: &[Vec<f64>], col: usize, row: usize) -> Vec<f64> {
    matrix
        .iter()
        .skip(row)
        .take(8)
        .flat_map(|line| line.iter().skip(col).take(8).copied())
        .collect()
}
329
/// Median of a slice of `f64` values.
///
/// Works on a sorted copy. An odd count returns the middle element; an even
/// count returns the mean of the two middle elements (the convention used by
/// Python's `statistics.median`).
///
/// Panics on an empty slice or when a value cannot be ordered (NaN).
fn compute_median(values: &[f64]) -> f64 {
    let mut ordered = values.to_vec();
    ordered.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());

    let mid = ordered.len() / 2;
    match ordered.len() % 2 {
        1 => ordered[mid],
        _ => (ordered[mid - 1] + ordered[mid]) / 2.0,
    }
}
344
/// Pack booleans into bytes, most significant bit first.
///
/// Every group of eight flags becomes one byte; a trailing group with fewer
/// than eight flags fills the high bits and leaves the low bits zero.
fn bits_to_bytes(bits: &[bool]) -> Vec<u8> {
    bits.chunks(8)
        .map(|group| {
            group
                .iter()
                .enumerate()
                .fold(0u8, |packed, (pos, &is_set)| {
                    if is_set {
                        packed | (0x80u8 >> pos)
                    } else {
                        packed
                    }
                })
        })
        .collect()
}
359
360/// Compute a DCT-based perceptual hash from 32×32 grayscale pixels.
361///
362/// Applies a 2D DCT to the pixel matrix, extracts four 8×8 low-frequency
363/// blocks, and generates a bitstring by comparing each coefficient against
364/// the block median. Returns up to `bits` bits as a byte vector.
365fn soft_hash_image_v0(pixels: &[u8], bits: u32) -> IsccResult<Vec<u8>> {
366    if pixels.len() != 1024 {
367        return Err(IsccError::InvalidInput(format!(
368            "expected 1024 pixels, got {}",
369            pixels.len()
370        )));
371    }
372    if bits > 256 {
373        return Err(IsccError::InvalidInput(format!(
374            "bits must be <= 256, got {}",
375            bits
376        )));
377    }
378
379    // Step 1: Row-wise DCT (32 rows of 32 pixels)
380    let rows: Vec<Vec<f64>> = pixels
381        .chunks(32)
382        .map(|row| {
383            let row_f64: Vec<f64> = row.iter().map(|&p| p as f64).collect();
384            dct::alg_dct(&row_f64)
385        })
386        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
387
388    // Step 2: Transpose
389    let transposed = transpose_matrix(&rows);
390
391    // Step 3: Column-wise DCT
392    let dct_cols: Vec<Vec<f64>> = transposed
393        .iter()
394        .map(|col| dct::alg_dct(col))
395        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
396
397    // Step 4: Transpose back → dct_matrix
398    let dct_matrix = transpose_matrix(&dct_cols);
399
400    // Step 5: Extract 8×8 blocks at positions (0,0), (1,0), (0,1), (1,1)
401    let positions = [(0, 0), (1, 0), (0, 1), (1, 1)];
402    let mut bitstring = Vec::<bool>::with_capacity(256);
403
404    for (col, row) in positions {
405        let flat = flatten_8x8(&dct_matrix, col, row);
406        let median = compute_median(&flat);
407        for val in &flat {
408            bitstring.push(*val > median);
409        }
410        if bitstring.len() >= bits as usize {
411            break;
412        }
413    }
414
415    // Step 6: Convert first `bits` bools to bytes
416    Ok(bits_to_bytes(&bitstring[..bits as usize]))
417}
418
419/// Generate an Image-Code from pixel data.
420///
421/// Produces an ISCC Content-Code for images from a sequence of 1024
422/// grayscale pixel values (32×32, values 0-255) using a DCT-based
423/// perceptual hash.
424pub fn gen_image_code_v0(pixels: &[u8], bits: u32) -> IsccResult<ImageCodeResult> {
425    let hash_digest = soft_hash_image_v0(pixels, bits)?;
426    let component = codec::encode_component(
427        codec::MainType::Content,
428        codec::SubType::Image,
429        codec::Version::V0,
430        bits,
431        &hash_digest,
432    )?;
433    Ok(ImageCodeResult {
434        iscc: format!("ISCC:{component}"),
435    })
436}
437
/// Divide a slice into `n` consecutive parts of near-equal size.
///
/// Every part receives `len / n` elements and the first `len % n` parts one
/// more, mirroring `numpy.array_split` / `more_itertools.divide`. When `n`
/// exceeds the slice length the surplus parts are empty. `n == 0` yields an
/// empty vector.
fn array_split<T>(slice: &[T], n: usize) -> Vec<&[T]> {
    if n == 0 {
        return Vec::new();
    }

    let base = slice.len() / n;
    let extra = slice.len() % n;

    let mut parts = Vec::with_capacity(n);
    let mut rest = slice;
    for index in 0..n {
        let size = if index < extra { base + 1 } else { base };
        // Peel the next part off the front of what is left.
        let (head, tail) = rest.split_at(size);
        parts.push(head);
        rest = tail;
    }
    parts
}
459
460/// Compute a multi-stage SimHash digest from Chromaprint features.
461///
462/// Builds a 32-byte digest by concatenating 4-byte SimHash chunks:
463/// - Stage 1: overall SimHash of all features (4 bytes)
464/// - Stage 2: SimHash of each quarter of features (4 × 4 = 16 bytes)
465/// - Stage 3: SimHash of each third of sorted features (3 × 4 = 12 bytes)
466fn soft_hash_audio_v0(cv: &[i32]) -> Vec<u8> {
467    // Convert each i32 to 4-byte big-endian digest
468    let digests: Vec<[u8; 4]> = cv.iter().map(|&v| v.to_be_bytes()).collect();
469
470    if digests.is_empty() {
471        return vec![0u8; 32];
472    }
473
474    // Stage 1: overall SimHash (4 bytes)
475    let mut parts: Vec<u8> = simhash::alg_simhash_inner(&digests);
476
477    // Stage 2: quarter-based SimHashes (4 × 4 = 16 bytes)
478    let quarters = array_split(&digests, 4);
479    for quarter in &quarters {
480        if quarter.is_empty() {
481            parts.extend_from_slice(&[0u8; 4]);
482        } else {
483            parts.extend_from_slice(&simhash::alg_simhash_inner(quarter));
484        }
485    }
486
487    // Stage 3: sorted-third-based SimHashes (3 × 4 = 12 bytes)
488    let mut sorted_values: Vec<i32> = cv.to_vec();
489    sorted_values.sort();
490    let sorted_digests: Vec<[u8; 4]> = sorted_values.iter().map(|&v| v.to_be_bytes()).collect();
491    let thirds = array_split(&sorted_digests, 3);
492    for third in &thirds {
493        if third.is_empty() {
494            parts.extend_from_slice(&[0u8; 4]);
495        } else {
496            parts.extend_from_slice(&simhash::alg_simhash_inner(third));
497        }
498    }
499
500    parts
501}
502
503/// Generate an Audio-Code from a Chromaprint feature vector.
504///
505/// Produces an ISCC Content-Code for audio from a Chromaprint signed
506/// integer fingerprint vector using multi-stage SimHash.
507pub fn gen_audio_code_v0(cv: &[i32], bits: u32) -> IsccResult<AudioCodeResult> {
508    let hash_digest = soft_hash_audio_v0(cv);
509    let component = codec::encode_component(
510        codec::MainType::Content,
511        codec::SubType::Audio,
512        codec::Version::V0,
513        bits,
514        &hash_digest,
515    )?;
516    Ok(AudioCodeResult {
517        iscc: format!("ISCC:{component}"),
518    })
519}
520
521/// Compute a similarity-preserving hash from video frame signatures.
522///
523/// Deduplicates frame signatures, computes column-wise sums across all
524/// unique frames, then applies WTA-Hash to produce a digest of `bits/8` bytes.
525pub fn soft_hash_video_v0(frame_sigs: &[Vec<i32>], bits: u32) -> IsccResult<Vec<u8>> {
526    if frame_sigs.is_empty() {
527        return Err(IsccError::InvalidInput(
528            "frame_sigs must not be empty".into(),
529        ));
530    }
531
532    // Deduplicate using BTreeSet (Vec<i32> implements Ord)
533    let unique: std::collections::BTreeSet<&Vec<i32>> = frame_sigs.iter().collect();
534
535    // Column-wise sum into i64 to avoid overflow
536    let cols = frame_sigs[0].len();
537    let mut vecsum = vec![0i64; cols];
538    for sig in &unique {
539        for (c, &val) in sig.iter().enumerate() {
540            vecsum[c] += val as i64;
541        }
542    }
543
544    Ok(wtahash::alg_wtahash(&vecsum, bits))
545}
546
547/// Generate a Video-Code from frame signature data.
548///
549/// Produces an ISCC Content-Code for video from a sequence of MPEG-7 frame
550/// signatures. Each frame signature is a 380-element integer vector.
551pub fn gen_video_code_v0(frame_sigs: &[Vec<i32>], bits: u32) -> IsccResult<VideoCodeResult> {
552    let digest = soft_hash_video_v0(frame_sigs, bits)?;
553    let component = codec::encode_component(
554        codec::MainType::Content,
555        codec::SubType::Video,
556        codec::Version::V0,
557        bits,
558        &digest,
559    )?;
560    Ok(VideoCodeResult {
561        iscc: format!("ISCC:{component}"),
562    })
563}
564
565/// Combine multiple Content-Code digests into a single similarity hash.
566///
567/// Takes raw decoded ISCC bytes (header + body) for each Content-Code and
568/// produces a SimHash digest. Each input is trimmed to `bits/8` bytes by
569/// keeping the first header byte (encodes type info) plus `nbytes-1` body bytes.
570/// Requires at least 2 codes, all of MainType::Content.
571fn soft_hash_codes_v0(cc_digests: &[Vec<u8>], bits: u32) -> IsccResult<Vec<u8>> {
572    if cc_digests.len() < 2 {
573        return Err(IsccError::InvalidInput(
574            "at least 2 Content-Codes required for mixing".into(),
575        ));
576    }
577
578    let nbytes = (bits / 8) as usize;
579    let mut prepared: Vec<Vec<u8>> = Vec::with_capacity(cc_digests.len());
580
581    for raw in cc_digests {
582        let (mtype, stype, _ver, blen, body) = codec::decode_header(raw)?;
583        if mtype != codec::MainType::Content {
584            return Err(IsccError::InvalidInput(
585                "all codes must be Content-Codes".into(),
586            ));
587        }
588        let unit_bits = codec::decode_length(mtype, blen, stype);
589        if unit_bits < bits {
590            return Err(IsccError::InvalidInput(format!(
591                "Content-Code too short for {bits}-bit length (has {unit_bits} bits)"
592            )));
593        }
594        let mut entry = Vec::with_capacity(nbytes);
595        entry.push(raw[0]); // first byte preserves type info
596        let take = std::cmp::min(nbytes - 1, body.len());
597        entry.extend_from_slice(&body[..take]);
598        // Pad with zeros if body is shorter than nbytes-1
599        while entry.len() < nbytes {
600            entry.push(0);
601        }
602        prepared.push(entry);
603    }
604
605    Ok(simhash::alg_simhash_inner(&prepared))
606}
607
608/// Generate a Mixed-Code from multiple Content-Code strings.
609///
610/// Produces a Mixed Content-Code by combining multiple ISCC Content-Codes
611/// of different types (text, image, audio, video) using SimHash. Input codes
612/// may optionally include the "ISCC:" prefix.
613pub fn gen_mixed_code_v0(codes: &[&str], bits: u32) -> IsccResult<MixedCodeResult> {
614    let decoded: Vec<Vec<u8>> = codes
615        .iter()
616        .map(|code| {
617            let clean = code.strip_prefix("ISCC:").unwrap_or(code);
618            codec::decode_base32(clean)
619        })
620        .collect::<IsccResult<Vec<Vec<u8>>>>()?;
621
622    let digest = soft_hash_codes_v0(&decoded, bits)?;
623
624    let component = codec::encode_component(
625        codec::MainType::Content,
626        codec::SubType::Mixed,
627        codec::Version::V0,
628        bits,
629        &digest,
630    )?;
631
632    Ok(MixedCodeResult {
633        iscc: format!("ISCC:{component}"),
634        parts: codes.iter().map(|s| s.to_string()).collect(),
635    })
636}
637
638/// Generate a Data-Code from raw byte data.
639///
640/// Produces an ISCC Data-Code by splitting data into content-defined chunks,
641/// hashing each chunk with xxh32, and applying MinHash to create a
642/// similarity-preserving fingerprint.
643pub fn gen_data_code_v0(data: &[u8], bits: u32) -> IsccResult<DataCodeResult> {
644    let chunks = cdc::alg_cdc_chunks(data, false, cdc::DATA_AVG_CHUNK_SIZE);
645    let mut features: Vec<u32> = chunks
646        .iter()
647        .map(|chunk| xxhash_rust::xxh32::xxh32(chunk, 0))
648        .collect();
649
650    // Defensive: ensure at least one feature (alg_cdc_chunks guarantees >= 1 chunk)
651    if features.is_empty() {
652        features.push(xxhash_rust::xxh32::xxh32(b"", 0));
653    }
654
655    let digest = minhash::alg_minhash_256(&features);
656    let component = codec::encode_component(
657        codec::MainType::Data,
658        codec::SubType::None,
659        codec::Version::V0,
660        bits,
661        &digest,
662    )?;
663
664    Ok(DataCodeResult {
665        iscc: format!("ISCC:{component}"),
666    })
667}
668
669/// Generate an Instance-Code from raw byte data.
670///
671/// Produces an ISCC Instance-Code by hashing the complete byte stream
672/// with BLAKE3. Captures the exact binary identity of the data.
673pub fn gen_instance_code_v0(data: &[u8], bits: u32) -> IsccResult<InstanceCodeResult> {
674    let digest = blake3::hash(data);
675    let datahash = utils::multi_hash_blake3(data);
676    let filesize = data.len() as u64;
677    let component = codec::encode_component(
678        codec::MainType::Instance,
679        codec::SubType::None,
680        codec::Version::V0,
681        bits,
682        digest.as_bytes(),
683    )?;
684    Ok(InstanceCodeResult {
685        iscc: format!("ISCC:{component}"),
686        datahash,
687        filesize,
688    })
689}
690
691/// Generate a composite ISCC-CODE from individual ISCC unit codes.
692///
693/// Combines multiple ISCC unit codes (Meta-Code, Content-Code, Data-Code,
694/// Instance-Code) into a single composite ISCC-CODE. Input codes may
695/// optionally include the "ISCC:" prefix. At least Data-Code and
696/// Instance-Code are required. When `wide` is true and exactly two
697/// 128-bit+ codes (Data + Instance) are provided, produces a 256-bit
698/// wide-mode code.
699pub fn gen_iscc_code_v0(codes: &[&str], wide: bool) -> IsccResult<IsccCodeResult> {
700    // Step 1: Clean inputs — strip "ISCC:" prefix
701    let cleaned: Vec<&str> = codes
702        .iter()
703        .map(|c| c.strip_prefix("ISCC:").unwrap_or(c))
704        .collect();
705
706    // Step 2: Validate minimum count
707    if cleaned.len() < 2 {
708        return Err(IsccError::InvalidInput(
709            "at least 2 ISCC unit codes required".into(),
710        ));
711    }
712
713    // Step 3: Validate minimum length (16 base32 chars = 64-bit minimum)
714    for code in &cleaned {
715        if code.len() < 16 {
716            return Err(IsccError::InvalidInput(format!(
717                "ISCC unit code too short (min 16 chars): {}",
718                code
719            )));
720        }
721    }
722
723    // Step 4: Decode each code
724    let mut decoded: Vec<(
725        codec::MainType,
726        codec::SubType,
727        codec::Version,
728        u32,
729        Vec<u8>,
730    )> = Vec::with_capacity(cleaned.len());
731    for code in &cleaned {
732        let raw = codec::decode_base32(code)?;
733        let header = codec::decode_header(&raw)?;
734        decoded.push(header);
735    }
736
737    // Step 5: Sort by MainType (ascending)
738    decoded.sort_by_key(|&(mt, ..)| mt);
739
740    // Step 6: Extract main_types
741    let main_types: Vec<codec::MainType> = decoded.iter().map(|&(mt, ..)| mt).collect();
742
743    // Step 7: Validate last two are Data + Instance (mandatory)
744    let n = main_types.len();
745    if main_types[n - 2] != codec::MainType::Data || main_types[n - 1] != codec::MainType::Instance
746    {
747        return Err(IsccError::InvalidInput(
748            "Data-Code and Instance-Code are mandatory".into(),
749        ));
750    }
751
752    // Step 8: Determine wide composite
753    let is_wide = wide
754        && decoded.len() == 2
755        && main_types == [codec::MainType::Data, codec::MainType::Instance]
756        && decoded
757            .iter()
758            .all(|&(mt, st, _, len, _)| codec::decode_length(mt, len, st) >= 128);
759
760    // Step 9: Determine SubType
761    let st = if is_wide {
762        codec::SubType::Wide
763    } else {
764        // Collect SubTypes of Semantic/Content units
765        let sc_subtypes: Vec<codec::SubType> = decoded
766            .iter()
767            .filter(|&&(mt, ..)| mt == codec::MainType::Semantic || mt == codec::MainType::Content)
768            .map(|&(_, st, ..)| st)
769            .collect();
770
771        if !sc_subtypes.is_empty() {
772            // All must be the same
773            let first = sc_subtypes[0];
774            if sc_subtypes.iter().all(|&s| s == first) {
775                first
776            } else {
777                return Err(IsccError::InvalidInput(
778                    "mixed SubTypes among Content/Semantic units".into(),
779                ));
780            }
781        } else if decoded.len() == 2 {
782            codec::SubType::Sum
783        } else {
784            codec::SubType::IsccNone
785        }
786    };
787
788    // Step 10–11: Get optional MainTypes and encode
789    let optional_types = &main_types[..n - 2];
790    let encoded_length = codec::encode_units(optional_types)?;
791
792    // Step 12: Build digest body
793    let bytes_per_unit = if is_wide { 16 } else { 8 };
794    let mut digest = Vec::with_capacity(decoded.len() * bytes_per_unit);
795    for (_, _, _, _, tail) in &decoded {
796        let take = bytes_per_unit.min(tail.len());
797        digest.extend_from_slice(&tail[..take]);
798    }
799
800    // Step 13–14: Encode header + digest as base32
801    let header = codec::encode_header(
802        codec::MainType::Iscc,
803        st,
804        codec::Version::V0,
805        encoded_length,
806    )?;
807    let mut code_bytes = header;
808    code_bytes.extend_from_slice(&digest);
809    let code = codec::encode_base32(&code_bytes);
810
811    // Step 15: Return with prefix
812    Ok(IsccCodeResult {
813        iscc: format!("ISCC:{code}"),
814    })
815}
816
#[cfg(test)]
mod tests {
    use super::*;
    // Explicit import keeps `try_from` available regardless of the crate edition.
    use std::convert::TryFrom;

    /// Load one section of the shared conformance vectors as a name → case map.
    ///
    /// The vectors are embedded at compile time from `../tests/data.json`.
    /// Panics if the file is not valid JSON or the section is missing or not an object.
    fn conformance_cases(section: &str) -> serde_json::Map<String, serde_json::Value> {
        let data: serde_json::Value = serde_json::from_str(include_str!("../tests/data.json"))
            .expect("tests/data.json must be valid JSON");
        match data.get(section) {
            Some(serde_json::Value::Object(cases)) => cases.clone(),
            _ => panic!("conformance section {section} is missing or not a JSON object"),
        }
    }

    /// Read a JSON number as the `bits` argument, panicking instead of truncating.
    fn bits_arg(value: &serde_json::Value) -> u32 {
        let raw = value.as_u64().expect("bits must be an unsigned integer");
        u32::try_from(raw).expect("bits must fit in u32")
    }

    /// Convert a JSON array of integers into a `Vec<i32>`, panicking on out-of-range values.
    fn i32_vec(value: &serde_json::Value) -> Vec<i32> {
        value
            .as_array()
            .expect("expected a JSON array of integers")
            .iter()
            .map(|v| {
                let raw = v.as_i64().expect("expected an integer");
                i32::try_from(raw).expect("integer must fit in i32")
            })
            .collect()
    }

    /// Decode a `"stream:<hex>"` conformance input into raw bytes.
    fn stream_bytes(tc_name: &str, value: &serde_json::Value) -> Vec<u8> {
        let hex_data = value
            .as_str()
            .unwrap()
            .strip_prefix("stream:")
            .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {tc_name}"));
        hex::decode(hex_data).unwrap_or_else(|e| panic!("invalid hex in test case {tc_name}: {e}"))
    }

    /// Meta-Code from a name only: no description and no meta in the result.
    #[test]
    fn test_gen_meta_code_v0_title_only() {
        let result = gen_meta_code_v0("Die Unendliche Geschichte", None, None, 64).unwrap();
        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU74YAZIM");
        assert_eq!(result.name, "Die Unendliche Geschichte");
        assert_eq!(result.description, None);
        assert_eq!(result.meta, None);
    }

    /// Meta-Code from a name plus description: description is echoed, meta stays `None`.
    #[test]
    fn test_gen_meta_code_v0_title_description() {
        let result = gen_meta_code_v0(
            "Die Unendliche Geschichte",
            Some("Von Michael Ende"),
            None,
            64,
        )
        .unwrap();
        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU4E45RB5");
        assert_eq!(result.name, "Die Unendliche Geschichte");
        assert_eq!(result.description, Some("Von Michael Ende".to_string()));
        assert_eq!(result.meta, None);
    }

    /// JSON meta input is returned as a base64 `application/json` Data-URL.
    #[test]
    fn test_gen_meta_code_v0_json_meta() {
        let result = gen_meta_code_v0("Hello", None, Some(r#"{"some":"object"}"#), 64).unwrap();
        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN63LHL2");
        let meta = result.meta.expect("meta should be present for JSON input");
        assert!(meta.starts_with("data:application/json;base64,"));
    }

    /// Data-URL meta input is returned unchanged in the result.
    #[test]
    fn test_gen_meta_code_v0_data_url_meta() {
        let data_url = "data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9";
        let result = gen_meta_code_v0("Hello", None, Some(data_url), 64).unwrap();
        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN43ICP2");
        // Data-URL is passed through as-is
        assert_eq!(result.meta, Some(data_url.to_string()));
    }

    /// Verify that JSON metadata with float values is canonicalized per RFC 8785 (JCS).
    ///
    /// JCS serializes `1.0` as `1` (integer form), while `serde_json` preserves `1.0`.
    /// This causes different canonical bytes, different metahash, and different ISCC codes.
    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1.0})`.
    #[test]
    fn test_gen_meta_code_v0_jcs_float_canonicalization() {
        // JCS canonicalizes {"value": 1.0} → {"value":1} (integer form)
        // serde_json produces {"value":1.0} (preserves float notation)
        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1.0}"#), 64).unwrap();

        // Expected values from iscc-core (Python) using jcs.canonicalize()
        assert_eq!(
            result.iscc, "ISCC:AAAX4GX3RZH2I6QZ",
            "ISCC mismatch: parse_meta_json must use RFC 8785 (JCS) canonicalization"
        );
        assert_eq!(
            result.meta,
            Some("data:application/json;base64,eyJ2YWx1ZSI6MX0=".to_string()),
            "meta Data-URL mismatch: JCS should serialize 1.0 as 1"
        );
        assert_eq!(
            result.metahash, "1e2010b291d392b6999ffe4aa4661fb343fc371fca3bfb5bb4e8d8226fdf85743232",
            "metahash mismatch: canonical bytes differ between JCS and serde_json"
        );
    }

    /// Verify JCS number formatting for large floats (scientific notation edge case).
    ///
    /// JCS serializes `1e20` as `100000000000000000000` (expanded integer form).
    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1e20})`.
    #[test]
    fn test_gen_meta_code_v0_jcs_large_float_canonicalization() {
        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1e20}"#), 64).unwrap();

        assert_eq!(
            result.iscc, "ISCC:AAAX4GX3R32YH5P7",
            "ISCC mismatch: JCS should expand 1e20 to 100000000000000000000"
        );
        assert_eq!(
            result.meta,
            Some(
                "data:application/json;base64,eyJ2YWx1ZSI6MTAwMDAwMDAwMDAwMDAwMDAwMDAwfQ=="
                    .to_string()
            ),
            "meta Data-URL mismatch: JCS should expand large float to integer form"
        );
        assert_eq!(
            result.metahash, "1e201ff83c1822c348717658a0b4713739646da7c59832691b337a457416ddd1c73d",
            "metahash mismatch: canonical bytes differ for large float"
        );
    }

    /// Meta that is neither valid JSON nor a Data-URL is rejected as `InvalidInput`.
    #[test]
    fn test_gen_meta_code_v0_invalid_json() {
        assert!(matches!(
            gen_meta_code_v0("test", None, Some("not json"), 64),
            Err(IsccError::InvalidInput(_))
        ));
    }

    /// A `data:` string without a comma separator is rejected as `InvalidInput`.
    #[test]
    fn test_gen_meta_code_v0_invalid_data_url() {
        assert!(matches!(
            gen_meta_code_v0("test", None, Some("data:no-comma-here"), 64),
            Err(IsccError::InvalidInput(_))
        ));
    }

    /// Run every `gen_meta_code_v0` conformance vector and compare all reported fields.
    #[test]
    fn test_gen_meta_code_v0_conformance() {
        let cases = conformance_cases("gen_meta_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let input_name = inputs[0].as_str().unwrap();
            let input_desc = inputs[1].as_str().unwrap();
            let meta_val = &inputs[2];
            let bits = bits_arg(&inputs[3]);

            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
            let expected_metahash = tc["outputs"]["metahash"].as_str().unwrap();

            // Dispatch meta parameter based on JSON value type
            let meta_arg: Option<String> = match meta_val {
                serde_json::Value::Null => None,
                serde_json::Value::String(s) => Some(s.clone()),
                serde_json::Value::Object(_) => Some(serde_json::to_string(meta_val).unwrap()),
                other => panic!("unexpected meta type in {tc_name}: {other:?}"),
            };

            // The vectors encode "no description" as an empty string.
            let desc = Some(input_desc).filter(|d| !d.is_empty());

            // Verify ISCC output from struct
            let result = gen_meta_code_v0(input_name, desc, meta_arg.as_deref(), bits)
                .unwrap_or_else(|e| panic!("gen_meta_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            // Verify metahash from struct
            assert_eq!(
                result.metahash, expected_metahash,
                "metahash mismatch in test case {tc_name}"
            );

            // Verify name from struct
            if let Some(expected_name) = tc["outputs"].get("name") {
                let expected_name = expected_name.as_str().unwrap();
                assert_eq!(
                    result.name, expected_name,
                    "name mismatch in test case {tc_name}"
                );
            }

            // Verify description from struct
            if let Some(expected_desc) = tc["outputs"].get("description") {
                let expected_desc = expected_desc.as_str().unwrap();
                assert_eq!(
                    result.description.as_deref(),
                    Some(expected_desc),
                    "description mismatch in test case {tc_name}"
                );
            }

            // Verify meta from struct
            if meta_arg.is_some() {
                assert!(
                    result.meta.is_some(),
                    "meta should be present in test case {tc_name}"
                );
            } else {
                assert!(
                    result.meta.is_none(),
                    "meta should be absent in test case {tc_name}"
                );
            }

            tested += 1;
        }

        assert_eq!(tested, 16, "expected 16 conformance tests to run");
    }

    /// Empty text still yields a Text-Code and reports zero characters.
    #[test]
    fn test_gen_text_code_v0_empty() {
        let result = gen_text_code_v0("", 64).unwrap();
        assert_eq!(result.iscc, "ISCC:EAASL4F2WZY7KBXB");
        assert_eq!(result.characters, 0);
    }

    /// Text-Code for a short phrase, including the reported character count.
    #[test]
    fn test_gen_text_code_v0_hello_world() {
        let result = gen_text_code_v0("Hello World", 64).unwrap();
        assert_eq!(result.iscc, "ISCC:EAASKDNZNYGUUF5A");
        assert_eq!(result.characters, 10); // "helloworld" after collapse
    }

    /// Run every `gen_text_code_v0` conformance vector (ISCC and character count).
    #[test]
    fn test_gen_text_code_v0_conformance() {
        let cases = conformance_cases("gen_text_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let input_text = inputs[0].as_str().unwrap();
            let bits = bits_arg(&inputs[1]);

            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
            let expected_chars = usize::try_from(tc["outputs"]["characters"].as_u64().unwrap())
                .expect("character count must fit in usize");

            // Verify ISCC output from struct
            let result = gen_text_code_v0(input_text, bits)
                .unwrap_or_else(|e| panic!("gen_text_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            // Verify character count from struct
            assert_eq!(
                result.characters, expected_chars,
                "character count mismatch in test case {tc_name}"
            );

            tested += 1;
        }

        assert_eq!(tested, 5, "expected 5 conformance tests to run");
    }

    /// Image-Code for 1024 all-zero pixel values at 64 bits.
    #[test]
    fn test_gen_image_code_v0_all_black() {
        let pixels = vec![0u8; 1024];
        let result = gen_image_code_v0(&pixels, 64).unwrap();
        assert_eq!(result.iscc, "ISCC:EEAQAAAAAAAAAAAA");
    }

    /// Image-Code for 1024 all-255 pixel values at 128 bits.
    #[test]
    fn test_gen_image_code_v0_all_white() {
        let pixels = vec![255u8; 1024];
        let result = gen_image_code_v0(&pixels, 128).unwrap();
        assert_eq!(result.iscc, "ISCC:EEBYAAAAAAAAAAAAAAAAAAAAAAAAA");
    }

    /// A pixel buffer of 100 values is rejected.
    #[test]
    fn test_gen_image_code_v0_invalid_pixel_count() {
        assert!(gen_image_code_v0(&[0u8; 100], 64).is_err());
    }

    /// Run every `gen_image_code_v0` conformance vector.
    #[test]
    fn test_gen_image_code_v0_conformance() {
        let cases = conformance_cases("gen_image_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let bits = bits_arg(&inputs[1]);
            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

            // Checked conversion: an out-of-range pixel is a broken vector, not data to wrap.
            let pixels: Vec<u8> = inputs[0]
                .as_array()
                .unwrap()
                .iter()
                .map(|v| u8::try_from(v.as_u64().unwrap()).expect("pixel value must fit in u8"))
                .collect();

            let result = gen_image_code_v0(&pixels, bits)
                .unwrap_or_else(|e| panic!("gen_image_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            tested += 1;
        }

        assert_eq!(tested, 3, "expected 3 conformance tests to run");
    }

    /// Audio-Code for an empty feature vector at 64 bits.
    #[test]
    fn test_gen_audio_code_v0_empty() {
        let result = gen_audio_code_v0(&[], 64).unwrap();
        assert_eq!(result.iscc, "ISCC:EIAQAAAAAAAAAAAA");
    }

    /// Audio-Code for a single-element feature vector at 128 bits.
    #[test]
    fn test_gen_audio_code_v0_single() {
        let result = gen_audio_code_v0(&[1], 128).unwrap();
        assert_eq!(result.iscc, "ISCC:EIBQAAAAAEAAAAABAAAAAAAAAAAAA");
    }

    /// Audio-Code for a vector containing a negative value at 256 bits.
    #[test]
    fn test_gen_audio_code_v0_negative() {
        let result = gen_audio_code_v0(&[-1, 0, 1], 256).unwrap();
        assert_eq!(
            result.iscc,
            "ISCC:EIDQAAAAAH777777AAAAAAAAAAAACAAAAAAP777774AAAAAAAAAAAAI"
        );
    }

    /// Run every `gen_audio_code_v0` conformance vector.
    #[test]
    fn test_gen_audio_code_v0_conformance() {
        let cases = conformance_cases("gen_audio_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let cv = i32_vec(&inputs[0]);
            let bits = bits_arg(&inputs[1]);
            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

            let result = gen_audio_code_v0(&cv, bits)
                .unwrap_or_else(|e| panic!("gen_audio_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            tested += 1;
        }

        assert_eq!(tested, 5, "expected 5 conformance tests to run");
    }

    /// Four elements split into four parts give one element per part.
    #[test]
    fn test_array_split_even() {
        let data = vec![1, 2, 3, 4];
        let parts = array_split(&data, 4);
        assert_eq!(parts, vec![&[1][..], &[2][..], &[3][..], &[4][..]]);
    }

    /// Five elements split into three parts: the leading parts take the extra elements.
    #[test]
    fn test_array_split_remainder() {
        let data = vec![1, 2, 3, 4, 5];
        let parts = array_split(&data, 3);
        assert_eq!(parts, vec![&[1, 2][..], &[3, 4][..], &[5][..]]);
    }

    /// Requesting more parts than elements pads the tail with empty slices.
    #[test]
    fn test_array_split_more_parts_than_elements() {
        let data = vec![1, 2];
        let parts = array_split(&data, 4);
        assert_eq!(
            parts,
            vec![&[1][..], &[2][..], &[][..] as &[i32], &[][..] as &[i32]]
        );
    }

    /// Splitting an empty slice returns the requested number of empty slices.
    #[test]
    fn test_array_split_empty() {
        let data: Vec<i32> = vec![];
        let parts = array_split(&data, 3);
        assert_eq!(
            parts,
            vec![&[][..] as &[i32], &[][..] as &[i32], &[][..] as &[i32]]
        );
    }

    /// An empty frame list is rejected as `InvalidInput`.
    #[test]
    fn test_gen_video_code_v0_empty_frames() {
        let frames: Vec<Vec<i32>> = vec![];
        assert!(matches!(
            gen_video_code_v0(&frames, 64),
            Err(IsccError::InvalidInput(_))
        ));
    }

    /// Run every `gen_video_code_v0` conformance vector.
    #[test]
    fn test_gen_video_code_v0_conformance() {
        let cases = conformance_cases("gen_video_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let frames_json = inputs[0].as_array().unwrap();
            let bits = bits_arg(&inputs[1]);
            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

            let frame_sigs: Vec<Vec<i32>> = frames_json.iter().map(i32_vec).collect();

            let result = gen_video_code_v0(&frame_sigs, bits)
                .unwrap_or_else(|e| panic!("gen_video_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            tested += 1;
        }

        assert_eq!(tested, 3, "expected 3 conformance tests to run");
    }

    /// Run every `gen_mixed_code_v0` conformance vector (ISCC and `parts`).
    #[test]
    fn test_gen_mixed_code_v0_conformance() {
        let cases = conformance_cases("gen_mixed_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let codes_json = inputs[0].as_array().unwrap();
            let bits = bits_arg(&inputs[1]);
            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
            let expected_parts: Vec<&str> = tc["outputs"]["parts"]
                .as_array()
                .unwrap()
                .iter()
                .map(|v| v.as_str().unwrap())
                .collect();

            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();

            let result = gen_mixed_code_v0(&codes, bits)
                .unwrap_or_else(|e| panic!("gen_mixed_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            // Verify parts from struct match expected
            let result_parts: Vec<&str> = result.parts.iter().map(|s| s.as_str()).collect();
            assert_eq!(
                result_parts, expected_parts,
                "parts mismatch in test case {tc_name}"
            );

            tested += 1;
        }

        assert_eq!(tested, 2, "expected 2 conformance tests to run");
    }

    /// A single input code is rejected as `InvalidInput`.
    #[test]
    fn test_gen_mixed_code_v0_too_few_codes() {
        assert!(matches!(
            gen_mixed_code_v0(&["EUA6GIKXN42IQV3S"], 64),
            Err(IsccError::InvalidInput(_))
        ));
    }

    /// Build raw Content-Code bytes (header + body) for a given bit length.
    ///
    /// The body is a deterministic byte ramp `0, 1, 2, …`; the component is encoded
    /// to base32 and decoded again so the result includes the real header bytes.
    fn make_content_code_raw(stype: codec::SubType, bit_length: u32) -> Vec<u8> {
        let nbytes = (bit_length / 8) as usize;
        let body: Vec<u8> = (0..nbytes).map(|i| (i & 0xFF) as u8).collect();
        let base32 = codec::encode_component(
            codec::MainType::Content,
            stype,
            codec::Version::V0,
            bit_length,
            &body,
        )
        .unwrap();
        codec::decode_base32(&base32).unwrap()
    }

    /// A code shorter than the requested bit length is rejected with a "too short" message.
    #[test]
    fn test_soft_hash_codes_v0_rejects_short_code() {
        // One code with 64 bits, one with only 32 bits — should reject when requesting 64
        let code_64 = make_content_code_raw(codec::SubType::None, 64);
        let code_32 = make_content_code_raw(codec::SubType::Image, 32);
        let result = soft_hash_codes_v0(&[code_64, code_32], 64);
        assert!(
            matches!(&result, Err(IsccError::InvalidInput(msg)) if msg.contains("too short")),
            "expected InvalidInput with 'too short', got {result:?}"
        );
    }

    /// Codes whose length equals the requested bit length are accepted.
    #[test]
    fn test_soft_hash_codes_v0_accepts_exact_length() {
        // Two codes with exactly 64 bits each — should succeed when requesting 64
        let code_a = make_content_code_raw(codec::SubType::None, 64);
        let code_b = make_content_code_raw(codec::SubType::Image, 64);
        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
        assert!(result.is_ok(), "expected Ok, got {result:?}");
    }

    /// Codes longer than the requested bit length are accepted.
    #[test]
    fn test_soft_hash_codes_v0_accepts_longer_codes() {
        // Two codes with 128 bits each — should succeed when requesting 64
        let code_a = make_content_code_raw(codec::SubType::None, 128);
        let code_b = make_content_code_raw(codec::SubType::Audio, 128);
        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
        assert!(result.is_ok(), "expected Ok, got {result:?}");
    }

    /// Run every `gen_data_code_v0` conformance vector.
    #[test]
    fn test_gen_data_code_v0_conformance() {
        let cases = conformance_cases("gen_data_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            // Input is a "stream:" prefix followed by hex-encoded bytes
            let input_bytes = stream_bytes(tc_name, &inputs[0]);
            let bits = bits_arg(&inputs[1]);
            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

            let result = gen_data_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            tested += 1;
        }

        assert_eq!(tested, 4, "expected 4 conformance tests to run");
    }

    /// Instance-Code for empty input: ISCC, zero filesize, and the datahash.
    #[test]
    fn test_gen_instance_code_v0_empty() {
        let result = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(result.iscc, "ISCC:IAA26E2JXH27TING");
        assert_eq!(result.filesize, 0);
        assert_eq!(
            result.datahash,
            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
        );
    }

    /// Run every `gen_instance_code_v0` conformance vector (ISCC, datahash, filesize).
    ///
    /// Counts executed cases so an empty section cannot pass vacuously.
    #[test]
    fn test_gen_instance_code_v0_conformance() {
        let cases = conformance_cases("gen_instance_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            // Input is a "stream:" prefix followed by hex-encoded bytes
            let input_bytes = stream_bytes(tc_name, &inputs[0]);
            let bits = bits_arg(&inputs[1]);
            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

            let result = gen_instance_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            // Verify datahash from struct
            if let Some(expected_datahash) = tc["outputs"].get("datahash") {
                let expected_datahash = expected_datahash.as_str().unwrap();
                assert_eq!(
                    result.datahash, expected_datahash,
                    "datahash mismatch in test case {tc_name}"
                );
            }

            // Verify filesize from struct
            if let Some(expected_filesize) = tc["outputs"].get("filesize") {
                let expected_filesize = expected_filesize.as_u64().unwrap();
                assert_eq!(
                    result.filesize, expected_filesize,
                    "filesize mismatch in test case {tc_name}"
                );
            }

            // Also verify filesize matches input data length
            assert_eq!(
                result.filesize,
                input_bytes.len() as u64,
                "filesize should match input length in test case {tc_name}"
            );

            tested += 1;
        }

        assert!(
            tested > 0,
            "expected at least one gen_instance_code_v0 conformance test to run"
        );
    }

    /// Run every `gen_iscc_code_v0` conformance vector with the second argument `false`.
    #[test]
    fn test_gen_iscc_code_v0_conformance() {
        let cases = conformance_cases("gen_iscc_code_v0");
        let mut tested = 0;

        for (tc_name, tc) in &cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let codes_json = inputs[0].as_array().unwrap();
            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();

            let result = gen_iscc_code_v0(&codes, false)
                .unwrap_or_else(|e| panic!("gen_iscc_code_v0 failed for {tc_name}: {e}"));
            assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            );

            tested += 1;
        }

        assert_eq!(tested, 5, "expected 5 conformance tests to run");
    }

    /// A single input code is rejected as `InvalidInput`.
    #[test]
    fn test_gen_iscc_code_v0_too_few_codes() {
        assert!(matches!(
            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG"], false),
            Err(IsccError::InvalidInput(_))
        ));
    }

    /// Two Meta codes without Data and Instance codes are rejected as `InvalidInput`.
    #[test]
    fn test_gen_iscc_code_v0_missing_instance() {
        // Two Meta codes — missing Data and Instance
        assert!(matches!(
            gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG", "AAAWKLHFPV6OPKDG"], false),
            Err(IsccError::InvalidInput(_))
        ));
    }

    /// An input code shorter than 16 characters is rejected as `InvalidInput`.
    #[test]
    fn test_gen_iscc_code_v0_short_code() {
        // Code too short (< 16 chars)
        assert!(matches!(
            gen_iscc_code_v0(&["AAAWKLHFPV6", "AAAWKLHFPV6OPKDG"], false),
            Err(IsccError::InvalidInput(_))
        ));
    }

    /// Verify that a Data-URL with empty base64 payload enters the meta bytes path.
    ///
    /// Python reference: `if meta:` is truthy for `"data:application/json;base64,"` (non-empty
    /// string), so it enters the meta branch with `payload = b""`. The result must have
    /// `meta = Some(...)` containing the original Data-URL and `metahash` equal to
    /// `multi_hash_blake3(&[])` (BLAKE3 of empty bytes).
    #[test]
    fn test_gen_meta_code_empty_data_url_enters_meta_branch() {
        let result =
            gen_meta_code_v0("Test", None, Some("data:application/json;base64,"), 64).unwrap();

        // Result should be Ok
        assert_eq!(result.name, "Test");

        // meta should contain the original Data-URL string (not None)
        assert_eq!(
            result.meta,
            Some("data:application/json;base64,".to_string()),
            "empty Data-URL payload should still enter meta branch"
        );

        // metahash should be BLAKE3 of empty bytes
        let expected_metahash = utils::multi_hash_blake3(&[]);
        assert_eq!(
            result.metahash, expected_metahash,
            "metahash should be BLAKE3 of empty bytes"
        );
    }

    /// Verify that `soft_hash_meta_v0_with_bytes` with empty bytes produces the same
    /// digest as `soft_hash_meta_v0` with no extra text.
    ///
    /// Python reference (`code_meta.py:142`): `if extra in {None, "", b""}:` returns
    /// name-only simhash without interleaving for all empty-like values.
    #[test]
    fn test_soft_hash_meta_v0_with_bytes_empty_equals_name_only() {
        let name_only = soft_hash_meta_v0("test", None);
        let empty_bytes = soft_hash_meta_v0_with_bytes("test", &[]);
        assert_eq!(
            name_only, empty_bytes,
            "empty bytes should produce same digest as name-only (no interleaving)"
        );
    }
}