Skip to main content

iscc_lib/
lib.rs

1//! High-performance Rust implementation of ISO 24138:2024 (ISCC).
2//!
3//! This crate provides the core ISCC algorithm implementations. All 10 `gen_*_v0`
4//! functions are the public Tier 1 API surface, designed to be compatible with
5//! the `iscc-core` Python reference implementation.
6
7pub mod cdc;
8pub mod codec;
9pub mod conformance;
10pub(crate) mod dct;
11pub mod minhash;
12pub mod simhash;
13pub mod streaming;
14pub mod types;
15pub mod utils;
16pub(crate) mod wtahash;
17
18pub use cdc::alg_cdc_chunks;
19pub use codec::encode_base64;
20pub use codec::iscc_decompose;
21pub use conformance::conformance_selftest;
22pub use minhash::alg_minhash_256;
23pub use simhash::{alg_simhash, sliding_window};
24pub use streaming::{DataHasher, InstanceHasher};
25pub use types::*;
26#[cfg(feature = "text-processing")]
27pub use utils::{text_clean, text_collapse};
28pub use utils::{text_remove_newlines, text_trim};
29
/// Upper bound, in UTF-8 bytes, that the `name` metadata field is trimmed to.
#[cfg(feature = "meta-code")]
pub const META_TRIM_NAME: usize = 128;

/// Upper bound, in UTF-8 bytes, that the `description` metadata field is trimmed to.
#[cfg(feature = "meta-code")]
pub const META_TRIM_DESCRIPTION: usize = 4 * 1024;

/// Largest accepted size, in bytes, of the decoded `meta` payload.
#[cfg(feature = "meta-code")]
pub const META_TRIM_META: usize = 128 * 1_000;
41
/// Size in bytes (4 MiB) of the buffer used for streaming file reads.
pub const IO_READ_SIZE: usize = 4 * 1024 * 1024;

/// Width, in characters, of the n-grams used as text content features.
pub const TEXT_NGRAM_SIZE: usize = 13;
47
48/// Error type for ISCC operations.
49#[derive(Debug, thiserror::Error)]
50pub enum IsccError {
51    /// Input data is invalid.
52    #[error("invalid input: {0}")]
53    InvalidInput(String),
54}
55
56/// Result type alias for ISCC operations.
57pub type IsccResult<T> = Result<T, IsccError>;
58
59/// Interleave two 32-byte SimHash digests in 4-byte chunks.
60///
61/// Takes the first 16 bytes of each digest and interleaves them into
62/// a 32-byte result: 4 bytes from `a`, 4 bytes from `b`, alternating
63/// for 4 rounds (8 chunks total).
64#[cfg(feature = "meta-code")]
fn interleave_digests(a: &[u8], b: &[u8]) -> Vec<u8> {
    // Only the leading 16 bytes of each digest take part. Slicing up front
    // means inputs shorter than 16 bytes panic here, before any copying.
    let (head_a, head_b) = (&a[..16], &b[..16]);

    let mut mixed = Vec::with_capacity(32);
    // Emit the 4-byte words pairwise: a0 b0 a1 b1 a2 b2 a3 b3.
    for (word_a, word_b) in head_a.chunks_exact(4).zip(head_b.chunks_exact(4)) {
        mixed.extend_from_slice(word_a);
        mixed.extend_from_slice(word_b);
    }
    mixed
}
76
/// Derive the SimHash digest of a name for use in meta hashing.
///
/// The name is passed through `text_collapse`, cut into sliding-window
/// n-grams of width 3, each n-gram is hashed with BLAKE3, and the resulting
/// 32-byte hashes are combined into a single SimHash.
#[cfg(feature = "meta-code")]
fn meta_name_simhash(name: &str) -> Vec<u8> {
    let collapsed = utils::text_collapse(name);
    let ngrams = simhash::sliding_window_strs(&collapsed, 3);

    let mut digests: Vec<[u8; 32]> = Vec::new();
    for ngram in ngrams.iter() {
        digests.push(*blake3::hash(ngram.as_bytes()).as_bytes());
    }
    simhash::alg_simhash_inner(&digests)
}
91
/// Build the similarity-preserving meta digest from a name and optional text.
///
/// The name is always hashed via `meta_name_simhash`. If `extra` is `None`
/// or the empty string, that digest is returned unchanged. Otherwise `extra`
/// is collapsed, split into width-3 n-grams, BLAKE3-hashed and SimHashed,
/// and the two digests are merged with `interleave_digests`.
#[cfg(feature = "meta-code")]
fn soft_hash_meta_v0(name: &str, extra: Option<&str>) -> Vec<u8> {
    let name_digest = meta_name_simhash(name);

    // Nothing to mix in: the name digest is the whole result.
    let extra_text = match extra {
        Some(text) if !text.is_empty() => text,
        _ => return name_digest,
    };

    let collapsed = utils::text_collapse(extra_text);
    let ngrams = simhash::sliding_window_strs(&collapsed, 3);
    let mut hashes: Vec<[u8; 32]> = Vec::new();
    for ngram in ngrams.iter() {
        hashes.push(*blake3::hash(ngram.as_bytes()).as_bytes());
    }
    let extra_digest = simhash::alg_simhash_inner(&hashes);

    interleave_digests(&name_digest, &extra_digest)
}
115
/// Build the similarity-preserving meta digest from a name and raw bytes.
///
/// Byte-oriented sibling of `soft_hash_meta_v0`: the name is hashed the same
/// way, but `extra` is used as-is (no `text_collapse`) and split into
/// width-4 byte windows before BLAKE3 hashing and SimHash. An empty `extra`
/// yields the name digest alone; otherwise both digests are merged with
/// `interleave_digests`.
#[cfg(feature = "meta-code")]
fn soft_hash_meta_v0_with_bytes(name: &str, extra: &[u8]) -> Vec<u8> {
    let name_digest = meta_name_simhash(name);
    if extra.is_empty() {
        return name_digest;
    }

    let windows = simhash::sliding_window_bytes(extra, 4);
    let mut hashes: Vec<[u8; 32]> = Vec::new();
    for window in windows.iter() {
        hashes.push(*blake3::hash(window).as_bytes());
    }
    let bytes_digest = simhash::alg_simhash_inner(&hashes);

    interleave_digests(&name_digest, &bytes_digest)
}
138
/// Extract and decode the base64 payload of a Data-URL.
///
/// Everything after the first `,` is decoded as standard base64. The
/// `"data:"` prefix itself is not inspected here; callers are expected to
/// have checked it.
///
/// # Errors
///
/// Returns `IsccError::InvalidInput` when the string contains no `,` or the
/// part after it is not valid base64.
#[cfg(feature = "meta-code")]
fn decode_data_url(data_url: &str) -> IsccResult<Vec<u8>> {
    match data_url.split_once(',') {
        Some((_header, encoded)) => data_encoding::BASE64
            .decode(encoded.as_bytes())
            .map_err(|e| IsccError::InvalidInput(format!("invalid base64 in Data-URL: {e}"))),
        None => Err(IsccError::InvalidInput(
            "Data-URL missing comma separator".into(),
        )),
    }
}
154
/// Turn a JSON meta string into its RFC 8785 (JCS) canonical byte form.
///
/// # Errors
///
/// Returns `IsccError::InvalidInput` if `meta_str` is not valid JSON or if
/// canonical serialization fails.
#[cfg(feature = "meta-code")]
fn parse_meta_json(meta_str: &str) -> IsccResult<Vec<u8>> {
    let value = match serde_json::from_str::<serde_json::Value>(meta_str) {
        Ok(value) => value,
        Err(e) => {
            return Err(IsccError::InvalidInput(format!(
                "invalid JSON in meta: {e}"
            )))
        }
    };

    let mut canonical = Vec::new();
    match serde_json_canonicalizer::to_writer(&value, &mut canonical) {
        Ok(_) => Ok(canonical),
        Err(e) => Err(IsccError::InvalidInput(format!(
            "JSON canonicalization failed: {e}"
        ))),
    }
}
165
/// Wrap canonical JSON bytes in a base64 Data-URL.
///
/// The media type is `application/ld+json` when `json_value` has an
/// `@context` entry and `application/json` otherwise. `json_bytes` is
/// encoded as standard base64.
#[cfg(feature = "meta-code")]
fn build_meta_data_url(json_bytes: &[u8], json_value: &serde_json::Value) -> String {
    let is_linked_data = json_value.get("@context").is_some();

    let mut url = String::from("data:");
    url.push_str(if is_linked_data {
        "application/ld+json"
    } else {
        "application/json"
    });
    url.push_str(";base64,");
    url.push_str(&data_encoding::BASE64.encode(json_bytes));
    url
}
180
181/// Encode a raw digest into an ISCC unit string.
182///
183/// Takes integer type identifiers (matching `MainType`, `SubType`, `Version` enum values)
184/// and a raw digest, returns a base32-encoded ISCC unit string.
185///
186/// # Errors
187///
188/// Returns `IsccError::InvalidInput` if enum values are out of range, if `mtype` is
189/// `MainType::Iscc` (5), or if `digest.len() < bit_length / 8`.
190pub fn encode_component(
191    mtype: u8,
192    stype: u8,
193    version: u8,
194    bit_length: u32,
195    digest: &[u8],
196) -> IsccResult<String> {
197    let mt = codec::MainType::try_from(mtype)?;
198    let st = codec::SubType::try_from(stype)?;
199    let vs = codec::Version::try_from(version)?;
200    let needed = (bit_length / 8) as usize;
201    if digest.len() < needed {
202        return Err(IsccError::InvalidInput(format!(
203            "digest length {} < bit_length/8 ({})",
204            digest.len(),
205            needed
206        )));
207    }
208    codec::encode_component(mt, st, vs, bit_length, digest)
209}
210
211/// Decode an ISCC unit string into its header components and raw digest.
212///
213/// Inverse of [`encode_component`]. Strips an optional `"ISCC:"` prefix and
214/// dashes, base32-decodes the string, parses the variable-length header, and
215/// returns the digest truncated to exactly the encoded bit-length.
216///
217/// Returns `(maintype, subtype, version, length_index, digest)` where the
218/// integer fields match [`codec::MainType`], [`codec::SubType`], and
219/// [`codec::Version`] enum values.
220///
221/// # Errors
222///
223/// Returns `IsccError::InvalidInput` on invalid base32 input, malformed
224/// header, or if the decoded body is shorter than the expected digest length.
225pub fn iscc_decode(iscc: &str) -> IsccResult<(u8, u8, u8, u8, Vec<u8>)> {
226    // Strip optional "ISCC:" prefix (case-sensitive, matching iscc_decompose)
227    let clean = iscc.strip_prefix("ISCC:").unwrap_or(iscc);
228    // Remove dashes (matching iscc_clean behavior for base32 input)
229    let clean = clean.replace('-', "");
230    let raw = codec::decode_base32(&clean)?;
231    let (mt, st, vs, length_index, tail) = codec::decode_header(&raw)?;
232    let bit_length = codec::decode_length(mt, length_index, st);
233    let nbytes = (bit_length / 8) as usize;
234    if tail.len() < nbytes {
235        return Err(IsccError::InvalidInput(format!(
236            "decoded body too short: expected {nbytes} digest bytes, got {}",
237            tail.len()
238        )));
239    }
240    Ok((
241        mt as u8,
242        st as u8,
243        vs as u8,
244        length_index as u8,
245        tail[..nbytes].to_vec(),
246    ))
247}
248
/// Turn a JSON string into a canonical, base64-encoded `data:` URL.
///
/// The input is parsed, re-serialized in
/// [RFC 8785 (JCS)](https://www.rfc-editor.org/rfc/rfc8785) canonical form,
/// and wrapped in a Data-URL. The media type is `application/ld+json` if the
/// JSON has an `@context` key and `application/json` otherwise.
///
/// Language bindings can use this to accept dict/object meta parameters:
/// serialize to JSON on their side, then let Rust do the encoding.
///
/// # Errors
///
/// Returns [`IsccError::InvalidInput`] if `json` is not valid JSON or if
/// JCS canonicalization fails.
///
/// # Examples
///
/// ```
/// # use iscc_lib::json_to_data_url;
/// let url = json_to_data_url(r#"{"key": "value"}"#).unwrap();
/// assert!(url.starts_with("data:application/json;base64,"));
///
/// let ld_url = json_to_data_url(r#"{"@context": "https://schema.org"}"#).unwrap();
/// assert!(ld_url.starts_with("data:application/ld+json;base64,"));
/// ```
#[cfg(feature = "meta-code")]
pub fn json_to_data_url(json: &str) -> IsccResult<String> {
    let value = serde_json::from_str::<serde_json::Value>(json)
        .map_err(|err| IsccError::InvalidInput(format!("invalid JSON: {err}")))?;

    let mut jcs = Vec::new();
    if let Err(err) = serde_json_canonicalizer::to_writer(&value, &mut jcs) {
        return Err(IsccError::InvalidInput(format!(
            "JSON canonicalization failed: {err}"
        )));
    }

    Ok(build_meta_data_url(&jcs, &value))
}
284
/// Generate a Meta-Code from a name plus optional description or metadata.
///
/// `name` is cleaned, stripped of newlines and trimmed to [`META_TRIM_NAME`]
/// bytes; `description` is cleaned and trimmed to [`META_TRIM_DESCRIPTION`]
/// bytes.
///
/// When `meta` is given it drives the hashing instead of the description:
/// a value starting with `"data:"` is decoded as a base64 Data-URL, anything
/// else is parsed as JSON and canonicalized. The resulting bytes feed both
/// the similarity digest and the metahash, and the result carries the
/// metadata as a Data-URL. Without `meta`, the similarity digest is built
/// from name and description, and the metahash from their space-joined,
/// trimmed text.
///
/// # Errors
///
/// Returns `IsccError::InvalidInput` if the name is empty after
/// normalization, if `meta` exceeds the pre-decode or decoded size limit, or
/// if `meta` cannot be decoded/parsed. Errors from
/// `codec::encode_component` are propagated.
#[cfg(feature = "meta-code")]
pub fn gen_meta_code_v0(
    name: &str,
    description: Option<&str>,
    meta: Option<&str>,
    bits: u32,
) -> IsccResult<MetaCodeResult> {
    // Name pipeline: clean, drop newlines, trim to the byte budget.
    let name = utils::text_trim(
        &utils::text_remove_newlines(&utils::text_clean(name)),
        META_TRIM_NAME,
    );
    if name.is_empty() {
        return Err(IsccError::InvalidInput(
            "name is empty after normalization".into(),
        ));
    }

    // Description pipeline: clean, trim to the byte budget. An absent
    // description is treated like an empty one.
    let description = utils::text_trim(
        &utils::text_clean(description.unwrap_or("")),
        META_TRIM_DESCRIPTION,
    );

    match meta {
        Some(meta_str) => {
            // Cheap guard before decoding: reject obviously oversized input.
            const PRE_DECODE_LIMIT: usize = META_TRIM_META * 4 / 3 + 256;
            if meta_str.len() > PRE_DECODE_LIMIT {
                return Err(IsccError::InvalidInput(format!(
                    "meta string exceeds size limit ({} > {PRE_DECODE_LIMIT} bytes)",
                    meta_str.len()
                )));
            }

            let is_data_url = meta_str.starts_with("data:");
            let payload = if is_data_url {
                decode_data_url(meta_str)?
            } else {
                parse_meta_json(meta_str)?
            };

            // Authoritative limit, applied to the decoded bytes.
            if payload.len() > META_TRIM_META {
                return Err(IsccError::InvalidInput(format!(
                    "decoded meta payload exceeds size limit ({} > {META_TRIM_META} bytes)",
                    payload.len()
                )));
            }

            let digest = soft_hash_meta_v0_with_bytes(&name, &payload);
            let metahash = utils::multi_hash_blake3(&payload);
            let unit = codec::encode_component(
                codec::MainType::Meta,
                codec::SubType::None,
                codec::Version::V0,
                bits,
                &digest,
            )?;

            // A Data-URL input is echoed back verbatim; JSON input is
            // re-parsed to choose the media type for the generated Data-URL.
            let meta_value = if is_data_url {
                meta_str.to_string()
            } else {
                let parsed: serde_json::Value = serde_json::from_str(meta_str)
                    .map_err(|e| IsccError::InvalidInput(format!("invalid JSON: {e}")))?;
                build_meta_data_url(&payload, &parsed)
            };

            Ok(MetaCodeResult {
                iscc: format!("ISCC:{unit}"),
                name,
                description: Some(description).filter(|d| !d.is_empty()),
                meta: Some(meta_value),
                metahash,
            })
        }
        None => {
            let extra = if description.is_empty() {
                None
            } else {
                Some(description.as_str())
            };

            // The metahash covers "name description" (or just the name),
            // with surrounding whitespace removed.
            let hashed_text = match extra {
                Some(desc) => format!("{name} {desc}"),
                None => name.clone(),
            };
            let metahash = utils::multi_hash_blake3(hashed_text.trim().as_bytes());

            let digest = soft_hash_meta_v0(&name, extra);
            let unit = codec::encode_component(
                codec::MainType::Meta,
                codec::SubType::None,
                codec::Version::V0,
                bits,
                &digest,
            )?;

            Ok(MetaCodeResult {
                iscc: format!("ISCC:{unit}"),
                name,
                description: Some(description).filter(|d| !d.is_empty()),
                meta: None,
                metahash,
            })
        }
    }
}
417
/// Derive a similarity-preserving MinHash digest from collapsed text.
///
/// The text is cut into sliding-window n-grams of [`TEXT_NGRAM_SIZE`]
/// width, each n-gram is hashed with xxh32 (seed 0), and the resulting
/// features are reduced by `minhash::alg_minhash_256`.
#[cfg(feature = "text-processing")]
fn soft_hash_text_v0(text: &str) -> Vec<u8> {
    let ngrams = simhash::sliding_window_strs(text, TEXT_NGRAM_SIZE);

    let mut features: Vec<u32> = Vec::new();
    for ngram in ngrams.iter() {
        features.push(xxhash_rust::xxh32::xxh32(ngram.as_bytes(), 0));
    }
    minhash::alg_minhash_256(&features)
}
431
/// Generate a Text-Code from plain text content.
///
/// The input is collapsed with `text_collapse`, hashed with
/// `soft_hash_text_v0`, and encoded as a text Content-Code unit. The result
/// also reports the number of characters in the collapsed text.
///
/// # Errors
///
/// Propagates any error from `codec::encode_component`.
#[cfg(feature = "text-processing")]
pub fn gen_text_code_v0(text: &str, bits: u32) -> IsccResult<TextCodeResult> {
    let collapsed = utils::text_collapse(text);
    let characters = collapsed.chars().count();
    let digest = soft_hash_text_v0(&collapsed);

    codec::encode_component(
        codec::MainType::Content,
        codec::SubType::TEXT,
        codec::Version::V0,
        bits,
        &digest,
    )
    .map(|unit| TextCodeResult {
        iscc: format!("ISCC:{unit}"),
        characters,
    })
}
454
/// Swap rows and columns of a matrix stored as a slice of row vectors.
///
/// The output has as many rows as the first input row has columns. Input
/// rows shorter than the first leave zeros in the output; an empty matrix
/// yields an empty result.
///
/// # Panics
///
/// Panics if any row is longer than the first row.
fn transpose_matrix(matrix: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let width = match matrix.first() {
        Some(first_row) => first_row.len(),
        None => return Vec::new(),
    };
    let height = matrix.len();

    let mut flipped = vec![vec![0.0f64; height]; width];
    for (r, source_row) in matrix.iter().enumerate() {
        source_row
            .iter()
            .enumerate()
            .for_each(|(c, &cell)| flipped[c][r] = cell);
    }
    flipped
}
470
/// Copy an 8×8 window out of a matrix in row-major order.
///
/// The window's top-left corner is `matrix[row][col]`. Up to 8 rows and up
/// to 8 values per row are taken, so the result has 64 values when the
/// matrix is large enough and fewer when the window runs past an edge.
fn flatten_8x8(matrix: &[Vec<f64>], col: usize, row: usize) -> Vec<f64> {
    matrix
        .iter()
        .skip(row)
        .take(8)
        .flat_map(|line| line.iter().skip(col).take(8).copied())
        .collect()
}
484
/// Return the median of `values`.
///
/// The input is copied and sorted. An odd number of values yields the
/// middle one; an even number yields the mean of the two middle values
/// (the same rule as Python's `statistics.median`).
///
/// # Panics
///
/// Panics if `values` is empty or contains a NaN.
fn compute_median(values: &[f64]) -> f64 {
    let mut ordered = values.to_vec();
    ordered.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap());

    let mid = ordered.len() / 2;
    match ordered.len() % 2 {
        1 => ordered[mid],
        _ => (ordered[mid - 1] + ordered[mid]) / 2.0,
    }
}
499
/// Pack booleans into bytes, most significant bit first.
///
/// Every group of 8 bools becomes one byte; a trailing group with fewer
/// than 8 entries fills the high bits and leaves the remaining low bits 0.
fn bits_to_bytes(bits: &[bool]) -> Vec<u8> {
    let mut packed = Vec::with_capacity((bits.len() + 7) / 8);
    for group in bits.chunks(8) {
        let byte = group
            .iter()
            .enumerate()
            .fold(0u8, |acc, (pos, &set)| {
                if set {
                    acc | (0x80u8 >> pos)
                } else {
                    acc
                }
            });
        packed.push(byte);
    }
    packed
}
514
515/// Compute a DCT-based perceptual hash from 32×32 grayscale pixels.
516///
517/// Applies a 2D DCT to the pixel matrix, extracts four 8×8 low-frequency
518/// blocks, and generates a bitstring by comparing each coefficient against
519/// the block median. Returns up to `bits` bits as a byte vector.
520fn soft_hash_image_v0(pixels: &[u8], bits: u32) -> IsccResult<Vec<u8>> {
521    if pixels.len() != 1024 {
522        return Err(IsccError::InvalidInput(format!(
523            "expected 1024 pixels, got {}",
524            pixels.len()
525        )));
526    }
527    if bits > 256 {
528        return Err(IsccError::InvalidInput(format!(
529            "bits must be <= 256, got {bits}"
530        )));
531    }
532
533    // Step 1: Row-wise DCT (32 rows of 32 pixels)
534    let rows: Vec<Vec<f64>> = pixels
535        .chunks(32)
536        .map(|row| {
537            let row_f64: Vec<f64> = row.iter().map(|&p| p as f64).collect();
538            dct::alg_dct(&row_f64)
539        })
540        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
541
542    // Step 2: Transpose
543    let transposed = transpose_matrix(&rows);
544
545    // Step 3: Column-wise DCT
546    let dct_cols: Vec<Vec<f64>> = transposed
547        .iter()
548        .map(|col| dct::alg_dct(col))
549        .collect::<IsccResult<Vec<Vec<f64>>>>()?;
550
551    // Step 4: Transpose back → dct_matrix
552    let dct_matrix = transpose_matrix(&dct_cols);
553
554    // Step 5: Extract 8×8 blocks at positions (0,0), (1,0), (0,1), (1,1)
555    let positions = [(0, 0), (1, 0), (0, 1), (1, 1)];
556    let mut bitstring = Vec::<bool>::with_capacity(256);
557
558    for (col, row) in positions {
559        let flat = flatten_8x8(&dct_matrix, col, row);
560        let median = compute_median(&flat);
561        for val in &flat {
562            bitstring.push(*val > median);
563        }
564        if bitstring.len() >= bits as usize {
565            break;
566        }
567    }
568
569    // Step 6: Convert first `bits` bools to bytes
570    Ok(bits_to_bytes(&bitstring[..bits as usize]))
571}
572
573/// Generate an Image-Code from pixel data.
574///
575/// Produces an ISCC Content-Code for images from a sequence of 1024
576/// grayscale pixel values (32×32, values 0-255) using a DCT-based
577/// perceptual hash.
578pub fn gen_image_code_v0(pixels: &[u8], bits: u32) -> IsccResult<ImageCodeResult> {
579    let hash_digest = soft_hash_image_v0(pixels, bits)?;
580    let component = codec::encode_component(
581        codec::MainType::Content,
582        codec::SubType::Image,
583        codec::Version::V0,
584        bits,
585        &hash_digest,
586    )?;
587    Ok(ImageCodeResult {
588        iscc: format!("ISCC:{component}"),
589    })
590}
591
/// Divide a slice into `n` consecutive parts of near-equal length.
///
/// Mirrors `numpy.array_split` / `more_itertools.divide`: every part has
/// `len / n` elements and the first `len % n` parts get one more. If `n`
/// exceeds the slice length the surplus parts are empty. `n == 0` returns
/// no parts at all.
fn array_split<T>(slice: &[T], n: usize) -> Vec<&[T]> {
    if n == 0 {
        return Vec::new();
    }
    let quotient = slice.len() / n;
    let extra = slice.len() % n;

    let mut parts = Vec::with_capacity(n);
    let mut rest = slice;
    for index in 0..n {
        // The first `extra` parts absorb the remainder, one element each.
        let take = quotient + usize::from(index < extra);
        let (head, tail) = rest.split_at(take);
        parts.push(head);
        rest = tail;
    }
    parts
}
613
614/// Compute a multi-stage SimHash digest from Chromaprint features.
615///
616/// Builds a 32-byte digest by concatenating 4-byte SimHash chunks:
617/// - Stage 1: overall SimHash of all features (4 bytes)
618/// - Stage 2: SimHash of each quarter of features (4 × 4 = 16 bytes)
619/// - Stage 3: SimHash of each third of sorted features (3 × 4 = 12 bytes)
620fn soft_hash_audio_v0(cv: &[i32]) -> Vec<u8> {
621    // Convert each i32 to 4-byte big-endian digest
622    let digests: Vec<[u8; 4]> = cv.iter().map(|&v| v.to_be_bytes()).collect();
623
624    if digests.is_empty() {
625        return vec![0u8; 32];
626    }
627
628    // Stage 1: overall SimHash (4 bytes)
629    let mut parts: Vec<u8> = simhash::alg_simhash_inner(&digests);
630
631    // Stage 2: quarter-based SimHashes (4 × 4 = 16 bytes)
632    let quarters = array_split(&digests, 4);
633    for quarter in &quarters {
634        if quarter.is_empty() {
635            parts.extend_from_slice(&[0u8; 4]);
636        } else {
637            parts.extend_from_slice(&simhash::alg_simhash_inner(quarter));
638        }
639    }
640
641    // Stage 3: sorted-third-based SimHashes (3 × 4 = 12 bytes)
642    let mut sorted_values: Vec<i32> = cv.to_vec();
643    sorted_values.sort();
644    let sorted_digests: Vec<[u8; 4]> = sorted_values.iter().map(|&v| v.to_be_bytes()).collect();
645    let thirds = array_split(&sorted_digests, 3);
646    for third in &thirds {
647        if third.is_empty() {
648            parts.extend_from_slice(&[0u8; 4]);
649        } else {
650            parts.extend_from_slice(&simhash::alg_simhash_inner(third));
651        }
652    }
653
654    parts
655}
656
657/// Generate an Audio-Code from a Chromaprint feature vector.
658///
659/// Produces an ISCC Content-Code for audio from a Chromaprint signed
660/// integer fingerprint vector using multi-stage SimHash.
661pub fn gen_audio_code_v0(cv: &[i32], bits: u32) -> IsccResult<AudioCodeResult> {
662    let hash_digest = soft_hash_audio_v0(cv);
663    let component = codec::encode_component(
664        codec::MainType::Content,
665        codec::SubType::Audio,
666        codec::Version::V0,
667        bits,
668        &hash_digest,
669    )?;
670    Ok(AudioCodeResult {
671        iscc: format!("ISCC:{component}"),
672    })
673}
674
675/// Compute a similarity-preserving hash from video frame signatures.
676///
677/// Deduplicates frame signatures, computes column-wise sums across all
678/// unique frames, then applies WTA-Hash to produce a digest of `bits/8` bytes.
679pub fn soft_hash_video_v0<S: AsRef<[i32]> + Ord>(
680    frame_sigs: &[S],
681    bits: u32,
682) -> IsccResult<Vec<u8>> {
683    if frame_sigs.is_empty() {
684        return Err(IsccError::InvalidInput(
685            "frame_sigs must not be empty".into(),
686        ));
687    }
688
689    // Deduplicate using BTreeSet (S: Ord)
690    let unique: std::collections::BTreeSet<&S> = frame_sigs.iter().collect();
691
692    // Column-wise sum into i64 to avoid overflow
693    let cols = frame_sigs[0].as_ref().len();
694    let mut vecsum = vec![0i64; cols];
695    for sig in &unique {
696        for (c, &val) in sig.as_ref().iter().enumerate() {
697            vecsum[c] += val as i64;
698        }
699    }
700
701    wtahash::alg_wtahash(&vecsum, bits)
702}
703
704/// Generate a Video-Code from frame signature data.
705///
706/// Produces an ISCC Content-Code for video from a sequence of MPEG-7 frame
707/// signatures. Each frame signature is a 380-element integer vector.
708pub fn gen_video_code_v0<S: AsRef<[i32]> + Ord>(
709    frame_sigs: &[S],
710    bits: u32,
711) -> IsccResult<VideoCodeResult> {
712    let digest = soft_hash_video_v0(frame_sigs, bits)?;
713    let component = codec::encode_component(
714        codec::MainType::Content,
715        codec::SubType::Video,
716        codec::Version::V0,
717        bits,
718        &digest,
719    )?;
720    Ok(VideoCodeResult {
721        iscc: format!("ISCC:{component}"),
722    })
723}
724
725/// Combine multiple Content-Code digests into a single similarity hash.
726///
727/// Takes raw decoded ISCC bytes (header + body) for each Content-Code and
728/// produces a SimHash digest. Each input is trimmed to `bits/8` bytes by
729/// keeping the first header byte (encodes type info) plus `nbytes-1` body bytes.
730/// Requires at least 2 codes, all of MainType::Content.
731fn soft_hash_codes_v0(cc_digests: &[Vec<u8>], bits: u32) -> IsccResult<Vec<u8>> {
732    if cc_digests.len() < 2 {
733        return Err(IsccError::InvalidInput(
734            "at least 2 Content-Codes required for mixing".into(),
735        ));
736    }
737
738    let nbytes = (bits / 8) as usize;
739    let mut prepared: Vec<Vec<u8>> = Vec::with_capacity(cc_digests.len());
740
741    for raw in cc_digests {
742        let (mtype, stype, _ver, blen, body) = codec::decode_header(raw)?;
743        if mtype != codec::MainType::Content {
744            return Err(IsccError::InvalidInput(
745                "all codes must be Content-Codes".into(),
746            ));
747        }
748        let unit_bits = codec::decode_length(mtype, blen, stype);
749        if unit_bits < bits {
750            return Err(IsccError::InvalidInput(format!(
751                "Content-Code too short for {bits}-bit length (has {unit_bits} bits)"
752            )));
753        }
754        let mut entry = Vec::with_capacity(nbytes);
755        entry.push(raw[0]); // first byte preserves type info
756        let take = std::cmp::min(nbytes - 1, body.len());
757        entry.extend_from_slice(&body[..take]);
758        // Pad with zeros if body is shorter than nbytes-1
759        while entry.len() < nbytes {
760            entry.push(0);
761        }
762        prepared.push(entry);
763    }
764
765    Ok(simhash::alg_simhash_inner(&prepared))
766}
767
768/// Generate a Mixed-Code from multiple Content-Code strings.
769///
770/// Produces a Mixed Content-Code by combining multiple ISCC Content-Codes
771/// of different types (text, image, audio, video) using SimHash. Input codes
772/// may optionally include the "ISCC:" prefix.
773pub fn gen_mixed_code_v0(codes: &[&str], bits: u32) -> IsccResult<MixedCodeResult> {
774    let decoded: Vec<Vec<u8>> = codes
775        .iter()
776        .map(|code| {
777            let clean = code.strip_prefix("ISCC:").unwrap_or(code);
778            codec::decode_base32(clean)
779        })
780        .collect::<IsccResult<Vec<Vec<u8>>>>()?;
781
782    let digest = soft_hash_codes_v0(&decoded, bits)?;
783
784    let component = codec::encode_component(
785        codec::MainType::Content,
786        codec::SubType::Mixed,
787        codec::Version::V0,
788        bits,
789        &digest,
790    )?;
791
792    Ok(MixedCodeResult {
793        iscc: format!("ISCC:{component}"),
794        parts: codes.iter().map(|s| s.to_string()).collect(),
795    })
796}
797
798/// Generate a Data-Code from raw byte data.
799///
800/// Produces an ISCC Data-Code by splitting data into content-defined chunks,
801/// hashing each chunk with xxh32, and applying MinHash to create a
802/// similarity-preserving fingerprint.
803pub fn gen_data_code_v0(data: &[u8], bits: u32) -> IsccResult<DataCodeResult> {
804    let chunks = cdc::alg_cdc_chunks_unchecked(data, false, cdc::DATA_AVG_CHUNK_SIZE);
805    let mut features: Vec<u32> = chunks
806        .iter()
807        .map(|chunk| xxhash_rust::xxh32::xxh32(chunk, 0))
808        .collect();
809
810    // Defensive: ensure at least one feature (alg_cdc_chunks guarantees >= 1 chunk)
811    if features.is_empty() {
812        features.push(xxhash_rust::xxh32::xxh32(b"", 0));
813    }
814
815    let digest = minhash::alg_minhash_256(&features);
816    let component = codec::encode_component(
817        codec::MainType::Data,
818        codec::SubType::None,
819        codec::Version::V0,
820        bits,
821        &digest,
822    )?;
823
824    Ok(DataCodeResult {
825        iscc: format!("ISCC:{component}"),
826    })
827}
828
829/// Generate an Instance-Code from raw byte data.
830///
831/// Produces an ISCC Instance-Code by hashing the complete byte stream
832/// with BLAKE3. Captures the exact binary identity of the data.
833pub fn gen_instance_code_v0(data: &[u8], bits: u32) -> IsccResult<InstanceCodeResult> {
834    let digest = blake3::hash(data);
835    let datahash = utils::multi_hash_blake3(data);
836    let filesize = data.len() as u64;
837    let component = codec::encode_component(
838        codec::MainType::Instance,
839        codec::SubType::None,
840        codec::Version::V0,
841        bits,
842        digest.as_bytes(),
843    )?;
844    Ok(InstanceCodeResult {
845        iscc: format!("ISCC:{component}"),
846        datahash,
847        filesize,
848    })
849}
850
851/// Generate a composite ISCC-CODE from individual ISCC unit codes.
852///
853/// Combines multiple ISCC unit codes (Meta-Code, Content-Code, Data-Code,
854/// Instance-Code) into a single composite ISCC-CODE. Input codes may
855/// optionally include the "ISCC:" prefix. At least Data-Code and
856/// Instance-Code are required. When `wide` is true and exactly two
857/// 128-bit+ codes (Data + Instance) are provided, produces a 256-bit
858/// wide-mode code.
859pub fn gen_iscc_code_v0(codes: &[&str], wide: bool) -> IsccResult<IsccCodeResult> {
860    // Step 1: Clean inputs — strip "ISCC:" prefix
861    let cleaned: Vec<&str> = codes
862        .iter()
863        .map(|c| c.strip_prefix("ISCC:").unwrap_or(c))
864        .collect();
865
866    // Step 2: Validate minimum count
867    if cleaned.len() < 2 {
868        return Err(IsccError::InvalidInput(
869            "at least 2 ISCC unit codes required".into(),
870        ));
871    }
872
873    // Step 3: Validate minimum length (16 base32 chars = 64-bit minimum)
874    for code in &cleaned {
875        if code.len() < 16 {
876            return Err(IsccError::InvalidInput(format!(
877                "ISCC unit code too short (min 16 chars): {code}"
878            )));
879        }
880    }
881
882    // Step 4: Decode each code
883    let mut decoded: Vec<(
884        codec::MainType,
885        codec::SubType,
886        codec::Version,
887        u32,
888        Vec<u8>,
889    )> = Vec::with_capacity(cleaned.len());
890    for code in &cleaned {
891        let raw = codec::decode_base32(code)?;
892        let header = codec::decode_header(&raw)?;
893        decoded.push(header);
894    }
895
896    // Step 5: Sort by MainType (ascending)
897    decoded.sort_by_key(|&(mt, ..)| mt);
898
899    // Step 6: Extract main_types
900    let main_types: Vec<codec::MainType> = decoded.iter().map(|&(mt, ..)| mt).collect();
901
902    // Step 7: Validate last two are Data + Instance (mandatory)
903    let n = main_types.len();
904    if main_types[n - 2] != codec::MainType::Data || main_types[n - 1] != codec::MainType::Instance
905    {
906        return Err(IsccError::InvalidInput(
907            "Data-Code and Instance-Code are mandatory".into(),
908        ));
909    }
910
911    // Step 8: Determine wide composite
912    let is_wide = wide
913        && decoded.len() == 2
914        && main_types == [codec::MainType::Data, codec::MainType::Instance]
915        && decoded
916            .iter()
917            .all(|&(mt, st, _, len, _)| codec::decode_length(mt, len, st) >= 128);
918
919    // Step 9: Determine SubType
920    let st = if is_wide {
921        codec::SubType::Wide
922    } else {
923        // Collect SubTypes of Semantic/Content units
924        let sc_subtypes: Vec<codec::SubType> = decoded
925            .iter()
926            .filter(|&&(mt, ..)| mt == codec::MainType::Semantic || mt == codec::MainType::Content)
927            .map(|&(_, st, ..)| st)
928            .collect();
929
930        if !sc_subtypes.is_empty() {
931            // All must be the same
932            let first = sc_subtypes[0];
933            if sc_subtypes.iter().all(|&s| s == first) {
934                first
935            } else {
936                return Err(IsccError::InvalidInput(
937                    "mixed SubTypes among Content/Semantic units".into(),
938                ));
939            }
940        } else if decoded.len() == 2 {
941            codec::SubType::Sum
942        } else {
943            codec::SubType::IsccNone
944        }
945    };
946
947    // Step 10–11: Get optional MainTypes and encode
948    let optional_types = &main_types[..n - 2];
949    let encoded_length = codec::encode_units(optional_types)?;
950
951    // Step 12: Build digest body
952    let bytes_per_unit = if is_wide { 16 } else { 8 };
953    let mut digest = Vec::with_capacity(decoded.len() * bytes_per_unit);
954    for (_, _, _, _, tail) in &decoded {
955        let take = bytes_per_unit.min(tail.len());
956        digest.extend_from_slice(&tail[..take]);
957    }
958
959    // Step 13–14: Encode header + digest as base32
960    let header = codec::encode_header(
961        codec::MainType::Iscc,
962        st,
963        codec::Version::V0,
964        encoded_length,
965    )?;
966    let mut code_bytes = header;
967    code_bytes.extend_from_slice(&digest);
968    let code = codec::encode_base32(&code_bytes);
969
970    // Step 15: Return with prefix
971    Ok(IsccCodeResult {
972        iscc: format!("ISCC:{code}"),
973    })
974}
975
976/// Generate a composite ISCC-CODE from a file in a single pass.
977///
978/// Opens the file at `path`, reads it with an optimal buffer size, and feeds
979/// both `DataHasher` (CDC/MinHash) and `InstanceHasher` (BLAKE3) from the
980/// same read buffer. Composes the final ISCC-CODE from the Data-Code and
981/// Instance-Code internally. This avoids multiple passes over the file and
982/// eliminates per-chunk FFI overhead in language bindings.
983///
984/// When `add_units` is `true`, the result includes the individual Data-Code
985/// and Instance-Code ISCC strings at the requested `bits` precision.
986pub fn gen_sum_code_v0(
987    path: &std::path::Path,
988    bits: u32,
989    wide: bool,
990    add_units: bool,
991) -> IsccResult<SumCodeResult> {
992    use std::io::Read;
993
994    let mut file = std::fs::File::open(path)
995        .map_err(|e| IsccError::InvalidInput(format!("Cannot open file: {e}")))?;
996
997    let mut data_hasher = streaming::DataHasher::new();
998    let mut instance_hasher = streaming::InstanceHasher::new();
999
1000    let mut buf = vec![0u8; IO_READ_SIZE];
1001    loop {
1002        let n = file
1003            .read(&mut buf)
1004            .map_err(|e| IsccError::InvalidInput(format!("Cannot read file: {e}")))?;
1005        if n == 0 {
1006            break;
1007        }
1008        data_hasher.update(&buf[..n]);
1009        instance_hasher.update(&buf[..n]);
1010    }
1011
1012    let data_result = data_hasher.finalize(bits)?;
1013    let instance_result = instance_hasher.finalize(bits)?;
1014
1015    // Borrow strings for gen_iscc_code_v0 before potentially moving them into units.
1016    let iscc_result = gen_iscc_code_v0(&[&data_result.iscc, &instance_result.iscc], wide)?;
1017
1018    let units = if add_units {
1019        Some(vec![data_result.iscc, instance_result.iscc])
1020    } else {
1021        None
1022    };
1023
1024    Ok(SumCodeResult {
1025        iscc: iscc_result.iscc,
1026        datahash: instance_result.datahash,
1027        filesize: instance_result.filesize,
1028        units,
1029    })
1030}
1031
1032#[cfg(test)]
1033mod tests {
1034    use super::*;
1035
1036    #[cfg(feature = "meta-code")]
1037    #[test]
1038    fn test_gen_meta_code_v0_title_only() {
1039        let result = gen_meta_code_v0("Die Unendliche Geschichte", None, None, 64).unwrap();
1040        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU74YAZIM");
1041        assert_eq!(result.name, "Die Unendliche Geschichte");
1042        assert_eq!(result.description, None);
1043        assert_eq!(result.meta, None);
1044    }
1045
1046    #[cfg(feature = "meta-code")]
1047    #[test]
1048    fn test_gen_meta_code_v0_title_description() {
1049        let result = gen_meta_code_v0(
1050            "Die Unendliche Geschichte",
1051            Some("Von Michael Ende"),
1052            None,
1053            64,
1054        )
1055        .unwrap();
1056        assert_eq!(result.iscc, "ISCC:AAAZXZ6OU4E45RB5");
1057        assert_eq!(result.name, "Die Unendliche Geschichte");
1058        assert_eq!(result.description, Some("Von Michael Ende".to_string()));
1059        assert_eq!(result.meta, None);
1060    }
1061
1062    #[cfg(feature = "meta-code")]
1063    #[test]
1064    fn test_gen_meta_code_v0_json_meta() {
1065        let result = gen_meta_code_v0("Hello", None, Some(r#"{"some":"object"}"#), 64).unwrap();
1066        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN63LHL2");
1067        assert!(result.meta.is_some());
1068        assert!(
1069            result
1070                .meta
1071                .unwrap()
1072                .starts_with("data:application/json;base64,")
1073        );
1074    }
1075
1076    #[cfg(feature = "meta-code")]
1077    #[test]
1078    fn test_gen_meta_code_v0_data_url_meta() {
1079        let result = gen_meta_code_v0(
1080            "Hello",
1081            None,
1082            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9"),
1083            64,
1084        )
1085        .unwrap();
1086        assert_eq!(result.iscc, "ISCC:AAAWKLHFXN43ICP2");
1087        // Data-URL is passed through as-is
1088        assert_eq!(
1089            result.meta,
1090            Some("data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9".to_string())
1091        );
1092    }
1093
1094    /// Verify that JSON metadata with float values is canonicalized per RFC 8785 (JCS).
1095    ///
1096    /// JCS serializes `1.0` as `1` (integer form), while `serde_json` preserves `1.0`.
1097    /// This causes different canonical bytes, different metahash, and different ISCC codes.
1098    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1.0})`.
1099    #[cfg(feature = "meta-code")]
1100    #[test]
1101    fn test_gen_meta_code_v0_jcs_float_canonicalization() {
1102        // JCS canonicalizes {"value": 1.0} → {"value":1} (integer form)
1103        // serde_json produces {"value":1.0} (preserves float notation)
1104        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1.0}"#), 64).unwrap();
1105
1106        // Expected values from iscc-core (Python) using jcs.canonicalize()
1107        assert_eq!(
1108            result.iscc, "ISCC:AAAX4GX3RZH2I6QZ",
1109            "ISCC mismatch: parse_meta_json must use RFC 8785 (JCS) canonicalization"
1110        );
1111        assert_eq!(
1112            result.meta,
1113            Some("data:application/json;base64,eyJ2YWx1ZSI6MX0=".to_string()),
1114            "meta Data-URL mismatch: JCS should serialize 1.0 as 1"
1115        );
1116        assert_eq!(
1117            result.metahash, "1e2010b291d392b6999ffe4aa4661fb343fc371fca3bfb5bb4e8d8226fdf85743232",
1118            "metahash mismatch: canonical bytes differ between JCS and serde_json"
1119        );
1120    }
1121
1122    /// Verify JCS number formatting for large floats (scientific notation edge case).
1123    ///
1124    /// JCS serializes `1e20` as `100000000000000000000` (expanded integer form).
1125    /// Expected values generated by `iscc-core` with `jcs.canonicalize({"value": 1e20})`.
1126    #[cfg(feature = "meta-code")]
1127    #[test]
1128    fn test_gen_meta_code_v0_jcs_large_float_canonicalization() {
1129        let result = gen_meta_code_v0("Test", None, Some(r#"{"value":1e20}"#), 64).unwrap();
1130
1131        assert_eq!(
1132            result.iscc, "ISCC:AAAX4GX3R32YH5P7",
1133            "ISCC mismatch: JCS should expand 1e20 to 100000000000000000000"
1134        );
1135        assert_eq!(
1136            result.meta,
1137            Some(
1138                "data:application/json;base64,eyJ2YWx1ZSI6MTAwMDAwMDAwMDAwMDAwMDAwMDAwfQ=="
1139                    .to_string()
1140            ),
1141            "meta Data-URL mismatch: JCS should expand large float to integer form"
1142        );
1143        assert_eq!(
1144            result.metahash, "1e201ff83c1822c348717658a0b4713739646da7c59832691b337a457416ddd1c73d",
1145            "metahash mismatch: canonical bytes differ for large float"
1146        );
1147    }
1148
1149    #[cfg(feature = "meta-code")]
1150    #[test]
1151    fn test_gen_meta_code_v0_invalid_json() {
1152        assert!(matches!(
1153            gen_meta_code_v0("test", None, Some("not json"), 64),
1154            Err(IsccError::InvalidInput(_))
1155        ));
1156    }
1157
1158    #[cfg(feature = "meta-code")]
1159    #[test]
1160    fn test_gen_meta_code_v0_invalid_data_url() {
1161        assert!(matches!(
1162            gen_meta_code_v0("test", None, Some("data:no-comma-here"), 64),
1163            Err(IsccError::InvalidInput(_))
1164        ));
1165    }
1166
1167    #[cfg(feature = "meta-code")]
1168    #[test]
1169    fn test_gen_meta_code_v0_conformance() {
1170        let json_str = include_str!("../tests/data.json");
1171        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1172        let section = &data["gen_meta_code_v0"];
1173        let cases = section.as_object().unwrap();
1174
1175        let mut tested = 0;
1176
1177        for (tc_name, tc) in cases {
1178            let inputs = tc["inputs"].as_array().unwrap();
1179            let input_name = inputs[0].as_str().unwrap();
1180            let input_desc = inputs[1].as_str().unwrap();
1181            let meta_val = &inputs[2];
1182            let bits = inputs[3].as_u64().unwrap() as u32;
1183
1184            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1185            let expected_metahash = tc["outputs"]["metahash"].as_str().unwrap();
1186
1187            // Dispatch meta parameter based on JSON value type
1188            let meta_arg: Option<String> = match meta_val {
1189                serde_json::Value::Null => None,
1190                serde_json::Value::String(s) => Some(s.clone()),
1191                serde_json::Value::Object(_) => Some(serde_json::to_string(meta_val).unwrap()),
1192                other => panic!("unexpected meta type in {tc_name}: {other:?}"),
1193            };
1194
1195            let desc = if input_desc.is_empty() {
1196                None
1197            } else {
1198                Some(input_desc)
1199            };
1200
1201            // Verify ISCC output from struct
1202            let result = gen_meta_code_v0(input_name, desc, meta_arg.as_deref(), bits)
1203                .unwrap_or_else(|e| panic!("gen_meta_code_v0 failed for {tc_name}: {e}"));
1204            assert_eq!(
1205                result.iscc, expected_iscc,
1206                "ISCC mismatch in test case {tc_name}"
1207            );
1208
1209            // Verify metahash from struct
1210            assert_eq!(
1211                result.metahash, expected_metahash,
1212                "metahash mismatch in test case {tc_name}"
1213            );
1214
1215            // Verify name from struct
1216            if let Some(expected_name) = tc["outputs"].get("name") {
1217                let expected_name = expected_name.as_str().unwrap();
1218                assert_eq!(
1219                    result.name, expected_name,
1220                    "name mismatch in test case {tc_name}"
1221                );
1222            }
1223
1224            // Verify description from struct
1225            if let Some(expected_desc) = tc["outputs"].get("description") {
1226                let expected_desc = expected_desc.as_str().unwrap();
1227                assert_eq!(
1228                    result.description.as_deref(),
1229                    Some(expected_desc),
1230                    "description mismatch in test case {tc_name}"
1231                );
1232            }
1233
1234            // Verify meta from struct
1235            if meta_arg.is_some() {
1236                assert!(
1237                    result.meta.is_some(),
1238                    "meta should be present in test case {tc_name}"
1239                );
1240            } else {
1241                assert!(
1242                    result.meta.is_none(),
1243                    "meta should be absent in test case {tc_name}"
1244                );
1245            }
1246
1247            tested += 1;
1248        }
1249
1250        assert_eq!(tested, 20, "expected 20 conformance tests to run");
1251    }
1252
1253    #[cfg(feature = "text-processing")]
1254    #[test]
1255    fn test_gen_text_code_v0_empty() {
1256        let result = gen_text_code_v0("", 64).unwrap();
1257        assert_eq!(result.iscc, "ISCC:EAASL4F2WZY7KBXB");
1258        assert_eq!(result.characters, 0);
1259    }
1260
1261    #[cfg(feature = "text-processing")]
1262    #[test]
1263    fn test_gen_text_code_v0_hello_world() {
1264        let result = gen_text_code_v0("Hello World", 64).unwrap();
1265        assert_eq!(result.iscc, "ISCC:EAASKDNZNYGUUF5A");
1266        assert_eq!(result.characters, 10); // "helloworld" after collapse
1267    }
1268
1269    #[cfg(feature = "text-processing")]
1270    #[test]
1271    fn test_gen_text_code_v0_conformance() {
1272        let json_str = include_str!("../tests/data.json");
1273        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1274        let section = &data["gen_text_code_v0"];
1275        let cases = section.as_object().unwrap();
1276
1277        let mut tested = 0;
1278
1279        for (tc_name, tc) in cases {
1280            let inputs = tc["inputs"].as_array().unwrap();
1281            let input_text = inputs[0].as_str().unwrap();
1282            let bits = inputs[1].as_u64().unwrap() as u32;
1283
1284            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1285            let expected_chars = tc["outputs"]["characters"].as_u64().unwrap() as usize;
1286
1287            // Verify ISCC output from struct
1288            let result = gen_text_code_v0(input_text, bits)
1289                .unwrap_or_else(|e| panic!("gen_text_code_v0 failed for {tc_name}: {e}"));
1290            assert_eq!(
1291                result.iscc, expected_iscc,
1292                "ISCC mismatch in test case {tc_name}"
1293            );
1294
1295            // Verify character count from struct
1296            assert_eq!(
1297                result.characters, expected_chars,
1298                "character count mismatch in test case {tc_name}"
1299            );
1300
1301            tested += 1;
1302        }
1303
1304        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1305    }
1306
1307    #[test]
1308    fn test_gen_image_code_v0_all_black() {
1309        let pixels = vec![0u8; 1024];
1310        let result = gen_image_code_v0(&pixels, 64).unwrap();
1311        assert_eq!(result.iscc, "ISCC:EEAQAAAAAAAAAAAA");
1312    }
1313
1314    #[test]
1315    fn test_gen_image_code_v0_all_white() {
1316        let pixels = vec![255u8; 1024];
1317        let result = gen_image_code_v0(&pixels, 128).unwrap();
1318        assert_eq!(result.iscc, "ISCC:EEBYAAAAAAAAAAAAAAAAAAAAAAAAA");
1319    }
1320
1321    #[test]
1322    fn test_gen_image_code_v0_invalid_pixel_count() {
1323        assert!(gen_image_code_v0(&[0u8; 100], 64).is_err());
1324    }
1325
1326    #[test]
1327    fn test_gen_image_code_v0_conformance() {
1328        let json_str = include_str!("../tests/data.json");
1329        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1330        let section = &data["gen_image_code_v0"];
1331        let cases = section.as_object().unwrap();
1332
1333        let mut tested = 0;
1334
1335        for (tc_name, tc) in cases {
1336            let inputs = tc["inputs"].as_array().unwrap();
1337            let pixels_json = inputs[0].as_array().unwrap();
1338            let bits = inputs[1].as_u64().unwrap() as u32;
1339            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1340
1341            let pixels: Vec<u8> = pixels_json
1342                .iter()
1343                .map(|v| v.as_u64().unwrap() as u8)
1344                .collect();
1345
1346            let result = gen_image_code_v0(&pixels, bits)
1347                .unwrap_or_else(|e| panic!("gen_image_code_v0 failed for {tc_name}: {e}"));
1348            assert_eq!(
1349                result.iscc, expected_iscc,
1350                "ISCC mismatch in test case {tc_name}"
1351            );
1352
1353            tested += 1;
1354        }
1355
1356        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1357    }
1358
1359    #[test]
1360    fn test_gen_audio_code_v0_empty() {
1361        let result = gen_audio_code_v0(&[], 64).unwrap();
1362        assert_eq!(result.iscc, "ISCC:EIAQAAAAAAAAAAAA");
1363    }
1364
1365    #[test]
1366    fn test_gen_audio_code_v0_single() {
1367        let result = gen_audio_code_v0(&[1], 128).unwrap();
1368        assert_eq!(result.iscc, "ISCC:EIBQAAAAAEAAAAABAAAAAAAAAAAAA");
1369    }
1370
1371    #[test]
1372    fn test_gen_audio_code_v0_negative() {
1373        let result = gen_audio_code_v0(&[-1, 0, 1], 256).unwrap();
1374        assert_eq!(
1375            result.iscc,
1376            "ISCC:EIDQAAAAAH777777AAAAAAAAAAAACAAAAAAP777774AAAAAAAAAAAAI"
1377        );
1378    }
1379
1380    #[test]
1381    fn test_gen_audio_code_v0_conformance() {
1382        let json_str = include_str!("../tests/data.json");
1383        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1384        let section = &data["gen_audio_code_v0"];
1385        let cases = section.as_object().unwrap();
1386
1387        let mut tested = 0;
1388
1389        for (tc_name, tc) in cases {
1390            let inputs = tc["inputs"].as_array().unwrap();
1391            let cv_json = inputs[0].as_array().unwrap();
1392            let bits = inputs[1].as_u64().unwrap() as u32;
1393            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1394
1395            let cv: Vec<i32> = cv_json.iter().map(|v| v.as_i64().unwrap() as i32).collect();
1396
1397            let result = gen_audio_code_v0(&cv, bits)
1398                .unwrap_or_else(|e| panic!("gen_audio_code_v0 failed for {tc_name}: {e}"));
1399            assert_eq!(
1400                result.iscc, expected_iscc,
1401                "ISCC mismatch in test case {tc_name}"
1402            );
1403
1404            tested += 1;
1405        }
1406
1407        assert_eq!(tested, 5, "expected 5 conformance tests to run");
1408    }
1409
1410    #[test]
1411    fn test_array_split_even() {
1412        let data = vec![1, 2, 3, 4];
1413        let parts = array_split(&data, 4);
1414        assert_eq!(parts, vec![&[1][..], &[2][..], &[3][..], &[4][..]]);
1415    }
1416
1417    #[test]
1418    fn test_array_split_remainder() {
1419        let data = vec![1, 2, 3, 4, 5];
1420        let parts = array_split(&data, 3);
1421        assert_eq!(parts, vec![&[1, 2][..], &[3, 4][..], &[5][..]]);
1422    }
1423
1424    #[test]
1425    fn test_array_split_more_parts_than_elements() {
1426        let data = vec![1, 2];
1427        let parts = array_split(&data, 4);
1428        assert_eq!(
1429            parts,
1430            vec![&[1][..], &[2][..], &[][..] as &[i32], &[][..] as &[i32]]
1431        );
1432    }
1433
1434    #[test]
1435    fn test_array_split_empty() {
1436        let data: Vec<i32> = vec![];
1437        let parts = array_split(&data, 3);
1438        assert_eq!(
1439            parts,
1440            vec![&[][..] as &[i32], &[][..] as &[i32], &[][..] as &[i32]]
1441        );
1442    }
1443
1444    #[test]
1445    fn test_gen_video_code_v0_empty_frames() {
1446        let frames: Vec<Vec<i32>> = vec![];
1447        assert!(matches!(
1448            gen_video_code_v0(&frames, 64),
1449            Err(IsccError::InvalidInput(_))
1450        ));
1451    }
1452
1453    #[test]
1454    fn test_gen_video_code_v0_conformance() {
1455        let json_str = include_str!("../tests/data.json");
1456        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1457        let section = &data["gen_video_code_v0"];
1458        let cases = section.as_object().unwrap();
1459
1460        let mut tested = 0;
1461
1462        for (tc_name, tc) in cases {
1463            let inputs = tc["inputs"].as_array().unwrap();
1464            let frames_json = inputs[0].as_array().unwrap();
1465            let bits = inputs[1].as_u64().unwrap() as u32;
1466            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1467
1468            let frame_sigs: Vec<Vec<i32>> = frames_json
1469                .iter()
1470                .map(|frame| {
1471                    frame
1472                        .as_array()
1473                        .unwrap()
1474                        .iter()
1475                        .map(|v| v.as_i64().unwrap() as i32)
1476                        .collect()
1477                })
1478                .collect();
1479
1480            let result = gen_video_code_v0(&frame_sigs, bits)
1481                .unwrap_or_else(|e| panic!("gen_video_code_v0 failed for {tc_name}: {e}"));
1482            assert_eq!(
1483                result.iscc, expected_iscc,
1484                "ISCC mismatch in test case {tc_name}"
1485            );
1486
1487            tested += 1;
1488        }
1489
1490        assert_eq!(tested, 3, "expected 3 conformance tests to run");
1491    }
1492
1493    #[test]
1494    fn test_gen_mixed_code_v0_conformance() {
1495        let json_str = include_str!("../tests/data.json");
1496        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1497        let section = &data["gen_mixed_code_v0"];
1498        let cases = section.as_object().unwrap();
1499
1500        let mut tested = 0;
1501
1502        for (tc_name, tc) in cases {
1503            let inputs = tc["inputs"].as_array().unwrap();
1504            let codes_json = inputs[0].as_array().unwrap();
1505            let bits = inputs[1].as_u64().unwrap() as u32;
1506            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1507            let expected_parts: Vec<&str> = tc["outputs"]["parts"]
1508                .as_array()
1509                .unwrap()
1510                .iter()
1511                .map(|v| v.as_str().unwrap())
1512                .collect();
1513
1514            let codes: Vec<&str> = codes_json.iter().map(|v| v.as_str().unwrap()).collect();
1515
1516            let result = gen_mixed_code_v0(&codes, bits)
1517                .unwrap_or_else(|e| panic!("gen_mixed_code_v0 failed for {tc_name}: {e}"));
1518            assert_eq!(
1519                result.iscc, expected_iscc,
1520                "ISCC mismatch in test case {tc_name}"
1521            );
1522
1523            // Verify parts from struct match expected
1524            let result_parts: Vec<&str> = result.parts.iter().map(|s| s.as_str()).collect();
1525            assert_eq!(
1526                result_parts, expected_parts,
1527                "parts mismatch in test case {tc_name}"
1528            );
1529
1530            tested += 1;
1531        }
1532
1533        assert_eq!(tested, 2, "expected 2 conformance tests to run");
1534    }
1535
1536    #[test]
1537    fn test_gen_mixed_code_v0_too_few_codes() {
1538        assert!(matches!(
1539            gen_mixed_code_v0(&["EUA6GIKXN42IQV3S"], 64),
1540            Err(IsccError::InvalidInput(_))
1541        ));
1542    }
1543
1544    /// Build raw Content-Code bytes (header + body) for a given bit length.
1545    fn make_content_code_raw(stype: codec::SubType, bit_length: u32) -> Vec<u8> {
1546        let nbytes = (bit_length / 8) as usize;
1547        let body: Vec<u8> = (0..nbytes).map(|i| (i & 0xFF) as u8).collect();
1548        let base32 = codec::encode_component(
1549            codec::MainType::Content,
1550            stype,
1551            codec::Version::V0,
1552            bit_length,
1553            &body,
1554        )
1555        .unwrap();
1556        codec::decode_base32(&base32).unwrap()
1557    }
1558
1559    #[test]
1560    fn test_soft_hash_codes_v0_rejects_short_code() {
1561        // One code with 64 bits, one with only 32 bits — should reject when requesting 64
1562        let code_64 = make_content_code_raw(codec::SubType::None, 64);
1563        let code_32 = make_content_code_raw(codec::SubType::Image, 32);
1564        let result = soft_hash_codes_v0(&[code_64, code_32], 64);
1565        assert!(
1566            matches!(&result, Err(IsccError::InvalidInput(msg)) if msg.contains("too short")),
1567            "expected InvalidInput with 'too short', got {result:?}"
1568        );
1569    }
1570
1571    #[test]
1572    fn test_soft_hash_codes_v0_accepts_exact_length() {
1573        // Two codes with exactly 64 bits each — should succeed when requesting 64
1574        let code_a = make_content_code_raw(codec::SubType::None, 64);
1575        let code_b = make_content_code_raw(codec::SubType::Image, 64);
1576        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1577        assert!(result.is_ok(), "expected Ok, got {result:?}");
1578    }
1579
1580    #[test]
1581    fn test_soft_hash_codes_v0_accepts_longer_codes() {
1582        // Two codes with 128 bits each — should succeed when requesting 64
1583        let code_a = make_content_code_raw(codec::SubType::None, 128);
1584        let code_b = make_content_code_raw(codec::SubType::Audio, 128);
1585        let result = soft_hash_codes_v0(&[code_a, code_b], 64);
1586        assert!(result.is_ok(), "expected Ok, got {result:?}");
1587    }
1588
1589    #[test]
1590    fn test_gen_data_code_v0_conformance() {
1591        let json_str = include_str!("../tests/data.json");
1592        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
1593        let section = &data["gen_data_code_v0"];
1594        let cases = section.as_object().unwrap();
1595
1596        let mut tested = 0;
1597
1598        for (tc_name, tc) in cases {
1599            let inputs = tc["inputs"].as_array().unwrap();
1600            let stream_str = inputs[0].as_str().unwrap();
1601            let bits = inputs[1].as_u64().unwrap() as u32;
1602            let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();
1603
1604            // Parse "stream:" prefix — remainder is hex-encoded bytes
1605            let hex_data = stream_str
1606                .strip_prefix("stream:")
1607                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {tc_name}"));
1608            let input_bytes = hex::decode(hex_data)
1609                .unwrap_or_else(|e| panic!("invalid hex in test case {tc_name}: {e}"));
1610
1611            let result = gen_data_code_v0(&input_bytes, bits)
1612                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {tc_name}: {e}"));
1613            assert_eq!(
1614                result.iscc, expected_iscc,
1615                "ISCC mismatch in test case {tc_name}"
1616            );
1617
1618            tested += 1;
1619        }
1620
1621        assert_eq!(tested, 4, "expected 4 conformance tests to run");
1622    }
1623
1624    #[test]
1625    fn test_gen_instance_code_v0_empty() {
1626        let result = gen_instance_code_v0(b"", 64).unwrap();
1627        assert_eq!(result.iscc, "ISCC:IAA26E2JXH27TING");
1628        assert_eq!(result.filesize, 0);
1629        assert_eq!(
1630            result.datahash,
1631            "1e20af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262"
1632        );
1633    }
1634
/// Run every `gen_instance_code_v0` conformance vector from `tests/data.json`.
///
/// For each vector the first input is a `stream:`-prefixed hex string and the second
/// is the bit length. The expected `iscc` is always compared; `datahash` and `filesize`
/// are compared when the vector provides them, and `filesize` must additionally equal
/// the length of the decoded input.
#[test]
fn test_gen_instance_code_v0_conformance() {
    let json_str = include_str!("../tests/data.json");
    let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
    let section = &data["gen_instance_code_v0"];
    let cases = section.as_object().unwrap();

    // An empty section would otherwise let this test pass without checking anything.
    assert!(
        !cases.is_empty(),
        "expected at least one conformance test to run"
    );

    for (name, tc) in cases {
        let inputs = tc["inputs"].as_array().unwrap();
        let stream_str = inputs[0].as_str().unwrap();
        // Checked conversion: a bogus bit length must fail loudly instead of wrapping.
        let bits = u32::try_from(inputs[1].as_u64().unwrap())
            .unwrap_or_else(|e| panic!("bit length out of range in test case {name}: {e}"));
        let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

        // Parse "stream:" prefix — remainder is hex-encoded bytes
        let hex_data = stream_str
            .strip_prefix("stream:")
            .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
        let input_bytes = hex::decode(hex_data)
            .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));

        let result = gen_instance_code_v0(&input_bytes, bits)
            .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));
        assert_eq!(
            result.iscc, expected_iscc,
            "ISCC mismatch in test case {name}"
        );

        // Verify datahash from struct
        if let Some(expected_datahash) = tc["outputs"].get("datahash") {
            let expected_datahash = expected_datahash.as_str().unwrap();
            assert_eq!(
                result.datahash, expected_datahash,
                "datahash mismatch in test case {name}"
            );
        }

        // Verify filesize from struct
        if let Some(expected_filesize) = tc["outputs"].get("filesize") {
            let expected_filesize = expected_filesize.as_u64().unwrap();
            assert_eq!(
                result.filesize, expected_filesize,
                "filesize mismatch in test case {name}"
            );
        }

        // Also verify filesize matches input data length
        assert_eq!(
            result.filesize,
            input_bytes.len() as u64,
            "filesize should match input length in test case {name}"
        );
    }
}
1688
/// Run every `gen_iscc_code_v0` conformance vector from `tests/data.json` and
/// confirm that exactly five vectors were exercised.
#[test]
fn test_gen_iscc_code_v0_conformance() {
    let vectors: serde_json::Value =
        serde_json::from_str(include_str!("../tests/data.json")).unwrap();
    let cases = vectors["gen_iscc_code_v0"].as_object().unwrap();

    for (tc_name, tc) in cases {
        // First input is the list of unit codes to combine.
        let codes: Vec<&str> = tc["inputs"].as_array().unwrap()[0]
            .as_array()
            .unwrap()
            .iter()
            .map(|code| code.as_str().unwrap())
            .collect();
        let expected_iscc = tc["outputs"]["iscc"].as_str().unwrap();

        match gen_iscc_code_v0(&codes, false) {
            Ok(result) => assert_eq!(
                result.iscc, expected_iscc,
                "ISCC mismatch in test case {tc_name}"
            ),
            Err(e) => panic!("gen_iscc_code_v0 failed for {tc_name}: {e}"),
        }
    }

    // The loop visits every entry once, so the map size is the number of cases run.
    assert_eq!(cases.len(), 5, "expected 5 conformance tests to run");
}
1717
/// A single unit code is rejected with `InvalidInput`.
#[test]
fn test_gen_iscc_code_v0_too_few_codes() {
    let outcome = gen_iscc_code_v0(&["AAAWKLHFPV6OPKDG"], false);
    assert!(matches!(outcome, Err(IsccError::InvalidInput(_))));
}
1725
/// Two Meta codes (no Data or Instance code) are rejected with `InvalidInput`.
#[test]
fn test_gen_iscc_code_v0_missing_instance() {
    let meta_code = "AAAWKLHFPV6OPKDG";
    let outcome = gen_iscc_code_v0(&[meta_code, meta_code], false);
    assert!(matches!(outcome, Err(IsccError::InvalidInput(_))));
}
1734
/// A code shorter than 16 characters is rejected with `InvalidInput`.
#[test]
fn test_gen_iscc_code_v0_short_code() {
    let too_short = "AAAWKLHFPV6";
    let full_length = "AAAWKLHFPV6OPKDG";
    let outcome = gen_iscc_code_v0(&[too_short, full_length], false);
    assert!(matches!(outcome, Err(IsccError::InvalidInput(_))));
}
1743
/// A Data-URL whose base64 payload is empty must still take the meta bytes path.
///
/// Python reference: `if meta:` is truthy for `"data:application/json;base64,"`
/// (a non-empty string), so the meta branch runs with `payload = b""`. The result
/// must therefore carry `meta = Some(<original Data-URL>)` and a `metahash` equal to
/// `multi_hash_blake3(&[])` (BLAKE3 of empty bytes).
#[cfg(feature = "meta-code")]
#[test]
fn test_gen_meta_code_empty_data_url_enters_meta_branch() {
    let data_url = "data:application/json;base64,";
    let code = gen_meta_code_v0("Test", None, Some(data_url), 64).unwrap();

    assert_eq!(code.name, "Test");

    // The original Data-URL string is echoed back rather than dropped.
    assert_eq!(
        code.meta,
        Some(data_url.to_string()),
        "empty Data-URL payload should still enter meta branch"
    );

    // The hash covers the (empty) decoded payload.
    assert_eq!(
        code.metahash,
        utils::multi_hash_blake3(&[]),
        "metahash should be BLAKE3 of empty bytes"
    );
}
1773
/// `soft_hash_meta_v0_with_bytes` with an empty byte slice must yield the same digest
/// as `soft_hash_meta_v0` called without extra text.
///
/// Python reference (`code_meta.py:142`): `if extra in {None, "", b""}:` returns the
/// name-only simhash, without interleaving, for every empty-like value.
#[cfg(feature = "meta-code")]
#[test]
fn test_soft_hash_meta_v0_with_bytes_empty_equals_name_only() {
    let name = "test";
    assert_eq!(
        soft_hash_meta_v0(name, None),
        soft_hash_meta_v0_with_bytes(name, &[]),
        "empty bytes should produce same digest as name-only (no interleaving)"
    );
}
1789
1790    // ---- Algorithm constants tests ----
1791
/// `META_TRIM_NAME` is pinned to 128.
#[cfg(feature = "meta-code")]
#[test]
fn test_meta_trim_name_value() {
    let expected: usize = 128;
    assert_eq!(META_TRIM_NAME, expected);
}
1797
/// `META_TRIM_DESCRIPTION` is pinned to 4096.
#[cfg(feature = "meta-code")]
#[test]
fn test_meta_trim_description_value() {
    let expected: usize = 4 * 1024;
    assert_eq!(META_TRIM_DESCRIPTION, expected);
}
1803
/// `IO_READ_SIZE` is pinned to 4,194,304 bytes.
#[test]
fn test_io_read_size_value() {
    let expected: usize = 4 * 1024 * 1024;
    assert_eq!(IO_READ_SIZE, expected);
}
1808
/// `TEXT_NGRAM_SIZE` is pinned to 13.
#[test]
fn test_text_ngram_size_value() {
    let expected: usize = 13;
    assert_eq!(TEXT_NGRAM_SIZE, expected);
}
1813
1814    // ---- encode_component Tier 1 wrapper tests ----
1815
/// The integer-based wrapper and the typed codec function must agree on the
/// encoding of the same Data-Code digest.
#[test]
fn test_encode_component_matches_codec() {
    let digest = [0xABu8; 8];

    let via_wrapper = encode_component(3, 0, 0, 64, &digest).unwrap();

    let via_codec = {
        use codec::{MainType, SubType, Version};
        codec::encode_component(MainType::Data, SubType::None, Version::V0, 64, &digest)
            .unwrap()
    };

    assert_eq!(via_wrapper, via_codec);
}
1831
/// Round-trip: encode a Meta-Code unit from an over-long digest and decode it again.
///
/// A 32-byte digest is passed with a `bit_length` of 64. The encoded unit must start
/// with "AA", and decoding it must give back the Meta/None/V0 header with length
/// index 1 and the first 8 bytes of the input digest.
#[test]
fn test_encode_component_round_trip() {
    let digest = [0x42u8; 32];
    let result = encode_component(0, 0, 0, 64, &digest).unwrap();

    // Meta-Code with 64-bit digest should start with "AA"
    assert!(result.starts_with("AA"), "unexpected prefix: {result}");

    // Decode again so the test actually exercises the round trip it is named for.
    let (mt, st, vs, li, decoded_digest) = iscc_decode(&result).unwrap();
    assert_eq!((mt, st, vs, li), (0, 0, 0, 1), "decoded header fields");
    assert_eq!(
        decoded_digest,
        digest[..8].to_vec(),
        "decoded digest should be the first 8 bytes of the input"
    );
}
1840
/// MainType value 5 (`MainType::Iscc`) is not accepted by `encode_component`.
#[test]
fn test_encode_component_rejects_iscc() {
    let digest = [0u8; 8];
    assert!(encode_component(5, 0, 0, 64, &digest).is_err());
}
1847
/// A digest shorter than `bit_length / 8` bytes is rejected, and the error text
/// names both lengths.
#[test]
fn test_encode_component_rejects_short_digest() {
    let short_digest = [0u8; 4];
    // `unwrap_err` panics if the call unexpectedly succeeds.
    let err = encode_component(0, 0, 0, 64, &short_digest)
        .unwrap_err()
        .to_string();
    let mentions_lengths = err.contains("digest length 4 < bit_length/8 (8)");
    assert!(mentions_lengths, "unexpected error: {err}");
}
1859
/// An out-of-range MainType value (99) is rejected.
#[test]
fn test_encode_component_rejects_invalid_mtype() {
    let digest = [0u8; 8];
    assert!(encode_component(99, 0, 0, 64, &digest).is_err());
}
1866
/// An out-of-range SubType value (99) is rejected.
#[test]
fn test_encode_component_rejects_invalid_stype() {
    let digest = [0u8; 8];
    assert!(encode_component(0, 99, 0, 64, &digest).is_err());
}
1873
/// An out-of-range Version value (99) is rejected.
#[test]
fn test_encode_component_rejects_invalid_version() {
    let digest = [0u8; 8];
    assert!(encode_component(0, 0, 99, 64, &digest).is_err());
}
1880
1881    // ---- iscc_decode tests ----
1882
/// Encode a 64-bit Meta-Code digest, decode it, and compare every returned field.
#[test]
fn test_iscc_decode_round_trip_meta() {
    let digest = [0xaa_u8; 8];
    let decoded = iscc_decode(&encode_component(0, 0, 0, 64, &digest).unwrap()).unwrap();

    assert_eq!(decoded.0, 0, "MainType::Meta");
    assert_eq!(decoded.1, 0, "SubType::None");
    assert_eq!(decoded.2, 0, "Version::V0");
    // encode_length(Meta, 64) → 64/32 - 1 = 1
    assert_eq!(decoded.3, 1, "length_index");
    assert_eq!(decoded.4, digest.to_vec());
}
1896
/// Round-trip for a Content-Code (MainType=2, SubType::TEXT=0); the length index
/// is not checked here.
#[test]
fn test_iscc_decode_round_trip_content() {
    let digest = [0xbb_u8; 8];
    let unit = encode_component(2, 0, 0, 64, &digest).unwrap();
    let decoded = iscc_decode(&unit).unwrap();

    assert_eq!(decoded.0, 2, "MainType::Content");
    assert_eq!(decoded.1, 0, "SubType::TEXT");
    assert_eq!(decoded.2, 0, "Version::V0");
    assert_eq!(decoded.4, digest.to_vec());
}
1908
/// Round-trip for a Data-Code (MainType=3): checks main type and digest only.
#[test]
fn test_iscc_decode_round_trip_data() {
    let digest = [0xcc_u8; 8];
    let unit = encode_component(3, 0, 0, 64, &digest).unwrap();
    let decoded = iscc_decode(&unit).unwrap();

    assert_eq!(decoded.0, 3, "MainType::Data");
    assert_eq!(decoded.4, digest.to_vec());
}
1918
/// Round-trip for an Instance-Code (MainType=4): checks main type and digest only.
#[test]
fn test_iscc_decode_round_trip_instance() {
    let digest = [0xdd_u8; 8];
    let unit = encode_component(4, 0, 0, 64, &digest).unwrap();
    let decoded = iscc_decode(&unit).unwrap();

    assert_eq!(decoded.0, 4, "MainType::Instance");
    assert_eq!(decoded.4, digest.to_vec());
}
1928
/// Decoding a unit carrying the "ISCC:" prefix yields the same fields as the bare unit.
#[test]
fn test_iscc_decode_with_prefix() {
    let digest = [0xaa_u8; 8];

    let mut prefixed = String::from("ISCC:");
    prefixed.push_str(&encode_component(0, 0, 0, 64, &digest).unwrap());

    let decoded = iscc_decode(&prefixed).unwrap();
    assert_eq!(decoded.0, 0);
    assert_eq!(decoded.1, 0);
    assert_eq!(decoded.2, 0);
    assert_eq!(decoded.3, 1);
    assert_eq!(decoded.4, digest.to_vec());
}
1942
/// Dashes inserted into the encoded string do not change the decoded fields.
#[test]
fn test_iscc_decode_with_dashes() {
    let digest = [0xaa_u8; 8];
    let encoded = encode_component(0, 0, 0, 64, &digest).unwrap();

    // Split after the 4th and 8th characters and glue the pieces back with dashes.
    let dashed = [&encoded[..4], &encoded[4..8], &encoded[8..]].join("-");

    let decoded = iscc_decode(&dashed).unwrap();
    assert_eq!(decoded.0, 0);
    assert_eq!(decoded.1, 0);
    assert_eq!(decoded.2, 0);
    assert_eq!(decoded.3, 1);
    assert_eq!(decoded.4, digest.to_vec());
}
1957
/// Input containing non-base32 characters fails with an error that mentions "base32".
#[test]
fn test_iscc_decode_invalid_base32() {
    // `unwrap_err` panics if decoding unexpectedly succeeds.
    let err = iscc_decode("!!!INVALID!!!").unwrap_err().to_string();
    let mentions_base32 = err.contains("base32");
    assert!(mentions_base32, "expected base32 error: {err}");
}
1966
/// Known conformance value: Meta-Code "ISCC:AAAZXZ6OU74YAZIM" decodes to
/// MainType=Meta(0), SubType=None(0), Version=V0(0) with a 64-bit digest.
#[test]
fn test_iscc_decode_known_meta_code() {
    let decoded = iscc_decode("ISCC:AAAZXZ6OU74YAZIM").unwrap();

    assert_eq!(decoded.0, 0, "MainType::Meta");
    assert_eq!(decoded.1, 0, "SubType::None");
    assert_eq!(decoded.2, 0, "Version::V0");
    assert_eq!(decoded.3, 1, "length_index for 64-bit");
    assert_eq!(decoded.4.len(), 8, "64-bit = 8 bytes");
}
1978
/// Known conformance value: Instance-Code "ISCC:IAA26E2JXH27TING" decodes to
/// MainType=Instance(4), SubType=None(0), Version=V0(0) with a 64-bit digest.
#[test]
fn test_iscc_decode_known_instance_code() {
    let decoded = iscc_decode("ISCC:IAA26E2JXH27TING").unwrap();

    assert_eq!(decoded.0, 4, "MainType::Instance");
    assert_eq!(decoded.1, 0, "SubType::None");
    assert_eq!(decoded.2, 0, "Version::V0");
    assert_eq!(decoded.3, 1, "length_index for 64-bit");
    assert_eq!(decoded.4.len(), 8, "64-bit = 8 bytes");
}
1990
/// Known value: Data-Code "ISCC:GAAXL2XYM5BQIAZ3".
/// MainType=Data(3), SubType=None(0), Version=V0(0), 64-bit digest.
///
/// The length index is checked as well, matching the other known-code tests.
#[test]
fn test_iscc_decode_known_data_code() {
    let (mt, st, vs, li, digest) = iscc_decode("ISCC:GAAXL2XYM5BQIAZ3").unwrap();
    assert_eq!(mt, 3, "MainType::Data");
    assert_eq!(st, 0, "SubType::None");
    assert_eq!(vs, 0, "Version::V0");
    assert_eq!(li, 1, "length_index for 64-bit");
    assert_eq!(digest.len(), 8, "64-bit = 8 bytes");
}
2001
/// Verification criterion: the whole decoded tuple is compared in one assertion.
/// encode_component(0, 0, 0, 64, &[0xaa;8]) → iscc_decode → (0, 0, 0, 1, vec![0xaa;8])
#[test]
fn test_iscc_decode_verification_round_trip() {
    let unit = encode_component(0, 0, 0, 64, &[0xaa_u8; 8]).unwrap();
    let expected = (0, 0, 0, 1, vec![0xaa; 8]);
    assert_eq!(iscc_decode(&unit).unwrap(), expected);
}
2011
/// Decoding fails when the string is cut off before the digest its header announces.
#[test]
fn test_iscc_decode_truncated_input() {
    // Start from a valid 256-bit Meta-Code ...
    let full = encode_component(0, 0, 0, 256, &[0xff_u8; 32]).unwrap();
    // ... and keep only its first six characters.
    let outcome = iscc_decode(&full[..6]);
    assert!(outcome.is_err(), "should fail on truncated input");
}
2023
2024    // --- json_to_data_url tests ---
2025
/// A plain JSON object is wrapped in a Data-URL with the `application/json` media type.
#[cfg(feature = "meta-code")]
#[test]
fn test_json_to_data_url_basic() {
    let url = json_to_data_url(r#"{"key": "value"}"#).unwrap();
    let has_json_prefix = url.starts_with("data:application/json;base64,");
    assert!(
        has_json_prefix,
        "expected application/json prefix, got: {url}"
    );
}
2036
/// A JSON object with an `@context` key gets the `application/ld+json` media type.
#[cfg(feature = "meta-code")]
#[test]
fn test_json_to_data_url_ld_json() {
    let url = json_to_data_url(r#"{"@context": "https://schema.org"}"#).unwrap();
    let has_ld_prefix = url.starts_with("data:application/ld+json;base64,");
    assert!(
        has_ld_prefix,
        "expected application/ld+json prefix, got: {url}"
    );
}
2047
/// The base64 payload holds the JSON with its keys sorted alphabetically (JCS).
#[cfg(feature = "meta-code")]
#[test]
fn test_json_to_data_url_jcs_ordering() {
    let url = json_to_data_url(r#"{"b":1,"a":2}"#).unwrap();

    // Everything after the first comma is the base64 payload.
    let (_, payload) = url.split_once(',').unwrap();
    let raw = data_encoding::BASE64.decode(payload.as_bytes()).unwrap();
    let canonical = String::from_utf8(raw).unwrap();

    assert_eq!(canonical, r#"{"a":2,"b":1}"#, "JCS should sort keys");
}
2059
/// Round-trip: the output of `json_to_data_url`, passed through `decode_data_url`,
/// parses back to JSON that is equal to the parsed input.
#[cfg(feature = "meta-code")]
#[test]
fn test_json_to_data_url_round_trip() {
    let input = r#"{"hello": "world", "num": 42}"#;

    let payload = decode_data_url(&json_to_data_url(input).unwrap()).unwrap();

    // Compare as parsed values, so only the JSON semantics have to match.
    let recovered: serde_json::Value =
        serde_json::from_slice(&payload).expect("decoded bytes should be valid JSON");
    let original: serde_json::Value = serde_json::from_str(input).unwrap();
    assert_eq!(recovered, original, "round-trip preserves JSON semantics");
}
2074
/// Input that is not JSON is rejected with an error mentioning "invalid JSON".
#[cfg(feature = "meta-code")]
#[test]
fn test_json_to_data_url_invalid_json() {
    let err = json_to_data_url("not json")
        .expect_err("should reject invalid JSON")
        .to_string();
    let mentions_invalid_json = err.contains("invalid JSON");
    assert!(
        mentions_invalid_json,
        "expected 'invalid JSON' in error: {err}"
    );
}
2087
/// Compatibility check against conformance vector test_0016_meta_data_url.
///
/// The vector's meta field is
///   data:application/json;charset=utf-8;base64,eyJzb21lIjogIm9iamVjdCJ9
/// which encodes `{"some": "object"}` (with a space after the colon).
///
/// Our output differs from it in two ways:
/// 1. There is no `charset=utf-8` parameter (matching Python's DataURL.from_byte_data).
/// 2. JCS canonicalization strips whitespace, giving `{"some":"object"}`.
///
/// The test therefore checks (a) the media type prefix and (b) that the decoded
/// payload is the JCS-canonical form of the same JSON input.
#[cfg(feature = "meta-code")]
#[test]
fn test_json_to_data_url_conformance_0016() {
    let url = json_to_data_url(r#"{"some": "object"}"#).unwrap();

    // (a) No charset and no @context, so the media type is plain application/json.
    let has_json_prefix = url.starts_with("data:application/json;base64,");
    assert!(has_json_prefix, "expected application/json prefix");

    // (b) The payload after the comma decodes to whitespace-free JSON.
    let (_, payload) = url.split_once(',').unwrap();
    let raw = data_encoding::BASE64.decode(payload.as_bytes()).unwrap();
    let canonical = String::from_utf8(raw).unwrap();
    assert_eq!(
        canonical, r#"{"some":"object"}"#,
        "JCS removes whitespace from JSON"
    );
}
2118
/// `META_TRIM_META` is pinned to 128,000.
#[cfg(feature = "meta-code")]
#[test]
fn test_meta_trim_meta_value() {
    let expected: usize = 128 * 1000;
    assert_eq!(META_TRIM_META, expected);
}
2124
/// A JSON meta payload of exactly 128,000 bytes is accepted.
#[cfg(feature = "meta-code")]
#[test]
fn test_gen_meta_code_v0_meta_at_limit() {
    // The wrapper `{"x":""}` is 8 bytes, so 127,992 padding bytes give 128,000 in total.
    let padding = "a".repeat(128_000 - 8);
    let json_str = ["{\"x\":\"", padding.as_str(), "\"}"].concat();

    let outcome = gen_meta_code_v0("test", None, Some(&json_str), 64);
    assert!(
        outcome.is_ok(),
        "payload at exactly META_TRIM_META should succeed"
    );
}
2139
/// A JSON meta payload of 128,001 bytes (one over the limit) is rejected with an
/// `InvalidInput` error mentioning "size limit".
#[cfg(feature = "meta-code")]
#[test]
fn test_gen_meta_code_v0_meta_over_limit() {
    // 8 bytes of `{"x":""}` wrapper plus 127,993 padding bytes = 128,001 bytes.
    let padding = "a".repeat(128_000 - 8 + 1);
    let json_str = ["{\"x\":\"", padding.as_str(), "\"}"].concat();

    let outcome = gen_meta_code_v0("test", None, Some(&json_str), 64);
    let rejected_for_size =
        matches!(outcome, Err(IsccError::InvalidInput(ref msg)) if msg.contains("size limit"));
    assert!(
        rejected_for_size,
        "payload exceeding META_TRIM_META should return InvalidInput"
    );
}
2152
/// A Data-URL longer than the pre-decode limit is rejected with an `InvalidInput`
/// error mentioning "size limit".
#[cfg(feature = "meta-code")]
#[test]
fn test_gen_meta_code_v0_data_url_pre_decode_reject() {
    // PRE_DECODE_LIMIT = META_TRIM_META * 4 / 3 + 256 = 170,922
    let pre_decode_limit = META_TRIM_META * 4 / 3 + 256;

    // The base64 payload alone is already one character past that limit.
    let mut data_url = String::from("data:application/octet-stream;base64,");
    data_url.push_str(&"A".repeat(pre_decode_limit + 1));

    let outcome = gen_meta_code_v0("test", None, Some(&data_url), 64);
    let rejected_for_size =
        matches!(outcome, Err(IsccError::InvalidInput(ref msg)) if msg.contains("size limit"));
    assert!(
        rejected_for_size,
        "oversized Data-URL should be rejected before decoding"
    );
}
2167
2168    // ---- gen_sum_code_v0 tests ----
2169
/// Helper: write `data` to a temp file and return its path.
///
/// The file is created in `std::env::temp_dir()` as `iscc_test_<pid>_<name>`. The
/// process id is part of the name so that separate test processes sharing one temp
/// directory do not overwrite or delete each other's files; within a single process
/// the caller must pass a distinct `name` per test.
///
/// # Panics
///
/// Panics if the file cannot be written.
fn write_temp_file(name: &str, data: &[u8]) -> std::path::PathBuf {
    let file_name = format!("iscc_test_{}_{name}", std::process::id());
    let path = std::env::temp_dir().join(file_name);
    std::fs::write(&path, data).expect("failed to write temp file");
    path
}
2176
/// `gen_sum_code_v0` on a file must agree with composing the Data-Code and
/// Instance-Code generated separately from the same bytes.
#[test]
fn test_gen_sum_code_v0_equivalence() {
    let data = b"Hello, ISCC World! This is a test of gen_sum_code_v0.";
    let path = write_temp_file("sum_equiv", data);

    let summed = gen_sum_code_v0(&path, 64, false, false).unwrap();

    // Reference values built from the individual generators.
    let data_code = gen_data_code_v0(data, 64).unwrap();
    let instance_code = gen_instance_code_v0(data, 64).unwrap();
    let unit_codes = [data_code.iscc.as_str(), instance_code.iscc.as_str()];
    let composite = gen_iscc_code_v0(&unit_codes, false).unwrap();

    assert_eq!(summed.iscc, composite.iscc);
    assert_eq!(summed.datahash, instance_code.datahash);
    assert_eq!(summed.filesize, instance_code.filesize);
    assert_eq!(summed.filesize, data.len() as u64);
    assert_eq!(summed.units, None);

    std::fs::remove_file(&path).ok();
}
2198
/// An empty file gives the same ISCC and data hash as the separate generators run
/// on empty input, and a file size of zero.
#[test]
fn test_gen_sum_code_v0_empty_file() {
    let path = write_temp_file("sum_empty", b"");

    let summed = gen_sum_code_v0(&path, 64, false, false).unwrap();

    let data_code = gen_data_code_v0(b"", 64).unwrap();
    let instance_code = gen_instance_code_v0(b"", 64).unwrap();
    let unit_codes = [data_code.iscc.as_str(), instance_code.iscc.as_str()];
    let composite = gen_iscc_code_v0(&unit_codes, false).unwrap();

    assert_eq!(summed.iscc, composite.iscc);
    assert_eq!(summed.datahash, instance_code.datahash);
    assert_eq!(summed.filesize, 0);

    std::fs::remove_file(&path).ok();
}
2216
/// A path that does not exist produces an error whose text contains "Cannot open file".
#[test]
fn test_gen_sum_code_v0_file_not_found() {
    let missing = std::env::temp_dir().join("iscc_test_nonexistent_file_xyz");

    // `unwrap_err` panics if the call unexpectedly succeeds.
    let err_msg = gen_sum_code_v0(&missing, 64, false, false)
        .unwrap_err()
        .to_string();

    let mentions_open_failure = err_msg.contains("Cannot open file");
    assert!(
        mentions_open_failure,
        "error message should mention file open failure: {err_msg}"
    );
}
2228
/// Wide mode leaves 64-bit results unchanged, changes the ISCC at 128 bits, and
/// never affects the data hash or file size.
#[test]
fn test_gen_sum_code_v0_wide_mode() {
    let data = b"Testing wide mode for gen_sum_code_v0 function.";
    let path = write_temp_file("sum_wide", data);

    // Runs with the given bit length and wide flag; units are never requested.
    let run = |bits, wide| gen_sum_code_v0(&path, bits, wide, false).unwrap();

    // Wide mode with 64-bit codes doesn't trigger (need 128+), so they should be equal
    let narrow_64 = run(64, false);
    let wide_64 = run(64, true);
    assert_eq!(narrow_64.iscc, wide_64.iscc);

    // With 128 bits, wide mode should produce a different (longer) ISCC
    let narrow_128 = run(128, false);
    let wide_128 = run(128, true);
    assert_ne!(narrow_128.iscc, wide_128.iscc);

    // Both should have the same datahash and filesize
    assert_eq!(narrow_128.datahash, wide_128.datahash);
    assert_eq!(narrow_128.filesize, wide_128.filesize);

    std::fs::remove_file(&path).ok();
}
2251
/// At 64 bits the summed ISCC equals the composite of the separately generated
/// Data-Code and Instance-Code.
#[test]
fn test_gen_sum_code_v0_bits_64() {
    let data = b"Testing 64-bit gen_sum_code_v0.";
    let path = write_temp_file("sum_bits64", data);

    let summed = gen_sum_code_v0(&path, 64, false, false).unwrap();

    let data_code = gen_data_code_v0(data, 64).unwrap();
    let instance_code = gen_instance_code_v0(data, 64).unwrap();
    let unit_codes = [data_code.iscc.as_str(), instance_code.iscc.as_str()];
    let composite = gen_iscc_code_v0(&unit_codes, false).unwrap();

    assert_eq!(summed.iscc, composite.iscc);

    std::fs::remove_file(&path).ok();
}
2268
/// At 128 bits the summed ISCC, data hash and file size match the values obtained
/// from the separate generators.
#[test]
fn test_gen_sum_code_v0_bits_128() {
    let data = b"Testing 128-bit gen_sum_code_v0.";
    let path = write_temp_file("sum_bits128", data);

    let summed = gen_sum_code_v0(&path, 128, false, false).unwrap();

    let data_code = gen_data_code_v0(data, 128).unwrap();
    let instance_code = gen_instance_code_v0(data, 128).unwrap();
    let unit_codes = [data_code.iscc.as_str(), instance_code.iscc.as_str()];
    let composite = gen_iscc_code_v0(&unit_codes, false).unwrap();

    assert_eq!(summed.iscc, composite.iscc);
    assert_eq!(summed.datahash, instance_code.datahash);
    assert_eq!(summed.filesize, data.len() as u64);

    std::fs::remove_file(&path).ok();
}
2287
/// Same equivalence check on a 50,000-byte input, sized to produce multiple CDC chunks.
#[test]
fn test_gen_sum_code_v0_large_data() {
    // Repeating 0, 1, ..., 255 pattern, 50,000 bytes long.
    let data: Vec<u8> = (0u8..=255).cycle().take(50_000).collect();
    let path = write_temp_file("sum_large", &data);

    let summed = gen_sum_code_v0(&path, 64, false, false).unwrap();

    let data_code = gen_data_code_v0(&data, 64).unwrap();
    let instance_code = gen_instance_code_v0(&data, 64).unwrap();
    let unit_codes = [data_code.iscc.as_str(), instance_code.iscc.as_str()];
    let composite = gen_iscc_code_v0(&unit_codes, false).unwrap();

    assert_eq!(summed.iscc, composite.iscc);
    assert_eq!(summed.datahash, instance_code.datahash);
    assert_eq!(summed.filesize, data.len() as u64);

    std::fs::remove_file(&path).ok();
}
2307
/// With `add_units` set, the result lists the Data-Code and the Instance-Code, in
/// that order, and the composite ISCC is unchanged.
#[test]
fn test_gen_sum_code_v0_units_enabled() {
    let data = b"Hello, ISCC World! This is a test of gen_sum_code_v0 units.";
    let path = write_temp_file("sum_units_on", data);

    let summed = gen_sum_code_v0(&path, 64, false, true).unwrap();

    // Exactly two units are expected.
    let units = summed.units.as_ref().expect("units should be Some");
    assert_eq!(
        units.len(),
        2,
        "units should contain [Data-Code, Instance-Code]"
    );

    // The decoded main type identifies each unit: Data = 3, then Instance = 4.
    let first_maintype = iscc_decode(&units[0]).unwrap().0;
    assert_eq!(
        first_maintype, 3,
        "first unit should be a Data-Code (MainType::Data = 3)"
    );
    let second_maintype = iscc_decode(&units[1]).unwrap().0;
    assert_eq!(
        second_maintype, 4,
        "second unit should be an Instance-Code (MainType::Instance = 4)"
    );

    // Each unit equals the code produced by its standalone generator.
    let data_code = gen_data_code_v0(data, 64).unwrap();
    let instance_code = gen_instance_code_v0(data, 64).unwrap();
    assert_eq!(units[0], data_code.iscc);
    assert_eq!(units[1], instance_code.iscc);

    // Requesting units must not alter the composite ISCC.
    let unit_codes = [data_code.iscc.as_str(), instance_code.iscc.as_str()];
    let composite = gen_iscc_code_v0(&unit_codes, false).unwrap();
    assert_eq!(summed.iscc, composite.iscc);

    std::fs::remove_file(&path).ok();
}
2350
/// Without `add_units`, `units` is `None` and the composite ISCC still matches the
/// separately generated codes.
#[test]
fn test_gen_sum_code_v0_units_disabled() {
    let data = b"Hello, ISCC World! This is a test of gen_sum_code_v0 no units.";
    let path = write_temp_file("sum_units_off", data);

    let summed = gen_sum_code_v0(&path, 64, false, false).unwrap();

    assert_eq!(
        summed.units, None,
        "units should be None when add_units is false"
    );

    // Cross-check the composite against the standalone generators.
    let data_code = gen_data_code_v0(data, 64).unwrap();
    let instance_code = gen_instance_code_v0(data, 64).unwrap();
    let unit_codes = [data_code.iscc.as_str(), instance_code.iscc.as_str()];
    let composite = gen_iscc_code_v0(&unit_codes, false).unwrap();
    assert_eq!(summed.iscc, composite.iscc);

    std::fs::remove_file(&path).ok();
}
2372}