host-chain-core 0.3.8

//! CARv1 / UnixFS parser for .prod bundles.
//!
//! Parses a CARv1 archive (a sequential list of IPFS blocks) and walks the
//! dag-pb / UnixFS directory tree to produce a flat `HashMap<String, Vec<u8>>`
//! of filename → content, suitable for loading into a `ProdBundle`.
//!
//! This module is identical to `host-chain/src/car.rs` except that
//! `zstd::decode_all` is replaced with `ruzstd::StreamingDecoder` so that
//! the crate compiles to `wasm32-unknown-unknown` (zstd uses C FFI).

use sha2::{Digest, Sha256};
use std::collections::HashMap;

/// Cheap sniff test for CARv1 input.
///
/// Reports `true` when the input is at least 20 bytes long and the ASCII text
/// `roots` occurs entirely within bytes 1..20, which is where the CBOR header
/// key of a CARv1 archive is expected to sit. This is a heuristic only; it
/// does not validate the header.
pub fn is_car_file(data: &[u8]) -> bool {
    match data.get(1..20) {
        Some(probe) => probe.windows(5).any(|chunk| chunk == b"roots"),
        // Fewer than 20 bytes: too short to carry a CAR header.
        None => false,
    }
}

/// Parse the CARv1 DAG-CBOR header payload and extract root CID bytes.
///
/// The header is a CBOR map with two keys: "version" (uint) and "roots" (array
/// of tag-42 byte strings). Each CID is wrapped in CBOR tag 42 (`0xd8 0x2a`)
/// followed by a byte string whose first byte is `0x00` (identity multibase
/// prefix). That prefix is stripped before returning, leaving raw binary CIDs
/// (version varint + codec varint + multihash) matching the keys in the block
/// `HashMap`.
///
/// Values of keys other than "roots" are skipped on a best-effort basis: only
/// ints, strings, one level of tags and flat arrays are understood.
///
/// # Errors
///
/// Returns a descriptive message when the header is empty, is not a CBOR map,
/// exceeds the entry/root caps, is truncated, uses an unsupported length
/// encoding, or has no "roots" key.
///
/// All offset arithmetic on lengths read from the input is overflow-safe, so a
/// crafted 4-byte length cannot wrap the cursor on 32-bit targets such as
/// `wasm32-unknown-unknown`.
fn parse_car_header_roots(header: &[u8]) -> Result<Vec<Vec<u8>>, String> {
    let mut pos = 0;

    // Helper: read a CBOR length value based on the additional-info nibble.
    // Returns (length, bytes_consumed_for_length_encoding). Only the direct,
    // 1-, 2- and 4-byte encodings are supported.
    let read_cbor_len =
        |additional_info: u8, data: &[u8], offset: usize| -> Result<(usize, usize), String> {
            let width = match additional_info {
                0..=23 => return Ok((usize::from(additional_info), 0)),
                24 => 1,
                25 => 2,
                26 => 4,
                _ => {
                    return Err(format!(
                        "unsupported CBOR additional info: {additional_info}"
                    ))
                }
            };
            let bytes = offset
                .checked_add(width)
                .and_then(|end| data.get(offset..end))
                .ok_or_else(|| format!("CBOR truncated: expected {width}-byte length"))?;
            // Big-endian accumulate; at most 4 bytes, so this fits any usize >= 32 bits.
            let value = bytes
                .iter()
                .fold(0usize, |acc, &b| (acc << 8) | usize::from(b));
            Ok((value, width))
        };

    if pos >= header.len() {
        return Err("CAR header is empty".into());
    }

    // Outer CBOR item must be a map (major type 5).
    let initial = header[pos];
    pos += 1;
    let major = initial >> 5;
    let additional = initial & 0x1f;
    if major != 5 {
        return Err(format!("CAR header is not a CBOR map (major type {major})"));
    }
    let (map_len, extra) = read_cbor_len(additional, header, pos)?;
    pos += extra;

    // CARv1 headers have exactly 2 keys ("version" + "roots"). Cap to reject
    // crafted inputs with huge map counts before entering the loop.
    const MAX_MAP_ENTRIES: usize = 8;
    if map_len > MAX_MAP_ENTRIES {
        return Err(format!(
            "CAR header CBOR map has {map_len} entries, exceeding maximum {MAX_MAP_ENTRIES}"
        ));
    }

    let mut roots: Option<Vec<Vec<u8>>> = None;

    for _ in 0..map_len {
        // Read map key — must be a text string (major type 3).
        if pos >= header.len() {
            return Err("CBOR map key missing".into());
        }
        let key_initial = header[pos];
        pos += 1;
        let key_major = key_initial >> 5;
        let key_additional = key_initial & 0x1f;
        if key_major != 3 {
            return Err(format!(
                "CBOR map key is not a text string (major type {key_major})"
            ));
        }
        let (key_len, extra) = read_cbor_len(key_additional, header, pos)?;
        pos += extra;
        // `pos <= header.len()` holds here, so the subtraction cannot underflow
        // and, unlike `pos + key_len`, the comparison cannot overflow.
        if key_len > header.len() - pos {
            return Err("CBOR map key truncated".into());
        }
        let key = &header[pos..pos + key_len];
        pos += key_len;

        if key == b"roots" {
            // Value must be a CBOR array (major type 4).
            if pos >= header.len() {
                return Err("CBOR roots value missing".into());
            }
            let arr_initial = header[pos];
            pos += 1;
            let arr_major = arr_initial >> 5;
            let arr_additional = arr_initial & 0x1f;
            if arr_major != 4 {
                return Err(format!(
                    "CAR header 'roots' is not a CBOR array (major type {arr_major})"
                ));
            }
            let (arr_len, extra) = read_cbor_len(arr_additional, header, pos)?;
            pos += extra;

            // Cap root count to prevent oversized allocation from crafted input.
            const MAX_ROOTS: usize = 16;
            if arr_len > MAX_ROOTS {
                return Err(format!(
                    "CAR header declares {arr_len} roots, exceeding maximum {MAX_ROOTS}"
                ));
            }
            let mut cids = Vec::with_capacity(arr_len);
            for _ in 0..arr_len {
                // Each element is CBOR tag 42: 0xd8 0x2a.
                if pos + 2 > header.len() {
                    return Err("CBOR tag-42 truncated in roots array".into());
                }
                if header[pos] != 0xd8 || header[pos + 1] != 0x2a {
                    return Err(format!(
                        "expected CBOR tag-42 (0xd8 0x2a), got 0x{:02x} 0x{:02x}",
                        header[pos],
                        header[pos + 1]
                    ));
                }
                pos += 2;

                // The tagged value is a byte string (major type 2).
                if pos >= header.len() {
                    return Err("CBOR byte string missing after tag-42".into());
                }
                let bs_initial = header[pos];
                pos += 1;
                let bs_major = bs_initial >> 5;
                let bs_additional = bs_initial & 0x1f;
                if bs_major != 2 {
                    return Err(format!(
                        "tag-42 content is not a byte string (major type {bs_major})"
                    ));
                }
                let (bs_len, extra) = read_cbor_len(bs_additional, header, pos)?;
                pos += extra;
                // Overflow-safe form of `pos + bs_len > header.len()`.
                if bs_len > header.len() - pos {
                    return Err("CBOR byte string truncated in roots array".into());
                }
                if bs_len < 2 {
                    return Err("tag-42 byte string too short (must have 0x00 prefix + at least 1 CID byte)".into());
                }
                if header[pos] != 0x00 {
                    return Err(format!(
                        "tag-42 byte string has non-identity multibase prefix 0x{:02x}",
                        header[pos]
                    ));
                }
                // Strip the leading 0x00 identity multibase prefix.
                let cid = header[pos + 1..pos + bs_len].to_vec();
                pos += bs_len;
                cids.push(cid);
            }
            // If "roots" appears more than once, the last occurrence wins.
            roots = Some(cids);
        } else {
            // Skip the value for any other key. Skips use saturating adds: an
            // over-long declared length parks the cursor at or past the end of
            // the header, where the next bounds check stops parsing, instead
            // of wrapping around.
            if pos >= header.len() {
                return Err("CBOR map value missing".into());
            }
            let val_initial = header[pos];
            pos += 1;
            let val_major = val_initial >> 5;
            let val_additional = val_initial & 0x1f;
            match val_major {
                // Unsigned int (major 0) or negative int (major 1).
                0 | 1 => {
                    if val_additional >= 24 {
                        let (_, extra) = read_cbor_len(val_additional, header, pos)?;
                        pos += extra;
                    }
                }
                // Byte string (major 2) or text string (major 3) — skip payload.
                2 | 3 => {
                    let (len, extra) = read_cbor_len(val_additional, header, pos)?;
                    pos = pos.saturating_add(extra).saturating_add(len);
                }
                // Tag (major 6) — skip the tag value then the tagged item.
                // The tag value is encoded like an unsigned int; the tagged item follows.
                6 => {
                    if val_additional >= 24 {
                        let (_, extra) = read_cbor_len(val_additional, header, pos)?;
                        pos += extra;
                    }
                    // Now skip the tagged item (one level only — the tagged item
                    // in CARv1 headers is always a byte string).
                    if pos < header.len() {
                        let inner = header[pos];
                        pos += 1;
                        let inner_major = inner >> 5;
                        let inner_additional = inner & 0x1f;
                        if inner_major == 2 || inner_major == 3 {
                            let (len, extra) = read_cbor_len(inner_additional, header, pos)?;
                            pos = pos.saturating_add(extra).saturating_add(len);
                        } else if (inner_major == 0 || inner_major == 1) && inner_additional >= 24 {
                            let (_, extra) = read_cbor_len(inner_additional, header, pos)?;
                            pos += extra;
                        }
                    }
                }
                // Array (major 4) — skip elements best-effort.
                4 => {
                    let (len, extra) = read_cbor_len(val_additional, header, pos)?;
                    pos += extra;
                    for _ in 0..len {
                        if pos >= header.len() {
                            break;
                        }
                        let el = header[pos];
                        pos += 1;
                        let el_additional = el & 0x1f;
                        let el_major = el >> 5;
                        match el_major {
                            2 | 3 => {
                                let (elen, eextra) = read_cbor_len(el_additional, header, pos)?;
                                pos = pos.saturating_add(eextra).saturating_add(elen);
                            }
                            0 | 1 => {
                                if el_additional >= 24 {
                                    let (_, extra) = read_cbor_len(el_additional, header, pos)?;
                                    pos += extra;
                                }
                            }
                            6 => {
                                // Tag — skip tag value then inner byte string.
                                if el_additional >= 24 {
                                    let (_, extra) = read_cbor_len(el_additional, header, pos)?;
                                    pos += extra;
                                }
                                if pos < header.len() {
                                    let inner = header[pos];
                                    pos += 1;
                                    let inner_additional = inner & 0x1f;
                                    let inner_major = inner >> 5;
                                    if inner_major == 2 || inner_major == 3 {
                                        let (ilen, iextra) =
                                            read_cbor_len(inner_additional, header, pos)?;
                                        pos = pos.saturating_add(iextra).saturating_add(ilen);
                                    }
                                }
                            }
                            _ => {} // unknown element — already advanced past initial byte
                        }
                    }
                }
                // Simple/float (major 7), map (major 5) — not expected in CARv1 headers.
                _ => {
                    return Err(format!(
                        "unexpected CBOR major type {val_major} for map value"
                    ));
                }
            }
        }
    }

    roots.ok_or_else(|| "CAR header CBOR map has no 'roots' key".into())
}

/// Parse a CARv1 file into a flat asset map.
///
/// Walks the dag-pb directory tree rooted at the first CID declared in the
/// header's roots array and maps filenames to their reassembled content bytes.
pub fn parse_car_to_assets(data: &[u8]) -> Result<HashMap<String, Vec<u8>>, String> {
    let mut pos = 0;

    // 1. Read header: varint(header_len) + CBOR header
    let (header_len, n) = read_uvarint(&data[pos..])?;
    pos += n;
    if pos + header_len > data.len() {
        return Err("CAR header length exceeds data".into());
    }

    // Parse header roots before advancing past the header.
    let header_roots = parse_car_header_roots(&data[pos..pos + header_len])?;
    if header_roots.is_empty() {
        return Err("CAR header declares no roots".into());
    }
    pos += header_len;

    // 2. Read all blocks: varint(block_len) + CID + block_data
    let mut blocks: HashMap<Vec<u8>, Vec<u8>> = HashMap::new();

    while pos < data.len() {
        let (block_len, n) = read_uvarint(&data[pos..])?;
        pos += n;

        if pos + block_len > data.len() {
            break;
        }

        let block_start = pos;
        // Parse CID: version(varint) + codec(varint) + multihash
        let (_version, n) = read_uvarint(&data[pos..])?;
        pos += n;
        let (_codec, n) = read_uvarint(&data[pos..])?;
        pos += n;
        // Multihash: hash_function(varint) + digest_size(varint) + digest
        let (hash_fn, n) = read_uvarint(&data[pos..])?;
        pos += n;
        let (digest_size, n) = read_uvarint(&data[pos..])?;
        pos += n;
        let digest_start = pos;
        if pos + digest_size > data.len() || pos + digest_size > block_start + block_len {
            return Err("CID digest extends beyond block boundary".into());
        }
        pos += digest_size;

        let cid_bytes = data[block_start..pos].to_vec();
        let block_data = data[pos..block_start + block_len].to_vec();

        // Verify block integrity: recompute hash and compare against CID digest.
        if hash_fn == 0x12 && digest_size == 32 {
            let expected = &data[digest_start..digest_start + 32];
            let actual = Sha256::digest(&block_data);
            if actual.as_slice() != expected {
                return Err(format!(
                    "CID integrity check failed: block hash mismatch (expected {}, got {})",
                    hex(&expected[..4]),
                    hex(&actual[..4]),
                ));
            }
        }

        blocks.insert(cid_bytes, block_data);
        pos = block_start + block_len;
    }

    log::info!("[car] parsed {} blocks", blocks.len());

    // Resolve root from the header declaration rather than block insertion order.
    // Only the first declared root is used; multi-root CARs are not expected for
    // web-app bundles but are accepted with the first root selected.
    let root = header_roots
        .into_iter()
        .next()
        .expect("guaranteed non-empty by check above");
    let root_block = blocks
        .get(&root)
        .ok_or("root CID declared in header not found in block section")?;

    // 3. Check if the root is a file (not a directory).
    //
    // Some IPFS content is published as a large file whose bytes are themselves
    // a CARv1 archive (e.g. `ipfs add` of a .car file). In that case the
    // gateway's `?format=car` response wraps the file's chunks in an outer CAR.
    // We detect this by checking the root's UnixFS type: if it is `File`, we
    // reassemble the chunks, and if the result is itself a valid CAR, we
    // recursively parse the inner archive.
    const MAX_DEPTH: usize = 32;

    if !is_directory_node(root_block) {
        // Root is a file node — try to reassemble it.
        if let Ok(file_data) = reassemble_file(&blocks, &root, MAX_DEPTH) {
            if is_car_file(&file_data) {
                log::info!(
                    "[car] root is a file ({} bytes) containing a nested CAR, parsing inner archive...",
                    file_data.len()
                );
                return parse_car_to_assets(&file_data);
            }
            // Not a CAR — return as a single index.html file.
            log::info!("[car] root is a single file ({} bytes)", file_data.len());
            let mut assets = HashMap::new();
            assets.insert("index.html".into(), file_data);
            return Ok(assets);
        }
    }

    // 4. Parse root dag-pb node to get directory links
    let links = parse_dagpb_links(root_block);
    log::info!("[car] root directory has {} entries", links.len());

    // 5. For each link, reassemble content (recurse into subdirectories)
    let mut assets = HashMap::new();
    for (name, cid_bytes, _size) in &links {
        match reassemble_file(&blocks, cid_bytes, MAX_DEPTH) {
            Ok(content) => {
                log::info!("[car] extracted: {name} ({} bytes)", content.len());
                assets.insert(name.clone(), content);
            }
            Err(_) => {
                if let Some(dir_block) = blocks.get(cid_bytes) {
                    let sub_links = parse_dagpb_links(dir_block);
                    if !sub_links.is_empty() {
                        extract_directory_recursive(
                            &blocks,
                            name,
                            &sub_links,
                            &mut assets,
                            MAX_DEPTH - 1,
                        );
                    }
                }
            }
        }
    }

    if assets.is_empty() {
        return Err("CAR file contained no extractable assets".into());
    }

    Ok(assets)
}

/// Flatten a dag-pb directory subtree into `assets`.
///
/// Every link is first tried as a file; its content is stored under
/// `"{prefix}/{name}"`. When that fails and the linked block itself carries
/// links, the link is treated as a subdirectory and walked with one less unit
/// of `depth`. Links that are neither are silently skipped. A `depth` of zero
/// logs a warning and extracts nothing.
fn extract_directory_recursive(
    blocks: &HashMap<Vec<u8>, Vec<u8>>,
    prefix: &str,
    links: &[(String, Vec<u8>, u64)],
    assets: &mut HashMap<String, Vec<u8>>,
    depth: usize,
) {
    if depth == 0 {
        log::warn!("[car] max recursion depth reached at {prefix}");
        return;
    }
    // Budget handed to both the file attempt and any subdirectory walk.
    let remaining = depth - 1;

    for (name, cid_bytes, _size) in links {
        let full_path = format!("{prefix}/{name}");

        if let Ok(content) = reassemble_file(blocks, cid_bytes, remaining) {
            log::info!("[car] extracted: {full_path} ({} bytes)", content.len());
            assets.insert(full_path, content);
            continue;
        }

        // Not a readable file: descend only if the block exists and has links.
        let sub_links = blocks
            .get(cid_bytes)
            .map(|dir_block| parse_dagpb_links(dir_block))
            .unwrap_or_default();
        if !sub_links.is_empty() {
            extract_directory_recursive(blocks, &full_path, &sub_links, assets, remaining);
        }
    }
}

/// Parse dag-pb protobuf to extract PBLink entries: (name, CID bytes, tsize).
///
/// Scans the top-level fields of `pb` and decodes every length-delimited
/// field 2 as a PBLink. Parsing is lenient: it stops at the first malformed
/// or unsupported field and returns the links collected so far, so arbitrary
/// (non dag-pb) bytes yield an empty or partial list rather than an error.
///
/// Only single-byte field tags and wire types 0 (varint) and 2
/// (length-delimited) are understood. Declared lengths are validated against
/// the remaining input with overflow-safe comparisons, so a crafted length can
/// neither panic nor move the cursor backwards.
pub fn parse_dagpb_links(pb: &[u8]) -> Vec<(String, Vec<u8>, u64)> {
    let mut links = Vec::new();
    let mut pos = 0;

    while pos < pb.len() {
        let tag = pb[pos];
        let field_num = tag >> 3;
        let wire_type = tag & 0x7;
        pos += 1;

        match wire_type {
            2 => {
                let (length, n) = match read_uvarint(&pb[pos..]) {
                    Ok(v) => v,
                    Err(_) => break,
                };
                pos += n;
                // `pos <= pb.len()` here, so this cannot underflow, and unlike
                // `pos + length` it cannot overflow either.
                if length > pb.len() - pos {
                    break;
                }

                if field_num == 2 {
                    if let Some(link) = parse_pblink(&pb[pos..pos + length]) {
                        links.push(link);
                    }
                }

                pos += length;
            }
            0 => {
                let (_, n) = match read_uvarint(&pb[pos..]) {
                    Ok(v) => v,
                    Err(_) => break,
                };
                pos += n;
            }
            _ => break,
        }
    }

    links
}

/// Decode one PBLink message body into (name, CID bytes, tsize).
///
/// Field 1 is the hash, field 2 the name (lossily decoded as UTF-8) and
/// field 3 the tsize. Decoding stops at the first malformed field; whatever
/// was read up to that point is kept. Returns `None` when no non-empty hash
/// was found.
fn parse_pblink(inner: &[u8]) -> Option<(String, Vec<u8>, u64)> {
    let mut hash = Vec::new();
    let mut name = String::new();
    let mut tsize: u64 = 0;
    let mut ipos = 0;

    while ipos < inner.len() {
        let itag = inner[ipos];
        let inum = itag >> 3;
        let iwire = itag & 0x7;
        ipos += 1;
        match iwire {
            2 => {
                let (ilen, n) = match read_uvarint(&inner[ipos..]) {
                    Ok(v) => v,
                    Err(_) => break,
                };
                ipos += n;
                if ilen > inner.len() - ipos {
                    break;
                }
                let payload = &inner[ipos..ipos + ilen];
                match inum {
                    1 => hash = payload.to_vec(),
                    2 => name = String::from_utf8_lossy(payload).to_string(),
                    _ => {}
                }
                ipos += ilen;
            }
            0 => {
                let (val, n) = match read_uvarint(&inner[ipos..]) {
                    Ok(v) => v,
                    Err(_) => break,
                };
                ipos += n;
                if inum == 3 {
                    tsize = val as u64;
                }
            }
            _ => break,
        }
    }

    if hash.is_empty() {
        None
    } else {
        Some((name, hash, tsize))
    }
}

/// UnixFS node type from the protobuf Type field.
///
/// Only the values this parser distinguishes are modelled; every other value,
/// and any node whose type cannot be read, maps to `Unknown`.
#[derive(Debug, PartialEq)]
enum UnixFsType {
    /// Type value 0.
    Raw,
    /// Type value 1.
    Directory,
    /// Type value 2.
    File,
    /// Any other type value, or no readable Type field.
    Unknown,
}

/// Extract the UnixFS type from a dag-pb Data field.
///
/// Scans the UnixFS protobuf for the first varint field 1 (Type) and maps it
/// to a [`UnixFsType`]. Length-delimited fields in front of it are skipped.
/// Returns `UnixFsType::Unknown` when the Type field is absent, has an
/// unrecognised value, or the message is malformed or truncated — including a
/// length prefix that points past the end of the input, which is checked
/// without overflowing the cursor.
fn unixfs_type(unixfs: &[u8]) -> UnixFsType {
    let mut pos = 0;
    while pos < unixfs.len() {
        let tag = unixfs[pos];
        let field_num = tag >> 3;
        let wire_type = tag & 0x7;
        pos += 1;
        match wire_type {
            0 => {
                let (val, n) = match read_uvarint(&unixfs[pos..]) {
                    Ok(v) => v,
                    Err(_) => return UnixFsType::Unknown,
                };
                pos += n;
                if field_num == 1 {
                    return match val {
                        0 => UnixFsType::Raw,
                        1 => UnixFsType::Directory,
                        2 => UnixFsType::File,
                        _ => UnixFsType::Unknown,
                    };
                }
            }
            2 => {
                let (length, n) = match read_uvarint(&unixfs[pos..]) {
                    Ok(v) => v,
                    Err(_) => return UnixFsType::Unknown,
                };
                pos += n;
                // `pos <= unixfs.len()` here; comparing against the remainder
                // avoids the overflow that `pos + length` could hit.
                if length > unixfs.len() - pos {
                    return UnixFsType::Unknown;
                }
                pos += length;
            }
            // Other wire types are not expected before the Type field.
            _ => break,
        }
    }
    UnixFsType::Unknown
}

/// Tell whether a dag-pb block is a UnixFS directory.
///
/// True only when the block has a Data field and the UnixFS type decoded from
/// it is `Directory`; a block without a Data field is never a directory.
fn is_directory_node(block: &[u8]) -> bool {
    matches!(
        extract_dagpb_data(block).map(|payload| unixfs_type(&payload)),
        Some(UnixFsType::Directory)
    )
}

/// Magic number that opens a zstd frame.
const ZSTD_MAGIC: [u8; 4] = [0x28, 0xB5, 0x2F, 0xFD];

/// Transparently inflate zstd-compressed content.
///
/// Input that does not begin with the zstd magic number is handed back
/// untouched. Otherwise it is decoded in 8 KiB steps, and decoding is aborted
/// with an error as soon as the output grows past 256 MB.
///
/// Decoding goes through `ruzstd::StreamingDecoder` (pure Rust) rather than
/// `zstd::decode_all` (C FFI) so that this function compiles to
/// `wasm32-unknown-unknown`.
fn maybe_decompress_zstd(data: Vec<u8>) -> Result<Vec<u8>, String> {
    use std::io::Read;

    const MAX_DECOMPRESSED: usize = 256 * 1024 * 1024; // 256 MB

    if !data.starts_with(&ZSTD_MAGIC) {
        return Ok(data);
    }

    let mut decoder = ruzstd::decoding::StreamingDecoder::new(data.as_slice())
        .map_err(|e| format!("zstd init failed: {e}"))?;
    let mut inflated = Vec::new();
    let mut chunk = [0u8; 8192];
    loop {
        let filled = decoder
            .read(&mut chunk)
            .map_err(|e| format!("zstd decompress failed: {e}"))?;
        if filled == 0 {
            // A zero-length read marks the end of the stream.
            return Ok(inflated);
        }
        inflated.extend_from_slice(&chunk[..filled]);
        if inflated.len() > MAX_DECOMPRESSED {
            return Err(format!(
                "zstd decompressed size exceeds {MAX_DECOMPRESSED} bytes"
            ));
        }
    }
}

/// Rebuild the bytes of a file from the block identified by `cid_bytes`.
///
/// * A block with links is treated as a multi-chunk file: each child is
///   reassembled with one less unit of `depth` and the results are
///   concatenated in link order.
/// * A block without links is a leaf. If it carries a dag-pb Data field, the
///   UnixFS payload inside is used; otherwise the block bytes themselves are
///   the content. Either way the leaf passes through zstd detection.
///
/// Fails when `depth` is zero, the block is missing, the block is a directory
/// node (the caller should recurse instead), or any chunk fails.
fn reassemble_file(
    blocks: &HashMap<Vec<u8>, Vec<u8>>,
    cid_bytes: &[u8],
    depth: usize,
) -> Result<Vec<u8>, String> {
    if depth == 0 {
        return Err("max recursion depth reached".into());
    }
    let block = blocks.get(cid_bytes).ok_or("block not found")?;

    if is_directory_node(block) {
        return Err("directory node".into());
    }

    let links = parse_dagpb_links(block);

    if !links.is_empty() {
        // Multi-chunk file: stitch the children together in order.
        return links.iter().try_fold(
            Vec::new(),
            |mut joined, (_name, child_cid, _size)| -> Result<Vec<u8>, String> {
                let piece = reassemble_file(blocks, child_cid, depth - 1)?;
                joined.extend_from_slice(&piece);
                Ok(joined)
            },
        );
    }

    // Leaf: UnixFS-wrapped payload if there is a Data field, raw bytes otherwise.
    let payload = match extract_dagpb_data(block) {
        Some(unixfs) => extract_unixfs_data(&unixfs)?,
        None => block.clone(),
    };
    maybe_decompress_zstd(payload)
}

/// Extract the Data field (field 1) from a dag-pb node.
///
/// Returns a copy of the first length-delimited field 1 found while scanning
/// the top-level fields of `pb`, or `None` when there is no such field or the
/// input is not well-formed protobuf.
///
/// The declared field length is validated against the remaining input, so
/// truncated blocks — and raw leaf bytes that merely resemble a protobuf
/// tag — yield `None` instead of an out-of-bounds slice panic.
fn extract_dagpb_data(pb: &[u8]) -> Option<Vec<u8>> {
    let mut pos = 0;
    while pos < pb.len() {
        let tag = pb[pos];
        let field_num = tag >> 3;
        let wire_type = tag & 0x7;
        pos += 1;
        match wire_type {
            2 => {
                let (length, n) = read_uvarint(&pb[pos..]).ok()?;
                pos += n;
                // Checked add + bounds filter: the field must lie inside `pb`.
                let end = pos.checked_add(length).filter(|&end| end <= pb.len())?;
                if field_num == 1 {
                    return Some(pb[pos..end].to_vec());
                }
                pos = end;
            }
            0 => {
                let (_, n) = read_uvarint(&pb[pos..]).ok()?;
                pos += n;
            }
            // Unsupported wire type: give up on this block.
            _ => break,
        }
    }
    None
}

/// Extract file data from a UnixFS protobuf.
/// UnixFS: field 1 = Type (varint), field 2 = Data (bytes), field 3 = filesize.
///
/// Returns a copy of the first length-delimited field 2.
///
/// # Errors
///
/// Returns an error when a varint is malformed, when a field's declared
/// length runs past the end of the message (previously an out-of-bounds slice
/// panic), or when the message contains no Data field.
fn extract_unixfs_data(unixfs: &[u8]) -> Result<Vec<u8>, String> {
    let mut pos = 0;
    while pos < unixfs.len() {
        let tag = unixfs[pos];
        let field_num = tag >> 3;
        let wire_type = tag & 0x7;
        pos += 1;
        match wire_type {
            2 => {
                let (length, n) = read_uvarint(&unixfs[pos..])?;
                pos += n;
                // Checked add + bounds filter: the field must lie inside the message.
                let end = pos
                    .checked_add(length)
                    .filter(|&end| end <= unixfs.len())
                    .ok_or_else(|| String::from("UnixFS field length exceeds node size"))?;
                if field_num == 2 {
                    return Ok(unixfs[pos..end].to_vec());
                }
                pos = end;
            }
            0 => {
                let (_, n) = read_uvarint(&unixfs[pos..])?;
                pos += n;
            }
            // Unsupported wire type: stop scanning.
            _ => break,
        }
    }
    Err("no data in UnixFS node".into())
}

/// Render bytes as lowercase hexadecimal, two digits per byte (used in error
/// messages).
fn hex(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut text = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        text.push(char::from(DIGITS[usize::from(byte >> 4)]));
        text.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    text
}

/// Read an unsigned varint (LEB128). Returns (value, bytes_consumed).
///
/// Each input byte carries 7 payload bits, least-significant group first;
/// the high bit marks that another byte follows. Bytes after the terminating
/// byte are ignored.
///
/// # Errors
///
/// * `"empty data for uvarint"` — `data` is empty;
/// * `"uvarint overflows usize"` — a payload group has bits that do not fit
///   in `usize` (instead of being silently discarded);
/// * `"uvarint too long"` — continuation bits extend past the width of
///   `usize`;
/// * `"unterminated uvarint"` — the input ends before a terminating byte.
pub fn read_uvarint(data: &[u8]) -> Result<(usize, usize), String> {
    if data.is_empty() {
        return Err("empty data for uvarint".into());
    }
    let mut value: usize = 0;
    // Invariant: `shift < usize::BITS` whenever it is used as a shift amount,
    // so the shifts below are well-defined on 32-bit targets (wasm32) too.
    let mut shift: u32 = 0;
    for (i, &byte) in data.iter().enumerate() {
        let group = usize::from(byte & 0x7f);
        let shifted = group << shift;
        // Shifting back must round-trip; otherwise high bits fell off the top.
        if shifted >> shift != group {
            return Err("uvarint overflows usize".into());
        }
        value |= shifted;
        if byte & 0x80 == 0 {
            return Ok((value, i + 1));
        }
        shift += 7;
        if shift >= usize::BITS {
            return Err("uvarint too long".into());
        }
    }
    Err("unterminated uvarint".into())
}

#[cfg(test)]
mod tests {
    use super::*;
    use sha2::{Digest, Sha256};

    // ---- encoding helpers ----

    /// Serialise `val` as a LEB128 unsigned varint: 7 bits per byte, low
    /// group first, with the high bit set on every byte except the last.
    fn encode_uvarint(mut val: usize) -> Vec<u8> {
        let mut encoded = Vec::new();
        // Emit continuation bytes while more than 7 bits remain.
        while val >= 0x80 {
            encoded.push((val & 0x7f) as u8 | 0x80);
            val >>= 7;
        }
        // Final byte: fewer than 8 significant bits, continuation bit clear.
        encoded.push(val as u8);
        encoded
    }

    /// Construct the binary CIDv1 for `data`: varint version 1, varint codec
    /// 0x70 (dag-pb), then a sha2-256 multihash (code 0x12, length 32, digest).
    fn build_cidv1_sha256(data: &[u8]) -> Vec<u8> {
        // Varint prefix fields in wire order:
        // version, codec, hash function code, digest length.
        let mut cid: Vec<u8> = [1usize, 0x70, 0x12, 32]
            .iter()
            .flat_map(|&field| encode_uvarint(field))
            .collect();
        cid.extend_from_slice(Sha256::digest(data).as_slice());
        cid
    }

    /// Encode `s` as a CBOR text string (major type 3).
    ///
    /// Lengths up to 23 are packed into the initial byte; anything longer uses
    /// the one-byte length form (`0x78`), with the length cast to `u8`.
    fn cbor_text(s: &str) -> Vec<u8> {
        let len = s.len();
        let mut encoded = match len {
            0..=23 => vec![0x60 | len as u8],
            _ => vec![0x78, len as u8],
        };
        encoded.extend_from_slice(s.as_bytes());
        encoded
    }

    /// Encode `b` as a CBOR byte string (major type 2).
    ///
    /// Lengths up to 23 are packed into the initial byte; anything longer uses
    /// the one-byte length form (`0x58`), with the length cast to `u8`.
    fn cbor_bytes(b: &[u8]) -> Vec<u8> {
        let len = b.len();
        let head: Vec<u8> = if len > 23 {
            vec![0x58, len as u8]
        } else {
            vec![0x40 | len as u8]
        };
        [head.as_slice(), b].concat()
    }

    /// Encode `raw_cid` as a CBOR tag-42 link: the tag bytes `0xd8 0x2a`
    /// followed by a byte string holding a `0x00` multibase prefix and the
    /// raw CID bytes.
    fn cbor_tag42_cid(raw_cid: &[u8]) -> Vec<u8> {
        // Byte-string payload: 0x00 prefix, then the CID itself.
        let payload: Vec<u8> = std::iter::once(0x00u8)
            .chain(raw_cid.iter().copied())
            .collect();
        // Tag 42 header, then the wrapped byte string.
        let mut encoded = vec![0xd8u8, 0x2a];
        encoded.extend(cbor_bytes(&payload));
        encoded
    }

    /// Produce CARv1 DAG-CBOR header bytes declaring `root_cids` as roots.
    ///
    /// Layout: a two-entry CBOR map, `"version" -> 1` followed by
    /// `"roots" -> [tag42(cid), ...]`.
    fn build_car_header(root_cids: &[&[u8]]) -> Vec<u8> {
        // 0xa2 = map (major type 5) with two entries.
        let mut header = vec![0xa2u8];

        // "version" -> 1 (unsigned int, major type 0, additional info 1).
        header.extend(cbor_text("version"));
        header.push(0x01);

        // "roots" -> array head, then one tag-42 CID per root.
        header.extend(cbor_text("roots"));
        let count = root_cids.len();
        match count {
            0..=23 => header.push(0x80 | count as u8),
            _ => header.extend_from_slice(&[0x98, count as u8]),
        }
        header.extend(root_cids.iter().flat_map(|cid| cbor_tag42_cid(cid)));
        header
    }

    /// Frame one CAR block-section entry: a varint holding the combined
    /// length of `cid` and `data`, followed by the CID and then the data.
    fn build_car_block(cid: &[u8], data: &[u8]) -> Vec<u8> {
        let len_prefix = encode_uvarint(cid.len() + data.len());
        [len_prefix.as_slice(), cid, data].concat()
    }

    /// Build a full CARv1 byte stream: the varint-length-prefixed `header`
    /// followed by every already-framed entry of `blocks`, in order.
    fn build_car(header: &[u8], blocks: &[Vec<u8>]) -> Vec<u8> {
        let mut car = encode_uvarint(header.len());
        car.extend_from_slice(header);
        // Blocks carry their own length prefix, so they are appended verbatim.
        car.extend(blocks.iter().flatten());
        car
    }

    // ---- dag-pb / UnixFS encoding helpers ----

    /// Encode a length-delimited protobuf field (wire type 2): a one-byte
    /// key, the varint payload length, then the payload itself.
    fn pb_field_bytes(field_num: u8, payload: &[u8]) -> Vec<u8> {
        let key = (field_num << 3) | 2;
        let len_prefix = encode_uvarint(payload.len());

        let mut field = Vec::with_capacity(1 + len_prefix.len() + payload.len());
        field.push(key);
        field.extend(len_prefix);
        field.extend_from_slice(payload);
        field
    }

    /// Encode a varint protobuf field (wire type 0): a one-byte key followed
    /// by the varint-encoded value.
    fn pb_field_varint(field_num: u8, val: usize) -> Vec<u8> {
        // Wire type 0 adds no bits, so the key is just the shifted field number.
        std::iter::once(field_num << 3)
            .chain(encode_uvarint(val))
            .collect()
    }

    /// Create a dag-pb block for a UnixFS file leaf holding `file_data`.
    ///
    /// The dag-pb Data field (field 1) wraps a UnixFS message made of
    /// `Type = 2` (File) followed by `Data = file_data`.
    fn build_unixfs_file_block(file_data: &[u8]) -> Vec<u8> {
        let type_field = pb_field_varint(1, 2);
        let data_field = pb_field_bytes(2, file_data);
        let unixfs = [type_field, data_field].concat();
        // Outer dag-pb node: only the Data field, no links.
        pb_field_bytes(1, &unixfs)
    }

    /// Create a dag-pb block for a UnixFS directory with exactly one entry.
    ///
    /// The node holds a single PBLink (dag-pb field 2) with
    /// `Hash = child_cid`, `Name = name`, `Tsize = tsize`, followed by the
    /// Data field (dag-pb field 1) carrying UnixFS `Type = 1` (Directory).
    fn build_unixfs_dir_block(child_cid: &[u8], name: &str, tsize: usize) -> Vec<u8> {
        // PBLink message: Hash, Name, Tsize.
        let link = [
            pb_field_bytes(1, child_cid),
            pb_field_bytes(2, name.as_bytes()),
            pb_field_varint(3, tsize),
        ]
        .concat();

        // UnixFS payload: just the Directory type marker.
        let unixfs_dir = pb_field_varint(1, 1);

        // The link is emitted before the Data field.
        [pb_field_bytes(2, &link), pb_field_bytes(1, &unixfs_dir)].concat()
    }

    // ---- tests ----

    /// Regression test: the root directory block is the LAST entry of the
    /// block section. A "first CIDv1 block is the root" heuristic would pick
    /// the leaf here; the root must come from the header instead.
    #[test]
    fn test_root_block_last() {
        let file_content = b"hello";

        // Leaf holding the file bytes, and a directory that links to it.
        let leaf_block_data = build_unixfs_file_block(file_content);
        let leaf_cid = build_cidv1_sha256(&leaf_block_data);
        let dir_block_data = build_unixfs_dir_block(&leaf_cid, "test.txt", file_content.len());
        let dir_cid = build_cidv1_sha256(&dir_block_data);

        // The header names the directory as root, but the leaf is written first.
        let car = build_car(
            &build_car_header(&[dir_cid.as_slice()]),
            &[
                build_car_block(&leaf_cid, &leaf_block_data),
                build_car_block(&dir_cid, &dir_block_data),
            ],
        );

        let assets = parse_car_to_assets(&car).expect("should parse successfully");
        let extracted = assets.get("test.txt").map(|bytes| bytes.as_slice());
        assert_eq!(
            extracted,
            Some(&file_content[..]),
            "file content should match"
        );
    }

    /// Parsing must fail when the header's roots array is empty.
    #[test]
    fn test_rejects_empty_roots() {
        // Header with zero roots and an empty block section.
        let car = build_car(&build_car_header(&[]), &[]);

        let err = parse_car_to_assets(&car).expect_err("should fail for empty roots");
        let mentions_no_roots = err.contains("no roots");
        assert!(
            mentions_no_roots,
            "error should mention 'no roots', got: {err}"
        );
    }

    /// Parsing must fail when the header names a root CID that has no
    /// matching block in the block section.
    #[test]
    fn test_rejects_missing_root_block() {
        // A well-formed CID, but the CAR carries no blocks at all.
        let absent_root = build_cidv1_sha256(b"nonexistent block data");
        let car = build_car(&build_car_header(&[absent_root.as_slice()]), &[]);

        let err = parse_car_to_assets(&car).expect_err("should fail for missing root block");
        let mentions_missing = err.contains("not found in block section");
        assert!(
            mentions_missing,
            "error should mention 'not found in block section', got: {err}"
        );
    }

    /// `parse_car_header_roots` returns exactly the one CID placed in a
    /// single-root header, with the multibase prefix removed.
    #[test]
    fn test_parse_car_header_roots_single() {
        let cid = build_cidv1_sha256(b"some block");

        let roots = parse_car_header_roots(&build_car_header(&[cid.as_slice()]))
            .expect("should parse header");

        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0], cid, "extracted CID should match the input CID");
    }

    /// A valid header cut off part-way through its roots array must yield Err.
    #[test]
    fn test_parse_car_header_roots_rejects_truncated() {
        let cid = build_cidv1_sha256(b"some block");
        let header = build_car_header(&[cid.as_slice()]);

        // Keep only the first half; for this single-root header the midpoint
        // falls inside the encoded root CID.
        let cut = header.len() / 2;
        let outcome = parse_car_header_roots(&header[..cut]);

        assert!(outcome.is_err(), "truncated header should return Err");
    }

    /// When the root is a file whose content is itself a CARv1 archive
    /// (nested CAR), the parser should reassemble the outer file and then
    /// parse the inner CAR to produce the final asset map.
    #[test]
    fn test_nested_car_file_root() {
        let inner_file_content = b"<html>hello</html>";

        // Inner CAR: a directory holding a single "index.html" leaf.
        let inner_car = {
            let leaf = build_unixfs_file_block(inner_file_content);
            let leaf_cid = build_cidv1_sha256(&leaf);
            let dir = build_unixfs_dir_block(&leaf_cid, "index.html", inner_file_content.len());
            let dir_cid = build_cidv1_sha256(&dir);
            build_car(
                &build_car_header(&[dir_cid.as_slice()]),
                &[
                    build_car_block(&leaf_cid, &leaf),
                    build_car_block(&dir_cid, &dir),
                ],
            )
        };

        // Outer CAR: one UnixFS file block, declared as root, whose payload
        // is the inner CAR's bytes.
        let outer_leaf = build_unixfs_file_block(&inner_car);
        let outer_leaf_cid = build_cidv1_sha256(&outer_leaf);
        let outer_car = build_car(
            &build_car_header(&[outer_leaf_cid.as_slice()]),
            &[build_car_block(&outer_leaf_cid, &outer_leaf)],
        );

        // The nested archive should be detected and its files extracted.
        let assets = parse_car_to_assets(&outer_car).expect("should parse nested CAR");
        let extracted = assets.get("index.html").map(|bytes| bytes.as_slice());
        assert_eq!(
            extracted,
            Some(&inner_file_content[..]),
            "nested CAR should yield the inner file"
        );
    }

    /// A root that is a plain file (neither a directory nor a nested CAR)
    /// should be exposed under the name `index.html`.
    #[test]
    fn test_single_file_root() {
        let file_content = b"<html>single page</html>";

        // One leaf block that is also the declared root.
        let leaf = build_unixfs_file_block(file_content);
        let leaf_cid = build_cidv1_sha256(&leaf);
        let car = build_car(
            &build_car_header(&[leaf_cid.as_slice()]),
            &[build_car_block(&leaf_cid, &leaf)],
        );

        let assets = parse_car_to_assets(&car).expect("should parse single file root");
        let extracted = assets.get("index.html").map(|bytes| bytes.as_slice());
        assert_eq!(
            extracted,
            Some(&file_content[..]),
            "single file root should be returned as index.html"
        );
    }
}