massmap 0.1.5

Static hash table that scales via disk-backed expansion to trim memory usage while ensuring each lookup needs exactly one I/O.
Documentation
use clap::{Parser, Subcommand};
use foldhash::fast::FixedState;
use massmap::{
    MassMap, MassMapBuilder, MassMapDefaultHashLoader, MassMapHashConfig, MassMapHashLoader,
    MassMapInner, MassMapMerger, MassMapReader,
};
use serde::{Deserialize, Serialize};
use serde_json::Value;
use std::fmt::Display;
use std::fs::File;
use std::io::{BufReader, Error, ErrorKind, Result};
use std::path::{Path, PathBuf};

fn main() -> Result<()> {
    let cli = Cli::parse();
    match cli.command {
        Command::Info(args) => run_info(args),
        Command::Convert(args) => run_convert(args),
        Command::Merge(args) => run_merge(args),
    }
}

// Top-level command-line definition; the parser is derived by clap.
// A subcommand is mandatory (`subcommand_required`), and running the binary
// with no arguments shows help instead (`arg_required_else_help`).
// Plain `//` comments are used here on purpose: clap turns `///` doc comments
// into help text, which would change the program's output.
#[derive(Parser)]
#[command(
    author,
    version,
    about = "massmap utility for inspecting and creating massmap files",
    subcommand_required = true,
    arg_required_else_help = true
)]
struct Cli {
    // The subcommand chosen on the command line, with its parsed arguments.
    #[command(subcommand)]
    command: Command,
}

// Subcommands of the tool. `main` dispatches each variant to its runner:
// `Info` -> `run_info`, `Convert` -> `run_convert`, `Merge` -> `run_merge`.
// The `///` comments on the variants are the help text clap shows for them.
#[derive(Subcommand)]
enum Command {
    /// Inspect a massmap file and print basic information
    Info(InfoArgs),
    /// Convert a JSON key-value file into a massmap binary file
    Convert(ConvertArgs),
    /// Merge multiple massmap binary files into a single massmap binary file
    Merge(MergeArgs),
}

// Arguments of the `info` subcommand, consumed by `run_info`.
// The `///` comments on the fields are the help text clap shows for them.
#[derive(clap::Args)]
struct InfoArgs {
    /// Path to the massmap binary file
    #[arg(value_name = "FILE")]
    input: PathBuf,

    // Given as text; `run_info` parses it into the key type recorded in the
    // file's metadata before performing the lookup.
    /// Optional key to look up in the massmap
    #[arg(short, long)]
    key: Option<String>,

    // Checked against the map's bucket count in `do_query` before use.
    /// Optional bucket index to inspect
    #[arg(short, long)]
    bucket: Option<u64>,
}

// Arguments of the `convert` subcommand, consumed by `run_convert`.
// The `///` comments on the fields are the help text clap shows for them.
#[derive(clap::Args)]
struct ConvertArgs {
    /// Path to the source JSON file containing key-value pairs
    #[arg(short, long, value_name = "FILE")]
    input: PathBuf,

    /// Path to the massmap binary file to produce
    #[arg(short, long, value_name = "FILE")]
    output: PathBuf,

    // Forwarded to `MassMapBuilder::with_hash_seed`.
    /// Seed value for the hash function
    #[arg(long, value_name = "SEED", default_value_t = 0)]
    hash_seed: u64,

    // Default is `1 << 16`, i.e. 65_536 buckets.
    /// Number of buckets in the massmap
    #[arg(long, value_name = "COUNT", default_value_t = 1 << 16)]
    bucket_count: u64,

    // Default is `16 << 20`, i.e. 16 MiB.
    /// Buffer size in bytes for writing the massmap file
    #[arg(long, value_name = "BYTES", default_value_t = 16 << 20)]
    buffer_size: usize,
}

// Arguments of the `merge` subcommand, consumed by `run_merge`.
// The `///` comments on the fields are the help text clap shows for them, so
// they must describe what `run_merge` actually does with each value.
#[derive(clap::Args)]
struct MergeArgs {
    // `run_merge` loads every input as a massmap file, not as JSON; the help
    // text previously repeated the `convert` wording by mistake.
    /// Paths to the massmap binary files to merge (pass the flag once per file)
    #[arg(short, long, value_name = "FILE")]
    input: Vec<PathBuf>,

    /// Path to the massmap binary file to produce
    #[arg(short, long, value_name = "FILE")]
    output: PathBuf,

    // Default is `16 << 20`, i.e. 16 MiB.
    /// Buffer size in bytes for writing the massmap file
    #[arg(long, value_name = "BYTES", default_value_t = 16 << 20)]
    buffer_size: usize,
}

/// Hash loader used by this tool so that files can still be opened when their
/// recorded hash name is not the default one.
///
/// Its [`MassMapHashLoader`] implementation emits a warning for an unexpected
/// hash name and builds a seeded foldhash `FixedState` regardless.
#[derive(Debug, Default)]
pub struct MassMapTolerableHashLoader;

impl MassMapHashLoader for MassMapTolerableHashLoader {
    type BuildHasher = FixedState;

    /// Builds the hasher state for a map from its stored hash configuration.
    ///
    /// A hash name other than `MassMapDefaultHashLoader::NAME` is tolerated:
    /// a warning is written to stderr and foldhash is used anyway. The seed is
    /// read from the `"seed"` parameter and falls back to `0` when the entry
    /// is missing or is not an unsigned integer.
    ///
    /// This implementation never returns an error.
    fn load(config: &MassMapHashConfig) -> Result<Self::BuildHasher> {
        if config.name != MassMapDefaultHashLoader::NAME {
            // Diagnostics go to stderr so that stdout carries only the report
            // printed by the subcommand (e.g. the JSON from `info`).
            eprintln!(
                "Warning: Unsupported hash type: {}, defaulting to foldhash",
                config.name
            );
        }
        let seed = config
            .parameters
            .get("seed")
            .and_then(|v| v.as_u64())
            .unwrap_or(0);
        Ok(FixedState::with_seed(seed))
    }
}

/// Runs the optional lookups requested on the command line against `map`.
///
/// * `key` - when present, the key is looked up and the result is printed
///   with its `Debug` representation.
/// * `bucket` - when present, every entry of that bucket is printed as
///   pretty-formatted JSON.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidInput` when `bucket` is not below the map's
/// bucket count (including values that do not fit in `usize`), and propagates
/// any error from the map lookups or from JSON formatting.
fn do_query<K, R>(
    map: MassMap<K, serde_json::Value, R, MassMapTolerableHashLoader>,
    key: Option<K>,
    bucket: Option<u64>,
) -> Result<()>
where
    K: Serialize + for<'de> Deserialize<'de> + Display + std::hash::Hash + Eq,
    R: MassMapReader,
{
    if let Some(key) = key {
        println!("Get {}: {:?}", key, map.get(&key)?);
    }

    if let Some(bucket_index) = bucket {
        let bucket_count = map.bucket_count();
        // Checked conversion instead of `as`: where `usize` is narrower than
        // 64 bits, a truncating cast could wrap a huge index onto a valid
        // bucket and slip past the range check.
        let index = <usize as std::convert::TryFrom<u64>>::try_from(bucket_index)
            .ok()
            .filter(|index| *index < bucket_count)
            .ok_or_else(|| {
                Error::new(
                    ErrorKind::InvalidInput,
                    format!(
                        "Bucket index {} out of range >= {}",
                        bucket_index, bucket_count
                    ),
                )
            })?;
        let entries = map.get_bucket(index)?;
        let json = serde_json::to_string_pretty(&entries)
            .map_err(|e| Error::other(format!("Failed to format JSON: {e}")))?;
        println!("Bucket {} entries:\n{}", bucket_index, json);
    }

    Ok(())
}

fn run_info(args: InfoArgs) -> Result<()> {
    let file = File::open(&args.input)?;

    let map = MassMapInner::<_, MassMapTolerableHashLoader>::load(file)?;

    let json = serde_json::to_string_pretty(&map.info())
        .map_err(|e| Error::other(format!("Failed to format JSON: {e}")))?;
    println!("{}", json);

    match map.meta.key_type.as_str() {
        "u8" => do_query(
            map.cast::<u8, _>(),
            args.key.map(|x| x.parse().unwrap()),
            args.bucket,
        )?,
        "u16" => do_query(
            map.cast::<u16, _>(),
            args.key.map(|x| x.parse().unwrap()),
            args.bucket,
        )?,
        "u32" => do_query(
            map.cast::<u32, _>(),
            args.key.map(|x| x.parse().unwrap()),
            args.bucket,
        )?,
        "u64" => do_query(
            map.cast::<u64, _>(),
            args.key.map(|x| x.parse().unwrap()),
            args.bucket,
        )?,
        "u128" => do_query(
            map.cast::<u128, _>(),
            args.key.map(|x| x.parse().unwrap()),
            args.bucket,
        )?,
        _ if map.meta.key_type == std::any::type_name::<String>() => {
            do_query(map.cast::<String, _>(), args.key, args.bucket)?
        }
        _ => {
            assert!(
                args.key.is_none() && args.bucket.is_none(),
                "Unsupported key type: {}",
                map.meta.key_type
            );
        }
    }

    Ok(())
}

/// Implements the `convert` subcommand.
///
/// Reads the key-value pairs from the JSON input, builds a massmap file at the
/// output path with the requested seed, bucket count and writer buffer size,
/// and prints the builder's result as pretty-formatted JSON.
///
/// The input is parsed before the output file is created, so an unreadable
/// or invalid input leaves no output file behind.
fn run_convert(args: ConvertArgs) -> Result<()> {
    let pairs = load_entries_from_json(&args.input)?;
    let output = File::create(&args.output)?;

    let summary = MassMapBuilder::default()
        .with_hash_seed(args.hash_seed)
        .with_bucket_count(args.bucket_count)
        .with_writer_buffer_size(args.buffer_size)
        .build(&output, pairs.iter())?;

    match serde_json::to_string_pretty(&summary) {
        Ok(rendered) => println!("{}", rendered),
        Err(e) => return Err(Error::other(format!("Failed to format JSON: {e}"))),
    }

    Ok(())
}

/// Reads the JSON document at `path` and converts it into key-value pairs.
///
/// # Errors
///
/// Propagates the error from opening the file, reports a JSON syntax problem
/// as an `InvalidData` error, and forwards any error from `extract_entries`.
fn load_entries_from_json(path: &Path) -> Result<Vec<(String, Value)>> {
    let reader = BufReader::new(File::open(path)?);
    match serde_json::from_reader::<_, Value>(reader) {
        Ok(document) => extract_entries(document),
        Err(e) => Err(invalid_json(format!("Failed to parse JSON input: {e}"))),
    }
}

/// Turns a parsed JSON document into a list of `(key, value)` pairs.
///
/// Accepted shapes:
/// * an object: every member becomes one pair;
/// * an array whose items are either objects with `"key"` and `"value"`
///   members, or two-element arrays `[key, value]`. Keys must be JSON strings.
///
/// # Errors
///
/// Returns an `InvalidData` error for any other top-level type, for an item of
/// an unsupported shape, for a missing `"key"` / `"value"` member, for a pair
/// array whose length is not 2, and for a non-string key. Processing stops at
/// the first offending item.
fn extract_entries(value: Value) -> Result<Vec<(String, Value)>> {
    let items = match value {
        Value::Object(map) => return Ok(map.into_iter().collect()),
        Value::Array(items) => items,
        other => {
            return Err(invalid_json(format!(
                "unsupported JSON top-level type: {other}"
            )));
        }
    };

    items
        .into_iter()
        .enumerate()
        .map(|(index, item)| -> Result<(String, Value)> {
            // Pull out the raw key and value first; the key's type is only
            // checked afterwards, so a missing field is reported before a
            // non-string key.
            let (key, value) = match item {
                Value::Object(mut obj) => {
                    let key = obj.remove("key").ok_or_else(|| {
                        invalid_json(format!("entry {index} missing 'key' field"))
                    })?;
                    let value = obj.remove("value").ok_or_else(|| {
                        invalid_json(format!("entry {index} missing 'value' field"))
                    })?;
                    (key, value)
                }
                Value::Array(pair) => {
                    let mut parts = pair.into_iter();
                    match (parts.next(), parts.next(), parts.next()) {
                        (Some(key), Some(value), None) => (key, value),
                        _ => {
                            return Err(invalid_json(format!(
                                "entry {index} expected array of length 2"
                            )));
                        }
                    }
                }
                other => {
                    return Err(invalid_json(format!(
                        "unsupported entry format at index {index}: {other}"
                    )));
                }
            };
            Ok((expect_string(key, index)?, value))
        })
        .collect()
}

/// Unwraps a JSON string used as the key of entry number `index`.
///
/// Returns the owned string, or an `InvalidData` error that names the entry
/// and shows the offending value when `value` is not a JSON string.
fn expect_string(value: Value, index: usize) -> Result<String> {
    let other = match value {
        Value::String(text) => return Ok(text),
        non_string => non_string,
    };
    Err(invalid_json(format!(
        "entry {index} expects string key, found {other}"
    )))
}

/// Builds the I/O error used for every problem with the JSON input: kind
/// `ErrorKind::InvalidData`, carrying `message` as its text.
fn invalid_json(message: String) -> Error {
    let kind = ErrorKind::InvalidData;
    Error::new(kind, message)
}

/// Implements the `merge` subcommand.
///
/// Loads every input as a massmap with `String` keys and JSON values, merges
/// them into a new file at the output path, and prints the merger's result as
/// pretty-formatted JSON.
///
/// Inputs are opened and loaded one after another; the first failure aborts
/// the command before the output file is created.
fn run_merge(args: MergeArgs) -> Result<()> {
    let mut sources = Vec::with_capacity(args.input.len());
    for path in &args.input {
        let file = File::open(path)?;
        let loaded =
            MassMap::<String, serde_json::Value, _, MassMapTolerableHashLoader>::load(file)?;
        sources.push(loaded);
    }

    let output = File::create(&args.output)?;
    let summary = MassMapMerger::default()
        .with_writer_buffer_size(args.buffer_size)
        .merge(&output, sources)?;

    match serde_json::to_string_pretty(&summary) {
        Ok(rendered) => println!("{}", rendered),
        Err(e) => return Err(Error::other(format!("Failed to format JSON: {e}"))),
    }

    Ok(())
}