use crate::error::{Result, RypeError};
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::Path;
use super::options::hex_u64;
use super::{files, FORMAT_MAGIC, FORMAT_VERSION};
/// Top-level manifest of an index directory.
///
/// [`ParquetManifest::save`] serializes this struct as pretty-printed TOML into
/// the `files::MANIFEST` file of the index directory, and
/// [`ParquetManifest::load`] parses it back and validates `magic` and
/// `format_version`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParquetManifest {
/// Format identifier; `load` rejects a manifest whose value differs from `FORMAT_MAGIC`.
pub magic: String,
/// Format version; `load` rejects values greater than `FORMAT_VERSION`.
pub format_version: u32,
/// Presumably the k-mer length used when extracting minimizers (TODO confirm).
pub k: usize,
/// Presumably the minimizer window size (TODO confirm).
pub w: usize,
/// Serialized and deserialized through the `hex_u64` helper module.
/// Presumably the salt applied when hashing minimizers (TODO confirm).
#[serde(with = "hex_u64")]
pub salt: u64,
/// Serialized and deserialized through the `hex_u64` helper module.
/// Initialized to 0 by [`ParquetManifest::new`].
#[serde(with = "hex_u64")]
pub source_hash: u64,
/// Bucket count; initialized to 0 by [`ParquetManifest::new`].
pub num_buckets: u32,
/// Minimizer total; initialized to 0 by [`ParquetManifest::new`].
pub total_minimizers: u64,
/// Optional inverted-index section. A missing key deserializes to `None`,
/// and `None` is omitted entirely when serializing.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub inverted: Option<InvertedManifest>,
}
/// On-disk format tag for inverted shards.
///
/// Variant names are serialized in lowercase (so `Parquet` becomes `"parquet"`).
/// `Parquet` is currently the only variant and also the `Default`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
#[serde(rename_all = "lowercase")]
pub enum ParquetShardFormat {
/// Default (and only) shard format.
#[default]
Parquet,
}
/// Inverted-index section of a [`ParquetManifest`], stored in its optional
/// `inverted` field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct InvertedManifest {
/// Shard format; falls back to `ParquetShardFormat::default()` when the key
/// is absent from the serialized manifest.
#[serde(default)]
pub format: ParquetShardFormat,
/// Number of shards; presumably equal to `shards.len()` (verify against the writer).
pub num_shards: u32,
/// Entry total; presumably the sum of `num_entries` over all shards (TODO confirm).
pub total_entries: u64,
/// Presumably set when the minimizer ranges of different shards may
/// intersect (TODO confirm against the code that builds the shards).
pub has_overlapping_shards: bool,
/// Per-shard descriptors.
pub shards: Vec<InvertedShardInfo>,
}
/// Descriptor of a single inverted-index shard, as listed in
/// [`InvertedManifest::shards`].
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct InvertedShardInfo {
/// Identifier of the shard.
pub shard_id: u32,
/// Serialized and deserialized through the `hex_u64` helper module.
/// Presumably the smallest minimizer stored in the shard (TODO confirm).
#[serde(with = "hex_u64")]
pub min_minimizer: u64,
/// Serialized and deserialized through the `hex_u64` helper module.
/// Presumably the largest minimizer stored in the shard (TODO confirm).
#[serde(with = "hex_u64")]
pub max_minimizer: u64,
/// Number of entries recorded for the shard.
pub num_entries: u64,
}
impl ParquetManifest {
    /// Builds a manifest for the given `k`, `w` and `salt`.
    ///
    /// `magic` and `format_version` are taken from `FORMAT_MAGIC` and
    /// `FORMAT_VERSION`; all counters and `source_hash` start at zero and no
    /// inverted section is attached.
    pub fn new(k: usize, w: usize, salt: u64) -> Self {
        Self {
            magic: FORMAT_MAGIC.to_string(),
            format_version: FORMAT_VERSION,
            k,
            w,
            salt,
            source_hash: 0,
            num_buckets: 0,
            total_minimizers: 0,
            inverted: None,
        }
    }

    /// Writes this manifest as pretty-printed TOML to `files::MANIFEST`
    /// inside `index_dir`.
    ///
    /// # Errors
    ///
    /// Returns an encoding error when TOML serialization fails and an I/O
    /// error (carrying the manifest path) when the file cannot be written.
    pub fn save(&self, index_dir: &Path) -> Result<()> {
        let rendered = toml::to_string_pretty(self)
            .map_err(|e| RypeError::encoding(format!("serialize manifest: {}", e)))?;

        let manifest_path = index_dir.join(files::MANIFEST);
        fs::write(&manifest_path, &rendered)
            .map_err(|e| RypeError::io(manifest_path.clone(), "write manifest", e))?;

        Ok(())
    }

    /// Reads and validates the manifest stored in `index_dir`.
    ///
    /// # Errors
    ///
    /// Returns an I/O error when the manifest file cannot be read, and a
    /// format error when the TOML cannot be parsed, when `magic` differs from
    /// `FORMAT_MAGIC`, or when `format_version` is greater than
    /// `FORMAT_VERSION`.
    pub fn load(index_dir: &Path) -> Result<Self> {
        let path = index_dir.join(files::MANIFEST);

        let raw = fs::read_to_string(&path)
            .map_err(|e| RypeError::io(path.clone(), "read manifest", e))?;
        let manifest = toml::from_str::<Self>(&raw)
            .map_err(|e| RypeError::format(path.clone(), format!("parse manifest: {}", e)))?;

        // The magic is checked before the version so that a foreign file is
        // reported as such rather than as a version mismatch.
        let rejection = if manifest.magic != FORMAT_MAGIC {
            Some(format!(
                "invalid manifest magic: expected '{}', got '{}'",
                FORMAT_MAGIC, manifest.magic
            ))
        } else if manifest.format_version > FORMAT_VERSION {
            Some(format!(
                "unsupported format version: {} (max supported: {})",
                manifest.format_version, FORMAT_VERSION
            ))
        } else {
            None
        };

        match rejection {
            Some(detail) => Err(RypeError::format(path, detail)),
            None => Ok(manifest),
        }
    }
}
/// Descriptive information about a bucket, carrying a minimizer count rather
/// than the minimizers themselves (compare [`BucketData`]).
#[derive(Debug, Clone)]
pub struct BucketMetadata {
/// Numeric identifier of the bucket.
pub bucket_id: u32,
/// Name of the bucket.
pub bucket_name: String,
/// Source labels associated with the bucket; presumably the input
/// sequences or files it was built from (TODO confirm).
pub sources: Vec<String>,
/// Number of minimizers in the bucket.
pub minimizer_count: usize,
}
/// A bucket together with its full minimizer list.
#[derive(Debug, Clone)]
pub struct BucketData {
/// Numeric identifier of the bucket; included in validation error messages.
pub bucket_id: u32,
/// Name of the bucket.
pub bucket_name: String,
/// Source labels associated with the bucket; presumably the input
/// sequences or files it was built from (TODO confirm).
pub sources: Vec<String>,
/// Minimizer values. [`BucketData::validate`] accepts them only when they
/// are strictly increasing (sorted with no duplicates).
pub minimizers: Vec<u64>,
}
impl BucketData {
    /// Checks that `minimizers` is strictly increasing.
    ///
    /// Empty and single-element lists are trivially valid.
    ///
    /// # Errors
    ///
    /// Returns a validation error for the first adjacent pair that is not
    /// strictly increasing: one message for an exact duplicate, another for
    /// an out-of-order pair.
    pub fn validate(&self) -> Result<()> {
        for (left_pos, pair) in self.minimizers.windows(2).enumerate() {
            let (left, right) = (pair[0], pair[1]);
            if right > left {
                continue;
            }

            // `right_pos` is the index of the offending element in `minimizers`.
            let right_pos = left_pos + 1;
            let message = if right == left {
                format!(
                    "bucket {} has duplicate minimizer at position {}: {:#x}",
                    self.bucket_id, right_pos, right
                )
            } else {
                format!(
                    "bucket {} has unsorted minimizers at positions {}-{}: {:#x} > {:#x}",
                    self.bucket_id, left_pos, right_pos, left, right
                )
            };
            return Err(RypeError::validation(message));
        }
        Ok(())
    }
}
/// Reports whether `path` looks like an index directory.
///
/// Returns `true` only when `path` is a directory, it contains a
/// `files::MANIFEST` file, that file can be read as UTF-8 text, and the text
/// contains `FORMAT_MAGIC` somewhere. The manifest is not parsed; any read
/// failure yields `false`.
pub fn is_parquet_index(path: &Path) -> bool {
    let manifest_path = path.join(files::MANIFEST);

    // `&&` short-circuits, so the file is only read once the cheaper
    // directory and existence checks have passed.
    path.is_dir()
        && manifest_path.exists()
        && matches!(
            fs::read_to_string(&manifest_path),
            Ok(content) if content.contains(FORMAT_MAGIC)
        )
}
/// Creates the index directory at `path` along with its
/// `files::INVERTED_DIR` subdirectory, including any missing parents.
///
/// Directories that already exist are left in place.
///
/// # Errors
///
/// Returns an I/O error naming the directory that could not be created.
pub fn create_index_directory(path: &Path) -> Result<()> {
    // Created in order: the index root first, then the inverted subdirectory,
    // so a failure is reported against the outermost directory that failed.
    let targets = [
        (path.to_path_buf(), "create index directory"),
        (path.join(files::INVERTED_DIR), "create inverted directory"),
    ];

    for (dir, action) in targets {
        fs::create_dir_all(&dir).map_err(|e| RypeError::io(dir.clone(), action, e))?;
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A manifest written with `save` must come back unchanged from `load`.
    #[test]
    fn test_manifest_round_trip() {
        let scratch = TempDir::new().unwrap();
        let index_dir = scratch.path().join("test.ryidx");
        create_index_directory(&index_dir).unwrap();

        let written = ParquetManifest {
            num_buckets: 10,
            total_minimizers: 1_000_000,
            source_hash: 0xDEADBEEF,
            ..ParquetManifest::new(64, 50, 12345)
        };
        written.save(&index_dir).unwrap();

        let loaded = ParquetManifest::load(&index_dir).unwrap();
        assert_eq!(loaded.magic, FORMAT_MAGIC);
        assert_eq!(loaded.format_version, FORMAT_VERSION);
        assert_eq!(loaded.k, 64);
        assert_eq!(loaded.w, 50);
        assert_eq!(loaded.salt, 12345);
        assert_eq!(loaded.num_buckets, 10);
        assert_eq!(loaded.total_minimizers, 1_000_000);
        assert_eq!(loaded.source_hash, 0xDEADBEEF);
    }

    /// `is_parquet_index` rejects plain files and manifest-less directories,
    /// and accepts a directory holding a saved manifest.
    #[test]
    fn test_is_parquet_index() {
        let scratch = TempDir::new().unwrap();
        let root = scratch.path();

        // A regular file is never an index, whatever its name.
        let plain_file = root.join("not_a_dir.ryidx");
        std::fs::write(&plain_file, "test").unwrap();
        assert!(!is_parquet_index(&plain_file));

        // A directory without a manifest is not an index either.
        let bare_dir = root.join("empty.ryidx");
        std::fs::create_dir(&bare_dir).unwrap();
        assert!(!is_parquet_index(&bare_dir));

        // A directory with a saved manifest is recognized.
        let index_dir = root.join("valid.ryidx");
        create_index_directory(&index_dir).unwrap();
        ParquetManifest::new(64, 50, 0).save(&index_dir).unwrap();
        assert!(is_parquet_index(&index_dir));
    }
}