//! dictx-index 0.1.0
//!
//! Index builder and binary entry storage for DictX.
use crate::{tantivy_error, DictxSchema, EntryPackWriter, ENTRY_PACK_FILE, SCHEMA_VERSION};
use dictx_core::{DictEntry, Result};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::fs;
use std::io::Read;
use std::path::{Path, PathBuf};
use std::time::Instant;
use tantivy::Index;

/// Tuning knobs for [`build_index`].
#[derive(Debug, Clone)]
pub struct BuildOptions {
    /// RAM budget for the Tantivy index writer, in MiB (clamped to >= 16 inside `build_index`).
    pub ram_mb: usize,
    /// Request a rebuild even when an index already exists.
    /// NOTE(review): `build_index` currently wipes and rebuilds unconditionally,
    /// so this flag is never consulted — confirm whether callers rely on it.
    pub force: bool,
    /// Human-readable name of the dictionary source, recorded in the metadata file.
    pub source_name: String,
    /// Optional path to the source file; when set, its SHA-256 is recorded in metadata.
    pub source_path: Option<PathBuf>,
}

impl Default for BuildOptions {
    fn default() -> Self {
        Self {
            ram_mb: 128,
            force: false,
            source_name: "default".to_string(),
            source_path: None,
        }
    }
}

/// Sidecar metadata persisted as `dictx-meta.json` next to the index files.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexMetadata {
    /// Schema version the index was built with (see `SCHEMA_VERSION`).
    pub schema_version: u32,
    /// Name of the dictionary source supplied at build time.
    pub source_name: String,
    /// Path of the source file, if one was provided.
    pub source_path: Option<PathBuf>,
    /// SHA-256 hex digest of the source file; `None` when no path was given
    /// or when hashing failed.
    pub source_sha256: Option<String>,
    /// Number of entries indexed.
    pub entries: usize,
    /// Build timestamp in seconds since the Unix epoch (0 if the clock was unavailable).
    pub built_at_unix: u64,
    /// Total on-disk size of the index directory in bytes.
    pub index_bytes: u64,
    /// File name of the entry pack inside the index directory, when one was written.
    pub entry_pack: Option<String>,
    /// Size of the entry pack file in bytes (`Some(0)` if it could not be stat'ed).
    pub entry_pack_bytes: Option<u64>,
}

/// Summary of a completed [`build_index`] run.
#[derive(Debug, Clone)]
pub struct BuildStats {
    /// Number of entries successfully indexed.
    pub entries: usize,
    /// Wall-clock build time in milliseconds.
    pub elapsed_ms: u128,
    /// Total on-disk size of the index directory in bytes.
    pub index_bytes: u64,
}

pub fn open_index(path: &Path) -> Result<Index> {
    Index::open_in_dir(path).map_err(tantivy_error)
}

/// Builds a brand-new index (and entry pack) in `index_dir` from `entries`.
///
/// Any existing contents of `index_dir` are removed first: `build` is a full
/// materialization step, so rebuilding avoids duplicate documents and lets
/// schema/storage upgrades switch formats cleanly.
/// NOTE(review): `options.force` is not consulted — the directory is always
/// wiped; confirm whether callers expect `force: false` to skip rebuilds.
///
/// # Errors
/// Propagates I/O errors, errors yielded by the `entries` iterator, and
/// Tantivy errors (mapped through `tantivy_error`).
pub fn build_index<I>(index_dir: &Path, entries: I, options: &BuildOptions) -> Result<BuildStats>
where
    I: IntoIterator<Item = Result<DictEntry>>,
{
    if index_dir.exists() {
        fs::remove_dir_all(index_dir)?;
    }
    fs::create_dir_all(index_dir)?;

    let dictx_schema = DictxSchema::build();
    let index =
        Index::create_in_dir(index_dir, dictx_schema.schema.clone()).map_err(tantivy_error)?;

    // Clamp the writer budget to at least 16 MiB and saturate the MiB->bytes
    // conversion so an absurd `ram_mb` cannot overflow `usize` on 32-bit
    // targets. (The previous `as usize` cast was redundant: `ram_mb` is usize.)
    let writer_budget = options.ram_mb.max(16).saturating_mul(1024 * 1024);
    let mut writer = index.writer(writer_budget).map_err(tantivy_error)?;

    // Entries are streamed: each record is appended to the binary entry pack,
    // and the returned locator is embedded in the Tantivy document so searches
    // can retrieve the full record later.
    let mut pack = EntryPackWriter::create(&index_dir.join(ENTRY_PACK_FILE))?;
    let start = Instant::now();
    let mut count = 0usize;

    for entry in entries {
        let entry = entry?;
        let locator = pack.append(&entry)?;
        let doc = dictx_schema.to_document(&entry, Some(locator))?;
        writer.add_document(doc).map_err(tantivy_error)?;
        count += 1;
    }

    pack.finish()?;
    writer.commit().map_err(tantivy_error)?;
    // Wait for background segment merges so `dir_size` measures the final layout.
    writer.wait_merging_threads().map_err(tantivy_error)?;

    let index_bytes = dir_size(index_dir)?;
    let stats = BuildStats {
        entries: count,
        elapsed_ms: start.elapsed().as_millis(),
        index_bytes,
    };
    write_metadata(index_dir, options, &stats)?;
    Ok(stats)
}

/// Reads the metadata sidecar for the index at `index_dir`.
///
/// Returns `Ok(None)` when no metadata file exists — an index that has never
/// been built is not an error. I/O and JSON-decoding failures propagate.
pub fn read_metadata(index_dir: &Path) -> Result<Option<IndexMetadata>> {
    let path = metadata_path(index_dir);
    if path.exists() {
        let raw = fs::read_to_string(&path)?;
        let metadata: IndexMetadata = serde_json::from_str(&raw)?;
        Ok(Some(metadata))
    } else {
        Ok(None)
    }
}

/// Serializes [`IndexMetadata`] for a finished build to `dictx-meta.json`
/// inside `index_dir`.
fn write_metadata(index_dir: &Path, options: &BuildOptions, stats: &BuildStats) -> Result<()> {
    // Hash the source file only when a path was supplied; a failed hash
    // degrades to `None` rather than failing the build.
    let source_sha256 = options
        .source_path
        .as_deref()
        .and_then(|path| sha256_file(path).ok());

    // A clock before the epoch degrades to 0 instead of panicking.
    let built_at_unix = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();

    // Pack size is best-effort: a stat failure records 0 bytes.
    let entry_pack_bytes = fs::metadata(index_dir.join(ENTRY_PACK_FILE))
        .map(|metadata| metadata.len())
        .unwrap_or(0);

    let metadata = IndexMetadata {
        schema_version: SCHEMA_VERSION,
        source_name: options.source_name.clone(),
        source_path: options.source_path.clone(),
        source_sha256,
        entries: stats.entries,
        built_at_unix,
        index_bytes: stats.index_bytes,
        entry_pack: Some(ENTRY_PACK_FILE.to_string()),
        entry_pack_bytes: Some(entry_pack_bytes),
    };

    let json = serde_json::to_string_pretty(&metadata)?;
    fs::write(metadata_path(index_dir), json)?;
    Ok(())
}

/// Location of the JSON metadata sidecar inside an index directory.
fn metadata_path(index_dir: &Path) -> PathBuf {
    let mut path = index_dir.to_path_buf();
    path.push("dictx-meta.json");
    path
}

/// Sums the sizes (in bytes) of every regular file under `path`, recursively.
///
/// Walkdir traversal or metadata failures are converted into this crate's
/// string-message error variant.
pub fn dir_size(path: &Path) -> Result<u64> {
    let mut total: u64 = 0;
    for item in walkdir::WalkDir::new(path) {
        let item = item.map_err(|e| dictx_core::DictxError::Message(e.to_string()))?;
        // Directories and symlinks contribute nothing; only count regular files.
        if !item.file_type().is_file() {
            continue;
        }
        let meta = item
            .metadata()
            .map_err(|e| dictx_core::DictxError::Message(e.to_string()))?;
        total += meta.len();
    }
    Ok(total)
}

/// Computes the SHA-256 hex digest of the file at `path`, streaming its
/// contents so arbitrarily large files never need to fit in memory.
///
/// # Errors
/// Returns an error if the file cannot be opened or read.
fn sha256_file(path: &Path) -> Result<String> {
    let mut file = fs::File::open(path)?;
    let mut hasher = Sha256::new();
    // `Sha256` implements `io::Write` (sha2 "std" feature), so `io::copy`
    // replaces the hand-rolled fixed-buffer read loop and handles buffering
    // and retry-on-partial-read itself.
    std::io::copy(&mut file, &mut hasher)?;
    Ok(format!("{:x}", hasher.finalize()))
}

#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource};

    // End-to-end smoke test: building an index from a single entry succeeds,
    // reports one entry in the stats, and writes the metadata sidecar.
    #[test]
    fn builds_small_index() {
        let dir = tempfile::tempdir().unwrap();
        let mut entry = DictEntry::new(DictSource::Custom { name: "t".into() }, "apple");
        entry
            .definitions
            .push(Definition::new("fruit", "苹果", Some("n".into())));

        let stats = build_index(
            dir.path(),
            vec![Ok(entry)],
            &BuildOptions {
                force: true,
                ..BuildOptions::default()
            },
        )
        .unwrap();
        assert_eq!(stats.entries, 1);
        // Metadata is written as a side effect of a successful build.
        assert!(dir.path().join("dictx-meta.json").exists());
    }
}