use crate::{tantivy_error, DictxSchema, EntryPackWriter, ENTRY_PACK_FILE, SCHEMA_VERSION};
use dictx_core::{DictEntry, Result};
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::fs;
use std::io::Read;
use std::path::{Path, PathBuf};
use std::time::Instant;
use tantivy::Index;
/// Caller-supplied settings for [`build_index`].
#[derive(Debug, Clone)]
pub struct BuildOptions {
    /// Memory budget for the index writer, in MiB. `build_index` raises any
    /// value below 16 to 16 before converting it to bytes.
    pub ram_mb: usize,
    /// Not read by any code in this module; `build_index` replaces an existing
    /// index directory regardless of this flag.
    /// NOTE(review): presumably interpreted by callers — confirm.
    pub force: bool,
    /// Label copied verbatim into the metadata written next to the index.
    pub source_name: String,
    /// Optional path of the source file. When present it is recorded in the
    /// metadata and its SHA-256 digest is computed on a best-effort basis.
    pub source_path: Option<PathBuf>,
}

impl Default for BuildOptions {
    /// Returns options with a 128 MiB budget, `force` off, the source name
    /// `"default"`, and no source path.
    fn default() -> Self {
        BuildOptions {
            ram_mb: 128,
            force: false,
            source_name: String::from("default"),
            source_path: None,
        }
    }
}
/// Description of a built index, serialized as JSON into the index directory
/// and read back by [`read_metadata`].
///
/// Field order is significant: it determines the key order of the serialized
/// document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexMetadata {
    /// Value of `SCHEMA_VERSION` at the time the index was built.
    pub schema_version: u32,
    /// Source label copied from `BuildOptions::source_name`.
    pub source_name: String,
    /// Source path copied from `BuildOptions::source_path`, if one was given.
    pub source_path: Option<PathBuf>,
    /// Lowercase hex SHA-256 of the source file. `None` when no source path
    /// was given or when hashing the file failed.
    pub source_sha256: Option<String>,
    /// Number of entries added to the index.
    pub entries: usize,
    /// Build time in whole seconds since the Unix epoch (0 if the system clock
    /// reports a time before the epoch).
    pub built_at_unix: u64,
    /// Total size in bytes of the files under the index directory, measured
    /// before the metadata file itself is written.
    pub index_bytes: u64,
    /// File name of the entry pack inside the index directory.
    pub entry_pack: Option<String>,
    /// Size of the entry pack in bytes; recorded as 0 when the file could not
    /// be inspected.
    pub entry_pack_bytes: Option<u64>,
}
/// Summary returned by [`build_index`] after a successful build.
#[derive(Debug, Clone)]
pub struct BuildStats {
    /// Number of entries added to the index.
    pub entries: usize,
    /// Wall-clock milliseconds from just before the first entry is processed
    /// until the index directory size has been measured.
    pub elapsed_ms: u128,
    /// Total size in bytes of the files under the index directory.
    pub index_bytes: u64,
}
/// Opens the tantivy index stored in the directory `path`.
///
/// # Errors
///
/// Any error reported by tantivy while opening the directory is converted
/// with `tantivy_error` and returned.
pub fn open_index(path: &Path) -> Result<Index> {
    let opened = Index::open_in_dir(path);
    opened.map_err(tantivy_error)
}
/// Builds a fresh index in `index_dir` from `entries` and writes its metadata
/// file.
///
/// Any existing `index_dir` is removed first and recreated empty; note that
/// `options.force` is not consulted here. Each entry is appended to the entry
/// pack, and the locator returned by the pack is stored in the indexed
/// document. After the last entry the pack is finished, the writer is
/// committed, and merge threads are awaited before the directory size is
/// measured.
///
/// The writer's memory budget is `options.ram_mb` MiB, raised to at least
/// 16 MiB. The conversion to bytes saturates instead of overflowing, so an
/// absurdly large `ram_mb` yields `usize::MAX` rather than a wrapped (tiny)
/// budget or an arithmetic panic.
///
/// # Errors
///
/// Returns the first error produced by the `entries` iterator, by filesystem
/// operations, by the entry pack, by document conversion, or by tantivy. On
/// error the partially written directory is left in place and no metadata file
/// is written.
pub fn build_index<I>(index_dir: &Path, entries: I, options: &BuildOptions) -> Result<BuildStats>
where
    I: IntoIterator<Item = Result<DictEntry>>,
{
    // Always start from an empty directory so stale segments cannot leak into
    // the new index.
    if index_dir.exists() {
        fs::remove_dir_all(index_dir)?;
    }
    fs::create_dir_all(index_dir)?;

    let dictx_schema = DictxSchema::build();
    let index =
        Index::create_in_dir(index_dir, dictx_schema.schema.clone()).map_err(tantivy_error)?;

    // MiB -> bytes with a 16 MiB floor; saturate rather than wrap on overflow.
    let budget_bytes = options.ram_mb.max(16).saturating_mul(1024 * 1024);
    let mut writer = index.writer(budget_bytes).map_err(tantivy_error)?;
    let mut pack = EntryPackWriter::create(&index_dir.join(ENTRY_PACK_FILE))?;

    let start = Instant::now();
    let mut count = 0usize;
    for entry in entries {
        let entry = entry?;
        // The pack append must come first: the document stores the locator it
        // returns.
        let locator = pack.append(&entry)?;
        let doc = dictx_schema.to_document(&entry, Some(locator))?;
        writer.add_document(doc).map_err(tantivy_error)?;
        count += 1;
    }

    pack.finish()?;
    writer.commit().map_err(tantivy_error)?;
    // Wait for background merges so the size measured below is final.
    writer.wait_merging_threads().map_err(tantivy_error)?;

    let index_bytes = dir_size(index_dir)?;
    let stats = BuildStats {
        entries: count,
        elapsed_ms: start.elapsed().as_millis(),
        index_bytes,
    };
    write_metadata(index_dir, options, &stats)?;
    Ok(stats)
}
/// Reads the metadata file stored in `index_dir`.
///
/// Returns `Ok(None)` when the metadata file does not exist. The file is read
/// directly instead of being checked with `exists()` first, so a file that
/// disappears between a check and the read (for example during a concurrent
/// rebuild) is reported as "no metadata" rather than as an error.
///
/// # Errors
///
/// Returns an error when the file exists but cannot be read (any I/O failure
/// other than "not found"), or when its contents are not valid metadata JSON.
pub fn read_metadata(index_dir: &Path) -> Result<Option<IndexMetadata>> {
    let text = match fs::read_to_string(metadata_path(index_dir)) {
        Ok(text) => text,
        // A missing file is the normal "not built yet" state, not a failure.
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(None),
        Err(err) => return Err(err.into()),
    };
    Ok(Some(serde_json::from_str(&text)?))
}
/// Serializes an [`IndexMetadata`] record for a finished build as pretty JSON
/// and writes it to the metadata file inside `index_dir`.
///
/// The source digest and the entry pack size are best-effort: a failure to
/// hash the source file yields `None`, and a failure to inspect the entry pack
/// is recorded as 0 bytes.
///
/// # Errors
///
/// Returns an error when serialization fails or the file cannot be written.
fn write_metadata(index_dir: &Path, options: &BuildOptions, stats: &BuildStats) -> Result<()> {
    // Hashing errors are deliberately discarded; the digest is optional.
    let source_sha256 = match options.source_path.as_deref() {
        Some(source) => sha256_file(source).ok(),
        None => None,
    };
    // A clock set before the Unix epoch is recorded as 0.
    let built_at_unix = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .map_or(0, |since_epoch| since_epoch.as_secs());
    let entry_pack_bytes =
        fs::metadata(index_dir.join(ENTRY_PACK_FILE)).map_or(0, |pack_meta| pack_meta.len());

    let metadata = IndexMetadata {
        schema_version: SCHEMA_VERSION,
        source_name: options.source_name.clone(),
        source_path: options.source_path.clone(),
        source_sha256,
        entries: stats.entries,
        built_at_unix,
        index_bytes: stats.index_bytes,
        entry_pack: Some(ENTRY_PACK_FILE.to_string()),
        entry_pack_bytes: Some(entry_pack_bytes),
    };

    let json = serde_json::to_string_pretty(&metadata)?;
    fs::write(metadata_path(index_dir), json)?;
    Ok(())
}
/// Returns the location of the metadata file inside `index_dir`.
fn metadata_path(index_dir: &Path) -> PathBuf {
    const METADATA_FILE: &str = "dictx-meta.json";

    let mut location = index_dir.to_path_buf();
    location.push(METADATA_FILE);
    location
}
pub fn dir_size(path: &Path) -> Result<u64> {
let mut total = 0u64;
for entry in walkdir::WalkDir::new(path) {
let entry = entry.map_err(|err| dictx_core::DictxError::Message(err.to_string()))?;
if entry.file_type().is_file() {
total += entry
.metadata()
.map_err(|err| dictx_core::DictxError::Message(err.to_string()))?
.len();
}
}
Ok(total)
}
/// Computes the SHA-256 digest of the file at `path` and returns it as a
/// lowercase hexadecimal string.
///
/// The file is streamed through a fixed 64 KiB buffer, so memory use does not
/// depend on the file size. Reads that fail with `ErrorKind::Interrupted` are
/// retried, as the `Read::read` contract recommends, instead of aborting the
/// hash.
///
/// # Errors
///
/// Returns an error when the file cannot be opened or a read fails for any
/// reason other than an interruption.
fn sha256_file(path: &Path) -> Result<String> {
    let mut file = fs::File::open(path)?;
    let mut hasher = Sha256::new();
    let mut buf = [0u8; 64 * 1024];
    loop {
        match file.read(&mut buf) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(read) => hasher.update(&buf[..read]),
            // An interrupted read transferred no data; simply try again.
            Err(err) if err.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err.into()),
        }
    }
    Ok(format!("{:x}", hasher.finalize()))
}
#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource};

    /// Smoke test: indexing a single entry reports one entry and leaves a
    /// metadata file in the index directory.
    #[test]
    fn builds_small_index() {
        let workspace = tempfile::tempdir().unwrap();
        let index_dir = workspace.path();

        let mut apple = DictEntry::new(DictSource::Custom { name: "t".into() }, "apple");
        apple
            .definitions
            .push(Definition::new("fruit", "苹果", Some("n".into())));

        let options = BuildOptions {
            force: true,
            ..BuildOptions::default()
        };
        let stats = build_index(index_dir, vec![Ok(apple)], &options).unwrap();

        assert_eq!(stats.entries, 1);
        assert!(index_dir.join("dictx-meta.json").exists());
    }
}