Skip to main content

dictx_index/
builder.rs

1use crate::{tantivy_error, DictxSchema, EntryPackWriter, ENTRY_PACK_FILE, SCHEMA_VERSION};
2use dictx_core::{DictEntry, Result};
3use serde::{Deserialize, Serialize};
4use sha2::{Digest, Sha256};
5use std::fs;
6use std::io::Read;
7use std::path::{Path, PathBuf};
8use std::time::Instant;
9use tantivy::Index;
10
/// Caller-supplied settings for a single index build.
#[derive(Clone, Debug)]
pub struct BuildOptions {
    /// Indexing memory budget in MiB. `build_index` raises values below 16 to 16
    /// before converting the budget to bytes.
    pub ram_mb: usize,
    /// Not read by any code in this file; `build_index` always rebuilds the
    /// directory. Presumably consumed by callers — TODO confirm.
    pub force: bool,
    /// Label for the dictionary source; copied into the index metadata.
    pub source_name: String,
    /// Optional path of the source file. When set it is copied into the index
    /// metadata and its contents are hashed (best effort) with SHA-256.
    pub source_path: Option<PathBuf>,
}
18
19impl Default for BuildOptions {
20    fn default() -> Self {
21        Self {
22            ram_mb: 128,
23            force: false,
24            source_name: "default".to_string(),
25            source_path: None,
26        }
27    }
28}
29
30#[derive(Debug, Clone, Serialize, Deserialize)]
31pub struct IndexMetadata {
32    pub schema_version: u32,
33    pub source_name: String,
34    pub source_path: Option<PathBuf>,
35    pub source_sha256: Option<String>,
36    pub entries: usize,
37    pub built_at_unix: u64,
38    pub index_bytes: u64,
39    pub entry_pack: Option<String>,
40    pub entry_pack_bytes: Option<u64>,
41}
42
/// Summary returned by `build_index` after a successful build.
#[derive(Clone, Debug)]
pub struct BuildStats {
    /// Number of entries that were indexed.
    pub entries: usize,
    /// Elapsed build time in milliseconds, as measured by `build_index`.
    pub elapsed_ms: u128,
    /// Total size in bytes of the files in the index directory.
    pub index_bytes: u64,
}
49
50pub fn open_index(path: &Path) -> Result<Index> {
51    Index::open_in_dir(path).map_err(tantivy_error)
52}
53
54pub fn build_index<I>(index_dir: &Path, entries: I, options: &BuildOptions) -> Result<BuildStats>
55where
56    I: IntoIterator<Item = Result<DictEntry>>,
57{
58    // `build` is a full materialization step: rebuilding avoids duplicate documents
59    // and lets schema/storage upgrades switch formats cleanly.
60    if index_dir.exists() {
61        fs::remove_dir_all(index_dir)?;
62    }
63    fs::create_dir_all(index_dir)?;
64
65    let dictx_schema = DictxSchema::build();
66    let index =
67        Index::create_in_dir(index_dir, dictx_schema.schema.clone()).map_err(tantivy_error)?;
68
69    let mut writer = index
70        .writer((options.ram_mb.max(16) * 1024 * 1024) as usize)
71        .map_err(tantivy_error)?;
72    let mut pack = EntryPackWriter::create(&index_dir.join(ENTRY_PACK_FILE))?;
73    let start = Instant::now();
74    let mut count = 0usize;
75
76    for entry in entries {
77        let entry = entry?;
78        let locator = pack.append(&entry)?;
79        let doc = dictx_schema.to_document(&entry, Some(locator))?;
80        writer.add_document(doc).map_err(tantivy_error)?;
81        count += 1;
82    }
83
84    pack.finish()?;
85    writer.commit().map_err(tantivy_error)?;
86    writer.wait_merging_threads().map_err(tantivy_error)?;
87
88    let index_bytes = dir_size(index_dir)?;
89    let stats = BuildStats {
90        entries: count,
91        elapsed_ms: start.elapsed().as_millis(),
92        index_bytes,
93    };
94    write_metadata(index_dir, options, &stats)?;
95    Ok(stats)
96}
97
98pub fn read_metadata(index_dir: &Path) -> Result<Option<IndexMetadata>> {
99    let path = metadata_path(index_dir);
100    if !path.exists() {
101        return Ok(None);
102    }
103    let text = fs::read_to_string(path)?;
104    Ok(Some(serde_json::from_str(&text)?))
105}
106
107fn write_metadata(index_dir: &Path, options: &BuildOptions, stats: &BuildStats) -> Result<()> {
108    let metadata = IndexMetadata {
109        schema_version: SCHEMA_VERSION,
110        source_name: options.source_name.clone(),
111        source_path: options.source_path.clone(),
112        source_sha256: options
113            .source_path
114            .as_deref()
115            .and_then(|path| sha256_file(path).ok()),
116        entries: stats.entries,
117        built_at_unix: std::time::SystemTime::now()
118            .duration_since(std::time::UNIX_EPOCH)
119            .unwrap_or_default()
120            .as_secs(),
121        index_bytes: stats.index_bytes,
122        entry_pack: Some(ENTRY_PACK_FILE.to_string()),
123        entry_pack_bytes: Some(
124            fs::metadata(index_dir.join(ENTRY_PACK_FILE))
125                .map(|metadata| metadata.len())
126                .unwrap_or(0),
127        ),
128    };
129    fs::write(
130        metadata_path(index_dir),
131        serde_json::to_string_pretty(&metadata)?,
132    )?;
133    Ok(())
134}
135
/// Returns the location of the metadata file (`dictx-meta.json`) inside `index_dir`.
fn metadata_path(index_dir: &Path) -> PathBuf {
    let mut location = index_dir.to_path_buf();
    location.push("dictx-meta.json");
    location
}
139
140pub fn dir_size(path: &Path) -> Result<u64> {
141    let mut total = 0u64;
142    for entry in walkdir::WalkDir::new(path) {
143        let entry = entry.map_err(|err| dictx_core::DictxError::Message(err.to_string()))?;
144        if entry.file_type().is_file() {
145            total += entry
146                .metadata()
147                .map_err(|err| dictx_core::DictxError::Message(err.to_string()))?
148                .len();
149        }
150    }
151    Ok(total)
152}
153
154fn sha256_file(path: &Path) -> Result<String> {
155    let mut file = fs::File::open(path)?;
156    let mut hasher = Sha256::new();
157    let mut buf = [0u8; 64 * 1024];
158    loop {
159        let read = file.read(&mut buf)?;
160        if read == 0 {
161            break;
162        }
163        hasher.update(&buf[..read]);
164    }
165    Ok(format!("{:x}", hasher.finalize()))
166}
167
#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource};

    /// Builds a one-entry index in a temporary directory and checks both the
    /// reported entry count and the presence of the metadata file.
    #[test]
    fn builds_small_index() {
        let workdir = tempfile::tempdir().unwrap();
        let index_root = workdir.path();

        let source = DictSource::Custom { name: "t".into() };
        let mut apple = DictEntry::new(source, "apple");
        let noun = Definition::new("fruit", "苹果", Some("n".into()));
        apple.definitions.push(noun);

        let options = BuildOptions {
            force: true,
            ..BuildOptions::default()
        };
        let stats = build_index(index_root, vec![Ok(apple)], &options).unwrap();

        assert_eq!(stats.entries, 1);
        assert!(index_root.join("dictx-meta.json").exists());
    }
}