use std::fs::File;
use std::io::Read;
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use anyhow::{Context, Result};
use memmap2::Mmap;
use tantivy::schema::{Field, Schema, STORED, STRING};
use tantivy::{doc, Index, IndexWriter, Term};
use tracing::{debug, info};
use crate::scanner::ScannedFile;
use crate::tokenizer::register_xore_tokenizer;
/// Tantivy schema for the file index, bundled with a handle to each of its fields.
///
/// The handles are recorded when the schema is built in `IndexSchema::new`, so
/// callers can address fields without looking them up by name.
#[derive(Clone)]
pub struct IndexSchema {
/// The built schema that contains all of the fields below.
schema: Schema,
/// Handle of the `path` field (`STRING | STORED`).
path_field: Field,
/// Handle of the `content` field (stored text indexed with the `xore` tokenizer).
content_field: Field,
/// Handle of the `file_type` field (`STRING | STORED`).
file_type_field: Field,
/// Handle of the `size` field (indexed and stored `u64`).
size_field: Field,
/// Handle of the `modified` field (indexed and stored `u64`).
modified_field: Field,
}
impl IndexSchema {
    /// Builds the index schema and records a handle for every field.
    ///
    /// Fields are registered in this order:
    /// - `path`: `STRING | STORED`
    /// - `content`: stored text, indexed with the `xore` tokenizer including
    ///   frequencies and positions
    /// - `file_type`: `STRING | STORED`
    /// - `size` and `modified`: indexed and stored `u64` values
    pub fn new() -> Self {
        use tantivy::schema::{IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions};

        let mut builder = Schema::builder();

        let path_field = builder.add_text_field("path", STRING | STORED);

        let content_indexing = TextFieldIndexing::default()
            .set_tokenizer("xore")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let content_options = TextOptions::default()
            .set_indexing_options(content_indexing)
            .set_stored();
        let content_field = builder.add_text_field("content", content_options);

        let file_type_field = builder.add_text_field("file_type", STRING | STORED);

        // Both numeric fields share the same options: indexed and stored.
        let numeric_options = || NumericOptions::default().set_indexed().set_stored();
        let size_field = builder.add_u64_field("size", numeric_options());
        let modified_field = builder.add_u64_field("modified", numeric_options());

        let schema = builder.build();
        Self {
            schema,
            path_field,
            content_field,
            file_type_field,
            size_field,
            modified_field,
        }
    }

    /// Returns the built tantivy schema.
    pub fn schema(&self) -> &Schema {
        &self.schema
    }

    /// Returns the handle of the `path` field.
    pub fn path_field(&self) -> Field {
        self.path_field
    }

    /// Returns the handle of the `content` field.
    pub fn content_field(&self) -> Field {
        self.content_field
    }

    /// Returns the handle of the `file_type` field.
    pub fn file_type_field(&self) -> Field {
        self.file_type_field
    }

    /// Returns the handle of the `size` field.
    pub fn size_field(&self) -> Field {
        self.size_field
    }

    /// Returns the handle of the `modified` field.
    pub fn modified_field(&self) -> Field {
        self.modified_field
    }
}
impl Default for IndexSchema {
fn default() -> Self {
Self::new()
}
}
/// Settings that control where the index lives and how files are read into it.
#[derive(Debug, Clone)]
pub struct IndexConfig {
    /// Directory that holds the index files.
    pub index_path: PathBuf,
    /// Memory budget, in bytes, passed to the index writer when it is created.
    pub writer_buffer_size: usize,
    /// Files larger than this many bytes are skipped instead of indexed.
    pub max_file_size: u64,
    /// Whether files larger than `mmap_threshold` are read through a memory map.
    pub use_mmap: bool,
    /// Size in bytes above which a memory map is used when `use_mmap` is set.
    pub mmap_threshold: u64,
}

impl Default for IndexConfig {
    /// Defaults: index under `.xore/index`, a 50,000,000-byte writer budget,
    /// a 100 MiB per-file limit, and memory mapping enabled above 1 MiB.
    fn default() -> Self {
        const MIB: u64 = 1024 * 1024;
        const DEFAULT_WRITER_BUDGET: usize = 50_000_000;

        Self {
            index_path: PathBuf::from(".xore/index"),
            writer_buffer_size: DEFAULT_WRITER_BUDGET,
            max_file_size: 100 * MIB,
            use_mmap: true,
            mmap_threshold: MIB,
        }
    }
}
/// Fills a tantivy index with the contents of scanned files.
///
/// Documents and deletions are handed to the writer as they arrive and are
/// committed by `commit_changes` or `build`.
pub struct IndexBuilder {
/// The index that was opened or created in `config.index_path`.
index: Index,
/// Writer obtained from `index` with `config.writer_buffer_size`.
writer: IndexWriter,
/// Field handles used to build documents and delete terms.
schema: IndexSchema,
/// Settings this builder was created with.
config: IndexConfig,
/// Number of documents handed to the writer by `add_document`.
documents_added: usize,
/// Number of `delete_document` calls made on this builder.
documents_deleted: usize,
/// `path: error` messages for files that could not be read or added.
errors: Vec<String>,
}
impl IndexBuilder {
pub fn new(index_path: &Path) -> Result<Self> {
Self::with_config(IndexConfig {
index_path: index_path.to_path_buf(),
..Default::default()
})
}
pub fn with_config(config: IndexConfig) -> Result<Self> {
let schema = IndexSchema::new();
std::fs::create_dir_all(&config.index_path).with_context(|| {
format!(
"无法创建索引目录: {}\n💡 提示: 请检查目录权限,或使用 --index-dir 指定其他路径",
config.index_path.display()
)
})?;
let index = if config.index_path.join("meta.json").exists() {
info!("Opening existing index at {:?}", config.index_path);
Index::open_in_dir(&config.index_path).with_context(|| {
format!(
"无法打开索引: {}\n💡 提示: 索引可能已损坏,尝试运行 'xore f --rebuild' 重建索引",
config.index_path.display()
)
})?
} else {
info!("Creating new index at {:?}", config.index_path);
Index::create_in_dir(&config.index_path, schema.schema().clone()).with_context(
|| {
format!(
"无法创建索引: {}\n💡 提示: 请检查磁盘空间和目录权限",
config.index_path.display()
)
},
)?
};
register_xore_tokenizer(&index)?;
let writer = index.writer(config.writer_buffer_size).with_context(|| {
"无法创建索引写入器\n💡 提示: 可能有其他进程正在使用该索引,或磁盘空间不足"
})?;
Ok(Self {
index,
writer,
schema,
config,
documents_added: 0,
documents_deleted: 0,
errors: Vec::new(),
})
}
/// Reads `file` and hands it to the writer, first deleting any document already
/// indexed under the same path.
///
/// Nothing is indexed, and `Ok(())` is still returned, when `file` is a directory,
/// is larger than `config.max_file_size`, cannot be read (the failure is appended
/// to the error list), or has a NUL byte within its first 8000 bytes.
///
/// # Errors
/// Returns an error only when the writer rejects the new document.
pub fn add_document(&mut self, file: &ScannedFile) -> Result<()> {
    if file.is_dir {
        return Ok(());
    }
    if file.size > self.config.max_file_size {
        debug!("Skipping large file: {:?} ({} bytes)", file.path, file.size);
        return Ok(());
    }

    let text = match self.read_file_content(&file.path, file.size) {
        Ok(text) => text,
        Err(read_err) => {
            // Unreadable files are reported through the error list, not as a failure.
            self.errors.push(format!("{:?}: {}", file.path, read_err));
            return Ok(());
        }
    };
    if is_binary_content(&text) {
        debug!("Skipping binary file: {:?}", file.path);
        return Ok(());
    }

    // `Field` handles are copied out up front so the schema borrow ends here.
    let path_field = self.schema.path_field();
    let content_field = self.schema.content_field();
    let file_type_field = self.schema.file_type_field();
    let size_field = self.schema.size_field();
    let modified_field = self.schema.modified_field();

    let indexed_path = file.path.to_string_lossy().into_owned();
    let extension = detect_file_type(&file.path);
    // Seconds since the Unix epoch; 0 when the timestamp is missing or predates it.
    let modified_secs = match file.modified.map(|t| t.duration_since(SystemTime::UNIX_EPOCH)) {
        Some(Ok(since_epoch)) => since_epoch.as_secs(),
        _ => 0,
    };

    // Remove any previously indexed document with this path before adding the new one.
    self.writer.delete_term(Term::from_field_text(path_field, &indexed_path));
    self.writer.add_document(doc!(
        path_field => indexed_path,
        content_field => text,
        file_type_field => extension,
        size_field => file.size,
        modified_field => modified_secs,
    ))?;

    self.documents_added += 1;
    Ok(())
}
/// Adds every file in `files` and returns how many documents were actually handed
/// to the writer.
///
/// A failure on one file is appended to the error list and does not abort the batch.
///
/// `add_document` returns `Ok(())` for files it skips (directories, oversized,
/// unreadable or binary files), so counting `Ok` results would over-report. The
/// result is therefore the growth of the `documents_added` counter.
///
/// # Errors
/// Currently never returns `Err`; the `Result` is kept for interface stability.
pub fn add_documents_batch(&mut self, files: &[ScannedFile]) -> Result<usize> {
    let added_before = self.documents_added;
    for file in files {
        if let Err(e) = self.add_document(file) {
            self.errors.push(format!("{:?}: {}", file.path, e));
        }
    }
    // `documents_added` only ever grows, so this subtraction cannot underflow.
    Ok(self.documents_added - added_before)
}
/// Queues deletion of the document indexed under `path`.
///
/// The deletion counter is incremented on every call; this method does not check
/// whether a matching document exists.
///
/// # Errors
/// Currently never returns `Err`.
pub fn delete_document(&mut self, path: &Path) -> Result<()> {
    let path_text = path.to_string_lossy();
    let path_term = Term::from_field_text(self.schema.path_field(), &path_text);
    self.writer.delete_term(path_term);
    self.documents_deleted += 1;
    Ok(())
}
/// Commits everything queued on the writer so far and keeps the builder usable.
///
/// # Errors
/// Returns an error when the writer fails to commit.
pub fn commit_changes(&mut self) -> Result<()> {
    info!("Committing index changes");
    self.writer
        .commit()
        .map(|_opstamp| ())
        .context("提交索引变更失败\n💡 提示: 请检查磁盘空间是否充足")
}
/// Commits pending changes, waits for the writer's merge threads to finish and
/// returns the statistics gathered by this builder. Consumes the builder.
///
/// # Errors
/// Returns an error when the commit fails or the merge threads cannot be joined.
pub fn build(mut self) -> Result<IndexStats> {
    info!("Committing index...");
    self.writer
        .commit()
        .context("提交索引失败\n💡 提示: 请检查磁盘空间是否充足")?;
    self.writer.wait_merging_threads()?;

    info!(
        "Index built successfully: {} documents added, {} deleted, {} errors",
        self.documents_added,
        self.documents_deleted,
        self.errors.len()
    );

    Ok(IndexStats {
        documents_added: self.documents_added,
        documents_deleted: self.documents_deleted,
        errors: self.errors,
        index_path: self.config.index_path,
    })
}
/// Returns the tantivy index this builder writes to.
pub fn index(&self) -> &Index {
&self.index
}
/// Returns the schema wrapper whose field handles this builder uses.
pub fn schema(&self) -> &IndexSchema {
&self.schema
}
/// Reads the file at `path` as text, choosing the read strategy from `size`.
///
/// A memory map is used only when `config.use_mmap` is set and `size` is strictly
/// greater than `config.mmap_threshold`; every other file is read directly.
fn read_file_content(&self, path: &Path, size: u64) -> Result<String> {
    let wants_mmap = self.config.use_mmap && size > self.config.mmap_threshold;
    if !wants_mmap {
        return self.read_file_direct(path);
    }
    self.read_file_mmap(path)
}
/// Reads the file at `path` through a read-only memory map and returns its text.
///
/// Invalid UTF-8 sequences are replaced with U+FFFD instead of causing an error.
///
/// # Errors
/// Returns an error when the file cannot be opened or mapped.
fn read_file_mmap(&self, path: &Path) -> Result<String> {
    let file = File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
    // SAFETY: the mapping is only read and is dropped before this function returns.
    // Soundness relies on the file not being truncated or modified by another
    // process while it is mapped.
    // NOTE(review): nothing in this function enforces that — confirm it is acceptable.
    let mmap = unsafe { Mmap::map(&file) }
        .with_context(|| format!("Failed to mmap file: {:?}", path))?;
    // `from_utf8_lossy` borrows the input when it is valid UTF-8 and substitutes
    // U+FFFD otherwise, so one call covers both branches of the former
    // `from_utf8` / `from_utf8_lossy` match without scanning invalid input twice.
    Ok(String::from_utf8_lossy(&mmap).into_owned())
}
/// Reads the whole file at `path` into a `String` with a single pass over the file.
///
/// Invalid UTF-8 sequences are replaced with U+FFFD instead of causing an error.
/// The bytes are read once and decoded in memory, so the file is never reopened and
/// a real I/O error is reported rather than hidden behind a second read attempt.
///
/// # Errors
/// Returns an error when the file cannot be opened or read.
fn read_file_direct(&self, path: &Path) -> Result<String> {
    let mut file =
        File::open(path).with_context(|| format!("Failed to open file: {:?}", path))?;
    let mut bytes = Vec::new();
    file.read_to_end(&mut bytes)
        .with_context(|| format!("Failed to read file: {:?}", path))?;
    // Valid UTF-8 reuses the buffer as-is; only invalid input pays for a lossy copy.
    Ok(match String::from_utf8(bytes) {
        Ok(text) => text,
        Err(invalid) => String::from_utf8_lossy(invalid.as_bytes()).into_owned(),
    })
}
}
/// Summary returned by `IndexBuilder::build` once the index has been committed.
#[derive(Debug)]
pub struct IndexStats {
/// Number of documents handed to the writer by `add_document`.
pub documents_added: usize,
/// Number of `delete_document` calls made on the builder.
pub documents_deleted: usize,
/// `path: error` messages collected while processing files.
pub errors: Vec<String>,
/// Directory of the index, taken from the builder's configuration.
pub index_path: PathBuf,
}
/// Heuristic binary check: returns `true` when a NUL byte occurs within the first
/// 8000 bytes of `content`.
fn is_binary_content(content: &str) -> bool {
    const SNIFF_LIMIT: usize = 8000;
    content.bytes().take(SNIFF_LIMIT).any(|byte| byte == 0)
}
/// Returns the lower-cased extension of `path`, or `"unknown"` when the path has no
/// extension or the extension is not valid UTF-8.
fn detect_file_type(path: &Path) -> String {
    match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.to_lowercase(),
        None => String::from("unknown"),
    }
}
/// Opens the existing index at `index_path`, registers the xore tokenizer on it and
/// returns it together with a freshly built `IndexSchema`.
///
/// # Errors
/// Returns an error when the index cannot be opened or the tokenizer cannot be
/// registered.
pub fn open_index(index_path: &Path) -> Result<(Index, IndexSchema)> {
    let opened = Index::open_in_dir(index_path)
        .with_context(|| format!("Failed to open index at {:?}", index_path))?;
    register_xore_tokenizer(&opened)?;
    Ok((opened, IndexSchema::new()))
}
/// Reports whether `index_path` contains a `meta.json` file, which this module
/// treats as the marker of an existing index.
pub fn index_exists(index_path: &Path) -> bool {
    let meta_file = index_path.join("meta.json");
    meta_file.exists()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Writes `content` to `dir/name` and returns the path of the created file.
    fn create_test_file(dir: &Path, name: &str, content: &str) -> PathBuf {
        let path = dir.join(name);
        let mut file = File::create(&path).unwrap();
        file.write_all(content.as_bytes()).unwrap();
        path
    }

    /// Creates a temporary workspace and returns its guard, the index path inside it
    /// and an already-created `files` directory. The guard must stay alive for the
    /// duration of the test, otherwise the directory is removed.
    fn workspace() -> (TempDir, PathBuf, PathBuf) {
        let temp_dir = TempDir::new().unwrap();
        let index_path = temp_dir.path().join("test_index");
        let files_dir = temp_dir.path().join("files");
        std::fs::create_dir_all(&files_dir).unwrap();
        (temp_dir, index_path, files_dir)
    }

    /// Describes an existing regular file as a `ScannedFile`. The size comes from
    /// the file's metadata so it can never drift from the real content length.
    fn scanned(path: PathBuf) -> ScannedFile {
        let size = std::fs::metadata(&path).unwrap().len();
        ScannedFile { path, size, modified: Some(SystemTime::now()), is_dir: false }
    }

    #[test]
    fn test_schema_creation() {
        let schema = IndexSchema::new();
        for name in ["path", "content", "file_type", "size", "modified"].iter() {
            assert!(schema.schema().get_field(name).is_ok(), "missing field {}", name);
        }
    }

    #[test]
    fn test_index_builder_creation() {
        let temp_dir = TempDir::new().unwrap();
        let index_path = temp_dir.path().join("test_index");
        let _builder = IndexBuilder::new(&index_path).unwrap();
        assert!(index_path.exists());
    }

    #[test]
    fn test_add_single_document() {
        let (_temp_dir, index_path, files_dir) = workspace();
        let test_file_path = create_test_file(&files_dir, "test.txt", "Hello World 你好世界");

        let mut builder = IndexBuilder::new(&index_path).unwrap();
        builder.add_document(&scanned(test_file_path)).unwrap();
        let stats = builder.build().unwrap();

        assert_eq!(stats.documents_added, 1);
        assert!(stats.errors.is_empty());
    }

    #[test]
    fn test_batch_add_documents() {
        let (_temp_dir, index_path, files_dir) = workspace();
        let files = vec![
            scanned(create_test_file(&files_dir, "test1.txt", "Content 1")),
            scanned(create_test_file(&files_dir, "test2.txt", "Content 2")),
        ];

        let mut builder = IndexBuilder::new(&index_path).unwrap();
        let added = builder.add_documents_batch(&files).unwrap();
        let stats = builder.build().unwrap();

        assert_eq!(added, 2);
        assert_eq!(stats.documents_added, 2);
    }

    #[test]
    fn test_skip_binary_file() {
        let (_temp_dir, index_path, files_dir) = workspace();
        let binary_path = files_dir.join("binary.bin");
        let mut file = File::create(&binary_path).unwrap();
        file.write_all(&[0x00, 0x01, 0x02, 0x00]).unwrap();

        let mut builder = IndexBuilder::new(&index_path).unwrap();
        builder.add_document(&scanned(binary_path)).unwrap();
        let stats = builder.build().unwrap();

        // A binary file is skipped silently: not indexed and not reported as an error.
        assert_eq!(stats.documents_added, 0);
        assert!(stats.errors.is_empty());
    }

    #[test]
    fn test_skip_directory() {
        let temp_dir = TempDir::new().unwrap();
        let index_path = temp_dir.path().join("test_index");
        let mut builder = IndexBuilder::new(&index_path).unwrap();
        let scanned_dir = ScannedFile {
            path: temp_dir.path().to_path_buf(),
            size: 0,
            modified: Some(SystemTime::now()),
            is_dir: true,
        };

        builder.add_document(&scanned_dir).unwrap();
        let stats = builder.build().unwrap();

        assert_eq!(stats.documents_added, 0);
    }

    #[test]
    fn test_detect_file_type() {
        assert_eq!(detect_file_type(Path::new("test.rs")), "rs");
        assert_eq!(detect_file_type(Path::new("test.py")), "py");
        assert_eq!(detect_file_type(Path::new("test.TXT")), "txt");
        assert_eq!(detect_file_type(Path::new("noextension")), "unknown");
    }

    #[test]
    fn test_is_binary_content() {
        assert!(!is_binary_content("Hello World"));
        assert!(!is_binary_content("你好世界"));
        assert!(is_binary_content("Hello\0World"));
    }

    #[test]
    fn test_index_exists() {
        let temp_dir = TempDir::new().unwrap();
        let index_path = temp_dir.path().join("test_index");
        assert!(!index_exists(&index_path));

        let builder = IndexBuilder::new(&index_path).unwrap();
        builder.build().unwrap();

        assert!(index_exists(&index_path));
    }

    #[test]
    fn test_document_update() {
        let (_temp_dir, index_path, files_dir) = workspace();
        let test_file_path = create_test_file(&files_dir, "test.txt", "Original content");

        // First run: index the original content.
        {
            let mut builder = IndexBuilder::new(&index_path).unwrap();
            builder.add_document(&scanned(test_file_path.clone())).unwrap();
            let stats = builder.build().unwrap();
            assert_eq!(stats.documents_added, 1);
        }

        // Second run: overwrite the file and index it again under the same path.
        create_test_file(&files_dir, "test.txt", "Updated content");
        {
            let mut builder = IndexBuilder::new(&index_path).unwrap();
            builder.add_document(&scanned(test_file_path)).unwrap();
            let stats = builder.build().unwrap();
            assert_eq!(stats.documents_added, 1);
        }
    }
}