use crate::indexer::{IndexStats, SearchResult};
use crate::parser::{HtmlParser, ParsedDocument};
use nanofts::{EngineConfig, UnifiedEngine};
use std::collections::HashMap;
use std::fs;
use std::path::Path;
/// Full-text indexer that pairs a `nanofts` engine with an in-memory document store.
///
/// Every document is added to the engine under a numeric id and kept in
/// `documents` under the same id, so that ids returned by a search can be
/// turned back into a URL, title and content snippet.
pub struct Indexer {
/// Search engine that documents are added to and queried through.
engine: UnifiedEngine,
/// Parser used by `index_directory` to turn a directory into parsed documents.
parser: HtmlParser,
/// Parsed documents keyed by the id they were indexed under.
documents: HashMap<u32, ParsedDocument>,
/// Id that will be assigned to the next indexed document.
next_id: u32,
/// Path of the persistent index; empty for an indexer created with `new`.
/// The document store is written next to it as `<index_path>.documents.json`.
index_path: String,
}
impl Indexer {
    /// Suffix appended to the index path to name the JSON document-store file.
    const DOCUMENTS_SUFFIX: &'static str = ".documents.json";

    /// Maximum number of characters of document content shown in a snippet.
    const SNIPPET_CHARS: usize = 100;

    /// Creates an indexer backed by `EngineConfig::memory_only()`.
    ///
    /// The document store starts empty, ids start at 1, and no index path is
    /// recorded, so `save_documents` never writes anything for this indexer.
    ///
    /// # Panics
    ///
    /// Panics if the engine cannot be created.
    pub fn new() -> Self {
        let engine =
            UnifiedEngine::new(EngineConfig::memory_only()).expect("Failed to create engine");
        Self {
            engine,
            parser: HtmlParser::new(),
            documents: HashMap::new(),
            next_id: 1,
            index_path: String::new(),
        }
    }

    /// Creates an indexer whose engine is persisted at `path`.
    ///
    /// If a file already exists at `path`, the document store is reloaded from
    /// `<path>.documents.json` and id assignment continues after the highest
    /// stored id. Otherwise the store starts empty and ids start at 1.
    ///
    /// # Panics
    ///
    /// Panics if the engine cannot be created.
    pub fn new_persistent(path: &str) -> Self {
        // Decide once, *before* the engine is created, whether we are reopening
        // an existing index. Checking again afterwards could see a file the
        // engine has just created and pair a fresh index with a stale store.
        let index_existed = Path::new(path).exists();

        // NOTE(review): `with_drop_if_exists` is passed `true` only when no
        // index file was found; exact flag semantics are defined by nanofts.
        let config = EngineConfig::persistent(path).with_drop_if_exists(!index_existed);
        let engine = UnifiedEngine::new(config).expect("Failed to create engine");

        let documents = if index_existed {
            Self::load_documents(&Self::documents_path(path))
        } else {
            HashMap::new()
        };
        let next_id = documents.keys().max().map_or(1, |last| last + 1);

        Self {
            engine,
            parser: HtmlParser::new(),
            documents,
            next_id,
            index_path: path.to_string(),
        }
    }

    /// Returns the path of the JSON document store that belongs to `index_path`.
    fn documents_path(index_path: &str) -> String {
        format!("{}{}", index_path, Self::DOCUMENTS_SUFFIX)
    }

    /// Reads the document store from `path`.
    ///
    /// Best effort: an unreadable file or invalid JSON yields an empty map
    /// rather than an error.
    fn load_documents(path: &str) -> HashMap<u32, ParsedDocument> {
        fs::read_to_string(path)
            .ok()
            .and_then(|data| serde_json::from_str(&data).ok())
            .unwrap_or_default()
    }

    /// Writes the document store next to the persistent index as pretty JSON.
    ///
    /// Does nothing when no index path is recorded. Serialization and write
    /// failures are deliberately ignored (best effort).
    fn save_documents(&self) {
        if self.index_path.is_empty() {
            return;
        }
        if let Ok(json) = serde_json::to_string_pretty(&self.documents) {
            let _ = fs::write(Self::documents_path(&self.index_path), json);
        }
    }

    /// Replaces the document store with the documents parsed from `dir`.
    ///
    /// After indexing, the engine is flushed and the document store is saved.
    /// Returns the number of stored documents and the number indexed by this call.
    ///
    /// # Errors
    ///
    /// Returns the error from `HtmlParser::parse_directory`. In that case the
    /// existing document store and id counter are left untouched.
    pub fn index_directory(&mut self, dir: &Path) -> std::io::Result<IndexStats> {
        // Parse first: a failure here must not wipe the current state.
        let parsed = self.parser.parse_directory(dir)?;

        // NOTE(review): only the document map and id counter are reset; the
        // engine is not cleared here, so ids restarting at 1 may meet entries
        // from an earlier run — confirm against nanofts semantics.
        self.documents.clear();
        self.next_id = 1;

        let mut indexed = 0;
        for doc in parsed {
            self.index_document(doc);
            indexed += 1;
        }

        self.engine.flush().ok();
        self.save_documents();

        Ok(IndexStats {
            documents: self.documents.len(),
            indexed,
        })
    }

    /// Indexes one document under the next free id and stores it.
    ///
    /// The engine receives the `url`, `title` and `content` fields. An engine
    /// error is ignored; the document is stored regardless.
    pub fn index_document(&mut self, doc: ParsedDocument) {
        let id = self.next_id;
        self.next_id += 1;

        // Copy only the fields the engine needs; the document itself is moved
        // into the store below instead of being cloned as a whole.
        let mut fields = HashMap::new();
        fields.insert("url".to_string(), doc.url.clone());
        fields.insert("title".to_string(), doc.title.clone());
        fields.insert("content".to_string(), doc.content.clone());
        self.engine.add_document(id, fields).ok();

        self.documents.insert(id, doc);
    }

    /// Builds the snippet for a search hit.
    ///
    /// Content longer than `SNIPPET_CHARS` characters is cut at that many
    /// characters and suffixed with `...`; shorter content is returned
    /// unchanged. The decision is made in characters, not bytes, so short
    /// multi-byte text never receives a spurious ellipsis.
    fn snippet(content: &str) -> String {
        // The byte offset of the character *after* the last kept one is the
        // cut point and is always a valid char boundary.
        match content.char_indices().nth(Self::SNIPPET_CHARS) {
            Some((cut, _)) => format!("{}...", &content[..cut]),
            None => content.to_string(),
        }
    }

    /// Runs `query` against the engine and returns up to `limit` results.
    ///
    /// Ids without an entry in the document store are skipped. An engine error
    /// yields an empty list. Every result currently carries a fixed score of 1.0.
    pub fn search(&self, query: &str, limit: usize) -> Vec<SearchResult> {
        let result = match self.engine.search(query) {
            Ok(result) => result,
            Err(_) => return Vec::new(),
        };

        let mut results = Vec::new();
        for doc_id in result.top(limit) {
            if let Some(doc) = self.documents.get(&doc_id) {
                results.push(SearchResult {
                    url: doc.url.clone(),
                    title: doc.title.clone(),
                    snippet: Self::snippet(&doc.content),
                    score: 1.0,
                });
            }
        }
        results
    }

    /// Flushes the engine, ignoring any error.
    ///
    /// Only the engine is flushed; the JSON document store is written by
    /// `index_directory`.
    pub fn flush(&self) {
        self.engine.flush().ok();
    }

    /// Returns statistics derived from the document store; both counters are
    /// the number of stored documents.
    pub fn get_stats(&self) -> IndexStats {
        let count = self.documents.len();
        IndexStats {
            documents: count,
            indexed: count,
        }
    }
}
impl Default for Indexer {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Renders a minimal HTML page with the given title and a single paragraph.
    fn create_test_html(title: &str, content: &str) -> String {
        format!(
            r#"<!DOCTYPE html>
<html>
<head><title>{}</title></head>
<body><p>{}</p></body>
</html>"#,
            title, content
        )
    }

    /// Builds a `ParsedDocument` from borrowed strings.
    fn sample(url: &str, title: &str, content: &str) -> ParsedDocument {
        ParsedDocument::new(url.to_string(), title.to_string(), content.to_string())
    }

    /// Indexes `docs` in order into `indexer` and flushes it.
    fn index_all(indexer: &mut Indexer, docs: Vec<ParsedDocument>) {
        for doc in docs {
            indexer.index_document(doc);
        }
        indexer.flush();
    }

    /// Returns a flushed in-memory indexer containing `docs`.
    fn memory_indexer(docs: Vec<ParsedDocument>) -> Indexer {
        let mut indexer = Indexer::new();
        index_all(&mut indexer, docs);
        indexer
    }

    #[test]
    fn test_index_single_document() {
        let indexer = memory_indexer(vec![sample("test.html", "Test Title", "Test content")]);
        assert!(!indexer.search("Test", 10).is_empty());
    }

    #[test]
    fn test_search_chinese() {
        let indexer = memory_indexer(vec![sample("test.html", "測試標題", "這是中文測試內容")]);
        assert!(!indexer.search("中文", 10).is_empty());
    }

    #[test]
    fn test_search_partial_match() {
        let indexer = memory_indexer(vec![sample("test.html", "全文檢索", "搜尋引擎測試")]);
        assert!(!indexer.search("搜尋", 10).is_empty());
    }

    #[test]
    fn test_search_multiple_documents() {
        let indexer = memory_indexer(vec![
            sample("doc1.html", "Rust 程式", "Rust 是一種程式語言"),
            sample("doc2.html", "Python 程式", "Python 也是一種程式語言"),
        ]);
        let hits = indexer.search("程式", 10);
        assert_eq!(hits.len(), 2);
    }

    #[test]
    fn test_get_stats() {
        let indexer = memory_indexer(vec![sample("test.html", "Title", "Content")]);
        assert_eq!(indexer.get_stats().documents, 1);
    }

    #[test]
    fn test_persistent_indexer() {
        // The temporary directory must outlive the indexer that writes into it.
        let temp_dir = TempDir::new().unwrap();
        let index_path = temp_dir.path().join("test_index.nfts");

        let mut indexer = Indexer::new_persistent(index_path.to_str().unwrap());
        index_all(
            &mut indexer,
            vec![sample("test.html", "Persistent", "Persistent content")],
        );

        assert_eq!(indexer.get_stats().documents, 1);
    }

    #[test]
    fn test_search_empty_query() {
        let indexer = Indexer::new();
        assert!(indexer.search("", 10).is_empty());
    }
}