use std::fs;
use std::path::{Path, PathBuf};
#[cfg(not(feature = "tokenizer-lindera-ipadic"))]
use anyhow::bail;
use anyhow::{Context, Result, anyhow};
#[cfg(feature = "tokenizer-lindera-ipadic")]
use lindera::dictionary::load_dictionary;
#[cfg(feature = "tokenizer-lindera-ipadic")]
use lindera::mode::Mode;
#[cfg(feature = "tokenizer-lindera-ipadic")]
use lindera::segmenter::Segmenter;
#[cfg(feature = "tokenizer-lindera-ipadic")]
use lindera_tantivy::tokenizer::LinderaTokenizer;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{
Field, IndexRecordOption, STORED, STRING, Schema, TextFieldIndexing, TextOptions, Value,
};
use tantivy::snippet::SnippetGenerator;
use tantivy::tokenizer::{LowerCaser, NgramTokenizer, RemoveLongFilter, TextAnalyzer};
use tantivy::{Index, ReloadPolicy, Term, doc};
/// Name under which the text analyzer for the `contents` field is registered
/// on the index; the schema refers to the analyzer by this same name.
const TOKENIZER_NAME: &str = "traverze_ja";
/// Index directory used by `Traverze::new` when no directory is given.
const DEFAULT_INDEX_DIR: &str = ".traverze-index";
/// Tokenization strategy applied to the indexed `contents` field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizerMode {
/// N-gram tokenization built from tantivy's `NgramTokenizer`; always available.
Ngram,
/// Lindera tokenization with the embedded IPADIC dictionary; only usable
/// when the crate is built with the `tokenizer-lindera-ipadic` feature.
LinderaIpadic,
}
/// Returns the tokenizer mode used when the caller does not pick one.
///
/// The choice is fixed at compile time: `LinderaIpadic` when the
/// `tokenizer-lindera-ipadic` feature is enabled, `Ngram` otherwise.
pub fn default_tokenizer_mode() -> TokenizerMode {
    if cfg!(feature = "tokenizer-lindera-ipadic") {
        TokenizerMode::LinderaIpadic
    } else {
        TokenizerMode::Ngram
    }
}
/// One search result returned by `Traverze::search` / `Traverze::search_with_options`.
#[derive(Debug, Clone)]
pub struct SearchHit {
/// Value of the stored `path` field of the matching document.
pub path: String,
/// Score reported by the search for this document.
pub score: f32,
/// Snippet text for the hit; `None` when no snippet was requested.
pub snippet: Option<String>,
}
/// Output format of a generated snippet.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SnippetFormat {
/// The raw snippet fragment, without markup.
Text,
/// The snippet rendered through the snippet's `to_html()`.
Html,
}
/// Settings controlling snippet generation for search hits.
#[derive(Debug, Clone, Copy)]
pub struct SnippetOptions {
/// Maximum snippet length in characters, forwarded to the snippet generator.
pub max_num_chars: usize,
/// Format in which the snippet is returned.
pub format: SnippetFormat,
}
impl Default for SnippetOptions {
fn default() -> Self {
Self {
max_num_chars: 150,
format: SnippetFormat::Text,
}
}
}
/// Settings for a single search request.
#[derive(Debug, Clone, Copy)]
pub struct SearchOptions {
/// Maximum number of hits to collect.
pub limit: usize,
/// Snippet settings; `None` means hits carry no snippet.
pub snippet: Option<SnippetOptions>,
}
impl SearchOptions {
pub fn with_limit(limit: usize) -> Self {
Self {
limit,
snippet: None,
}
}
}
impl Default for SearchOptions {
fn default() -> Self {
Self::with_limit(20)
}
}
/// Handle to an on-disk full-text index of file contents keyed by file path.
#[derive(Clone)]
pub struct Traverze {
// Underlying tantivy index.
index: Index,
// Schema field holding the file path (the key used for deletes).
path_field: Field,
// Schema field holding the tokenized file contents.
contents_field: Field,
// Whether `contents` is stored in the index; snippets require this.
contents_is_stored: bool,
}
impl Traverze {
pub fn new() -> Result<Self> {
Self::new_in_dir(Path::new(DEFAULT_INDEX_DIR))
}
pub fn new_in_dir(index_dir: &Path) -> Result<Self> {
Self::new_in_dir_with_mode(index_dir, default_tokenizer_mode())
}
pub fn new_in_dir_with_mode(index_dir: &Path, mode: TokenizerMode) -> Result<Self> {
Self::open_or_create(index_dir, mode, build_schema(false))
}
pub fn new_in_dir_for_indexing(
index_dir: &Path,
mode: TokenizerMode,
with_snippet: bool,
) -> Result<Self> {
let engine = Self::open_or_create(index_dir, mode, build_schema(with_snippet))?;
if engine.supports_snippet() != with_snippet {
let expected = if with_snippet { "enabled" } else { "disabled" };
let actual = if engine.supports_snippet() {
"enabled"
} else {
"disabled"
};
return Err(anyhow!(
"index snippet support mismatch: expected {expected}, but existing index is {actual}"
));
}
Ok(engine)
}
fn open_or_create(index_dir: &Path, mode: TokenizerMode, schema: Schema) -> Result<Self> {
fs::create_dir_all(index_dir)
.with_context(|| format!("failed to create index dir: {}", index_dir.display()))?;
let index = match Index::open_in_dir(index_dir) {
Ok(index) => index,
Err(_) => Index::create_in_dir(index_dir, schema)
.with_context(|| format!("failed to create index: {}", index_dir.display()))?,
};
register_tokenizer(&index, mode)?;
let schema = index.schema();
let path_field = schema
.get_field("path")
.map_err(|_| anyhow!("`path` field is missing in schema"))?;
let contents_field = schema
.get_field("contents")
.map_err(|_| anyhow!("`contents` field is missing in schema"))?;
let contents_is_stored = schema.get_field_entry(contents_field).is_stored();
Ok(Self {
index,
path_field,
contents_field,
contents_is_stored,
})
}
pub fn index_files(&self, files: &[PathBuf]) -> Result<usize> {
let mut writer = self
.index
.writer::<tantivy::schema::TantivyDocument>(50_000_000)
.context("failed to create index writer")?;
let mut count = 0usize;
for file in files {
if !file.is_file() {
continue;
}
let abs = normalize_path(file);
let content = fs::read_to_string(&abs)
.or_else(|_| fs::read(&abs).map(|b| String::from_utf8_lossy(&b).into_owned()))
.with_context(|| format!("failed to read file: {}", abs.display()))?;
let path_text = abs.to_string_lossy().to_string();
writer.delete_term(Term::from_field_text(self.path_field, &path_text));
writer
.add_document(doc!(
self.path_field => path_text,
self.contents_field => content,
))
.context("failed to add document")?;
count += 1;
}
writer.commit().context("failed to commit index")?;
Ok(count)
}
pub fn remove_files(&self, files: &[PathBuf]) -> Result<usize> {
let mut writer = self
.index
.writer::<tantivy::schema::TantivyDocument>(50_000_000)
.context("failed to create index writer")?;
let mut count = 0usize;
for file in files {
let abs = normalize_path(file);
let path_text = abs.to_string_lossy().to_string();
writer.delete_term(Term::from_field_text(self.path_field, &path_text));
count += 1;
}
writer.commit().context("failed to commit index")?;
Ok(count)
}
pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>> {
self.search_with_options(query, SearchOptions::with_limit(limit))
}
pub fn search_with_options(
&self,
query: &str,
options: SearchOptions,
) -> Result<Vec<SearchHit>> {
let reader = self
.index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()
.context("failed to build index reader")?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&self.index, vec![self.contents_field]);
let parsed_query = query_parser
.parse_query(query)
.context("failed to parse query")?;
let top_docs = searcher
.search(&parsed_query, &TopDocs::with_limit(options.limit))
.context("failed to run search")?;
let mut snippet_generator = if let Some(snippet_options) = options.snippet {
if !self.contents_is_stored {
return Err(anyhow!(
"snippet is not available for this index. recreate index with snippet storage enabled"
));
}
let mut generator =
SnippetGenerator::create(&searcher, &*parsed_query, self.contents_field)
.context("failed to create snippet generator")?;
generator.set_max_num_chars(snippet_options.max_num_chars);
Some((generator, snippet_options.format))
} else {
None
};
let mut hits = Vec::with_capacity(top_docs.len());
for (score, doc_addr) in top_docs {
let retrieved = searcher
.doc::<tantivy::schema::TantivyDocument>(doc_addr)
.context("failed to load document")?;
let path = retrieved
.get_first(self.path_field)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
if !path.is_empty() {
let snippet = snippet_generator.as_mut().map(|(generator, format)| {
let snippet = generator.snippet_from_doc(&retrieved);
match format {
SnippetFormat::Text => snippet.fragment().to_string(),
SnippetFormat::Html => snippet.to_html(),
}
});
hits.push(SearchHit {
path,
score,
snippet,
});
}
}
Ok(hits)
}
pub fn supports_snippet(&self) -> bool {
self.contents_is_stored
}
}
/// Produces the path used as a document key in the index.
///
/// Prefers the canonical form of `path`. When canonicalization fails (for
/// example because the file does not exist), an absolute path is returned
/// unchanged and a relative path is joined onto the current working
/// directory; if the working directory is unavailable too, `path` is returned
/// as given.
fn normalize_path(path: &Path) -> PathBuf {
    if let Ok(canonical) = fs::canonicalize(path) {
        return canonical;
    }
    if path.is_absolute() {
        return path.to_path_buf();
    }
    match std::env::current_dir() {
        Ok(cwd) => cwd.join(path),
        Err(_) => path.to_path_buf(),
    }
}
/// Builds the index schema: a stored, untokenized `path` field and a
/// `contents` field indexed with the crate tokenizer (frequencies and
/// positions recorded). `contents` is additionally stored when
/// `with_snippet` is true.
fn build_schema(with_snippet: bool) -> Schema {
    let indexing = TextFieldIndexing::default()
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        .set_tokenizer(TOKENIZER_NAME);

    let mut contents_options = TextOptions::default().set_indexing_options(indexing);
    if with_snippet {
        contents_options = contents_options.set_stored();
    }

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("path", STRING | STORED);
    schema_builder.add_text_field("contents", contents_options);
    schema_builder.build()
}
/// Registers the analyzer selected by `mode` on `index` under `TOKENIZER_NAME`.
///
/// # Errors
/// Fails if the n-gram tokenizer cannot be constructed, if the Lindera
/// dictionary cannot be loaded, or if `LinderaIpadic` is requested in a build
/// without the `tokenizer-lindera-ipadic` feature.
fn register_tokenizer(index: &Index, mode: TokenizerMode) -> Result<()> {
    match mode {
        TokenizerMode::Ngram => {
            let ngram = NgramTokenizer::new(2, 3, false)?;
            let analyzer = TextAnalyzer::builder(ngram)
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
                .build();
            index.tokenizers().register(TOKENIZER_NAME, analyzer);
        }
        #[cfg(feature = "tokenizer-lindera-ipadic")]
        TokenizerMode::LinderaIpadic => {
            let dictionary = load_dictionary("embedded://ipadic")
                .context("failed to load Lindera IPADIC dictionary")?;
            let tokenizer =
                LinderaTokenizer::from_segmenter(Segmenter::new(Mode::Normal, dictionary, None));
            index.tokenizers().register(TOKENIZER_NAME, tokenizer);
        }
        #[cfg(not(feature = "tokenizer-lindera-ipadic"))]
        TokenizerMode::LinderaIpadic => {
            bail!(
                "Lindera tokenizer is not enabled. Build with `--features tokenizer-lindera-ipadic`."
            )
        }
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::{TokenizerMode, default_tokenizer_mode};

    /// Without the Lindera feature the default falls back to n-grams.
    #[cfg(not(feature = "tokenizer-lindera-ipadic"))]
    #[test]
    fn default_mode_is_ngram_without_lindera_feature() {
        let mode = default_tokenizer_mode();
        assert_eq!(mode, TokenizerMode::Ngram);
    }

    /// With the Lindera feature the default is the Lindera tokenizer.
    #[cfg(feature = "tokenizer-lindera-ipadic")]
    #[test]
    fn default_mode_is_lindera_with_feature() {
        let mode = default_tokenizer_mode();
        assert_eq!(mode, TokenizerMode::LinderaIpadic);
    }
}