1use std::fs;
2use std::path::{Path, PathBuf};
3
4#[cfg(not(feature = "tokenizer-lindera-ipadic"))]
5use anyhow::bail;
6use anyhow::{Context, Result, anyhow};
7#[cfg(feature = "tokenizer-lindera-ipadic")]
8use lindera::dictionary::load_dictionary;
9#[cfg(feature = "tokenizer-lindera-ipadic")]
10use lindera::mode::Mode;
11#[cfg(feature = "tokenizer-lindera-ipadic")]
12use lindera::segmenter::Segmenter;
13#[cfg(feature = "tokenizer-lindera-ipadic")]
14use lindera_tantivy::tokenizer::LinderaTokenizer;
15use tantivy::collector::TopDocs;
16use tantivy::query::QueryParser;
17use tantivy::schema::{
18 Field, IndexRecordOption, STORED, STRING, Schema, TextFieldIndexing, TextOptions, Value,
19};
20use tantivy::snippet::SnippetGenerator;
21use tantivy::tokenizer::{LowerCaser, NgramTokenizer, RemoveLongFilter, TextAnalyzer};
22use tantivy::{Index, ReloadPolicy, Term, doc};
23
/// Name under which the text analyzer is registered on the index and by which
/// the `contents` field refers to it in the schema.
const TOKENIZER_NAME: &str = "traverze_ja";

/// Index directory used by [`Traverze::new`] when the caller does not pick one.
const DEFAULT_INDEX_DIR: &str = ".traverze-index";
26
/// Selects which tokenizer is registered for the indexed file contents.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizerMode {
    /// Character n-gram tokenization; always available.
    Ngram,
    /// Lindera morphological tokenization with the IPADIC dictionary; only
    /// usable when the crate is built with the `tokenizer-lindera-ipadic`
    /// feature.
    LinderaIpadic,
}
32
33#[cfg(feature = "tokenizer-lindera-ipadic")]
34pub fn default_tokenizer_mode() -> TokenizerMode {
35 TokenizerMode::LinderaIpadic
37}
38
39#[cfg(not(feature = "tokenizer-lindera-ipadic"))]
40pub fn default_tokenizer_mode() -> TokenizerMode {
41 TokenizerMode::Ngram
42}
43
/// One document returned by a search.
#[derive(Debug, Clone)]
pub struct SearchHit {
    /// Value of the stored `path` field of the matching document.
    pub path: String,
    /// Relevance score reported by the search collector for this document.
    pub score: f32,
    /// Highlighted excerpt of the contents; `None` when the search was run
    /// without snippet options.
    pub snippet: Option<String>,
}
50
/// Output form of a generated snippet.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SnippetFormat {
    /// The plain snippet fragment, without any markup.
    Text,
    /// The snippet rendered as HTML by the snippet generator.
    Html,
}
56
57#[derive(Debug, Clone, Copy)]
58pub struct SnippetOptions {
59 pub max_num_chars: usize,
60 pub format: SnippetFormat,
61}
62
63impl Default for SnippetOptions {
64 fn default() -> Self {
65 Self {
66 max_num_chars: 150,
67 format: SnippetFormat::Text,
68 }
69 }
70}
71
72#[derive(Debug, Clone, Copy)]
73pub struct SearchOptions {
74 pub limit: usize,
75 pub snippet: Option<SnippetOptions>,
76}
77
78impl SearchOptions {
79 pub fn with_limit(limit: usize) -> Self {
80 Self {
81 limit,
82 snippet: None,
83 }
84 }
85}
86
87impl Default for SearchOptions {
88 fn default() -> Self {
89 Self::with_limit(20)
90 }
91}
92
93#[derive(Clone)]
94pub struct Traverze {
95 index: Index,
96 path_field: Field,
97 contents_field: Field,
98 contents_is_stored: bool,
99}
100
101impl Traverze {
102 pub fn new() -> Result<Self> {
103 Self::new_in_dir(Path::new(DEFAULT_INDEX_DIR))
104 }
105
106 pub fn new_in_dir(index_dir: &Path) -> Result<Self> {
107 Self::new_in_dir_with_mode(index_dir, default_tokenizer_mode())
108 }
109
110 pub fn new_in_dir_with_mode(index_dir: &Path, mode: TokenizerMode) -> Result<Self> {
111 Self::open_or_create(index_dir, mode, build_schema(false))
112 }
113
114 pub fn new_in_dir_for_indexing(
115 index_dir: &Path,
116 mode: TokenizerMode,
117 with_snippet: bool,
118 ) -> Result<Self> {
119 let engine = Self::open_or_create(index_dir, mode, build_schema(with_snippet))?;
120 if engine.supports_snippet() != with_snippet {
121 let expected = if with_snippet { "enabled" } else { "disabled" };
122 let actual = if engine.supports_snippet() {
123 "enabled"
124 } else {
125 "disabled"
126 };
127 return Err(anyhow!(
128 "index snippet support mismatch: expected {expected}, but existing index is {actual}"
129 ));
130 }
131 Ok(engine)
132 }
133
134 fn open_or_create(index_dir: &Path, mode: TokenizerMode, schema: Schema) -> Result<Self> {
135 fs::create_dir_all(index_dir)
136 .with_context(|| format!("failed to create index dir: {}", index_dir.display()))?;
137
138 let index = match Index::open_in_dir(index_dir) {
139 Ok(index) => index,
140 Err(_) => Index::create_in_dir(index_dir, schema)
141 .with_context(|| format!("failed to create index: {}", index_dir.display()))?,
142 };
143
144 register_tokenizer(&index, mode)?;
145 let schema = index.schema();
146 let path_field = schema
147 .get_field("path")
148 .map_err(|_| anyhow!("`path` field is missing in schema"))?;
149 let contents_field = schema
150 .get_field("contents")
151 .map_err(|_| anyhow!("`contents` field is missing in schema"))?;
152 let contents_is_stored = schema.get_field_entry(contents_field).is_stored();
153
154 Ok(Self {
155 index,
156 path_field,
157 contents_field,
158 contents_is_stored,
159 })
160 }
161
162 pub fn index_files(&self, files: &[PathBuf]) -> Result<usize> {
163 let mut writer = self
164 .index
165 .writer::<tantivy::schema::TantivyDocument>(50_000_000)
166 .context("failed to create index writer")?;
167
168 let mut count = 0usize;
169 for file in files {
170 if !file.is_file() {
171 continue;
172 }
173 let abs = normalize_path(file);
174 let content = fs::read_to_string(&abs)
175 .or_else(|_| fs::read(&abs).map(|b| String::from_utf8_lossy(&b).into_owned()))
176 .with_context(|| format!("failed to read file: {}", abs.display()))?;
177
178 let path_text = abs.to_string_lossy().to_string();
179 writer.delete_term(Term::from_field_text(self.path_field, &path_text));
180 writer
181 .add_document(doc!(
182 self.path_field => path_text,
183 self.contents_field => content,
184 ))
185 .context("failed to add document")?;
186 count += 1;
187 }
188
189 writer.commit().context("failed to commit index")?;
190 Ok(count)
191 }
192
193 pub fn remove_files(&self, files: &[PathBuf]) -> Result<usize> {
194 let mut writer = self
195 .index
196 .writer::<tantivy::schema::TantivyDocument>(50_000_000)
197 .context("failed to create index writer")?;
198
199 let mut count = 0usize;
200 for file in files {
201 let abs = normalize_path(file);
202 let path_text = abs.to_string_lossy().to_string();
203 writer.delete_term(Term::from_field_text(self.path_field, &path_text));
204 count += 1;
205 }
206
207 writer.commit().context("failed to commit index")?;
208 Ok(count)
209 }
210
211 pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>> {
212 self.search_with_options(query, SearchOptions::with_limit(limit))
213 }
214
215 pub fn search_with_options(
216 &self,
217 query: &str,
218 options: SearchOptions,
219 ) -> Result<Vec<SearchHit>> {
220 let reader = self
221 .index
222 .reader_builder()
223 .reload_policy(ReloadPolicy::OnCommitWithDelay)
224 .try_into()
225 .context("failed to build index reader")?;
226 let searcher = reader.searcher();
227
228 let query_parser = QueryParser::for_index(&self.index, vec![self.contents_field]);
229 let parsed_query = query_parser
230 .parse_query(query)
231 .context("failed to parse query")?;
232
233 let top_docs = searcher
234 .search(&parsed_query, &TopDocs::with_limit(options.limit))
235 .context("failed to run search")?;
236
237 let mut snippet_generator = if let Some(snippet_options) = options.snippet {
238 if !self.contents_is_stored {
239 return Err(anyhow!(
240 "snippet is not available for this index. recreate index with snippet storage enabled"
241 ));
242 }
243 let mut generator =
244 SnippetGenerator::create(&searcher, &*parsed_query, self.contents_field)
245 .context("failed to create snippet generator")?;
246 generator.set_max_num_chars(snippet_options.max_num_chars);
247 Some((generator, snippet_options.format))
248 } else {
249 None
250 };
251
252 let mut hits = Vec::with_capacity(top_docs.len());
253 for (score, doc_addr) in top_docs {
254 let retrieved = searcher
255 .doc::<tantivy::schema::TantivyDocument>(doc_addr)
256 .context("failed to load document")?;
257 let path = retrieved
258 .get_first(self.path_field)
259 .and_then(|v| v.as_str())
260 .unwrap_or("")
261 .to_string();
262 if !path.is_empty() {
263 let snippet = snippet_generator.as_mut().map(|(generator, format)| {
264 let snippet = generator.snippet_from_doc(&retrieved);
265 match format {
266 SnippetFormat::Text => snippet.fragment().to_string(),
267 SnippetFormat::Html => snippet.to_html(),
268 }
269 });
270 hits.push(SearchHit {
271 path,
272 score,
273 snippet,
274 });
275 }
276 }
277
278 Ok(hits)
279 }
280
281 pub fn supports_snippet(&self) -> bool {
282 self.contents_is_stored
283 }
284}
285
/// Turns `path` into the form used as a document key.
///
/// Prefers the canonical path. When canonicalization fails (for example
/// because the file does not exist), an absolute path is returned unchanged
/// and a relative path is joined onto the current working directory; if the
/// working directory cannot be determined either, the path is returned as is.
fn normalize_path(path: &Path) -> PathBuf {
    if let Ok(canonical) = fs::canonicalize(path) {
        return canonical;
    }
    if path.is_absolute() {
        return path.to_path_buf();
    }
    match std::env::current_dir() {
        Ok(cwd) => cwd.join(path),
        Err(_) => path.to_path_buf(),
    }
}
297
298fn build_schema(with_snippet: bool) -> Schema {
299 let mut builder = Schema::builder();
300 builder.add_text_field("path", STRING | STORED);
301 let text_indexing = TextFieldIndexing::default()
302 .set_tokenizer(TOKENIZER_NAME)
303 .set_index_option(IndexRecordOption::WithFreqsAndPositions);
304 let contents_options = if with_snippet {
305 TextOptions::default()
306 .set_stored()
307 .set_indexing_options(text_indexing)
308 } else {
309 TextOptions::default().set_indexing_options(text_indexing)
310 };
311 builder.add_text_field("contents", contents_options);
312 builder.build()
313}
314
315fn register_tokenizer(index: &Index, mode: TokenizerMode) -> Result<()> {
316 match mode {
317 TokenizerMode::Ngram => {
318 let analyzer = TextAnalyzer::builder(NgramTokenizer::new(2, 3, false)?)
319 .filter(RemoveLongFilter::limit(40))
320 .filter(LowerCaser)
321 .build();
322 index.tokenizers().register(TOKENIZER_NAME, analyzer);
323 Ok(())
324 }
325 TokenizerMode::LinderaIpadic => {
326 #[cfg(feature = "tokenizer-lindera-ipadic")]
327 {
328 let dictionary = load_dictionary("embedded://ipadic")
329 .context("failed to load Lindera IPADIC dictionary")?;
330 let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
331 let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
332 index.tokenizers().register(TOKENIZER_NAME, tokenizer);
333 Ok(())
334 }
335 #[cfg(not(feature = "tokenizer-lindera-ipadic"))]
336 {
337 bail!(
338 "Lindera tokenizer is not enabled. Build with `--features tokenizer-lindera-ipadic`."
339 )
340 }
341 }
342 }
343}
344
#[cfg(test)]
mod tests {
    use crate::{TokenizerMode, default_tokenizer_mode};

    /// Without the Lindera feature the n-gram tokenizer is the default.
    #[test]
    #[cfg(not(feature = "tokenizer-lindera-ipadic"))]
    fn default_mode_is_ngram_without_lindera_feature() {
        let mode = default_tokenizer_mode();
        assert_eq!(mode, TokenizerMode::Ngram);
    }

    /// With the Lindera feature the Lindera tokenizer is the default.
    #[test]
    #[cfg(feature = "tokenizer-lindera-ipadic")]
    fn default_mode_is_lindera_with_feature() {
        let mode = default_tokenizer_mode();
        assert_eq!(mode, TokenizerMode::LinderaIpadic);
    }
}