Skip to main content

traverze/
lib.rs

1use std::fs;
2use std::path::{Path, PathBuf};
3
4#[cfg(not(feature = "tokenizer-lindera-ipadic"))]
5use anyhow::bail;
6use anyhow::{Context, Result, anyhow};
7#[cfg(feature = "tokenizer-lindera-ipadic")]
8use lindera::dictionary::load_dictionary;
9#[cfg(feature = "tokenizer-lindera-ipadic")]
10use lindera::mode::Mode;
11#[cfg(feature = "tokenizer-lindera-ipadic")]
12use lindera::segmenter::Segmenter;
13#[cfg(feature = "tokenizer-lindera-ipadic")]
14use lindera_tantivy::tokenizer::LinderaTokenizer;
15use tantivy::collector::TopDocs;
16use tantivy::query::QueryParser;
17use tantivy::schema::{
18    Field, IndexRecordOption, STORED, STRING, Schema, TextFieldIndexing, TextOptions, Value,
19};
20use tantivy::snippet::SnippetGenerator;
21use tantivy::tokenizer::{LowerCaser, NgramTokenizer, RemoveLongFilter, TextAnalyzer};
22use tantivy::{Index, ReloadPolicy, Term, doc};
23
/// Name under which the analyzer is registered on the index and referenced by
/// the `contents` field of the schema.
const TOKENIZER_NAME: &str = "traverze_ja";
/// Index directory used by `Traverze::new` when the caller does not supply one.
const DEFAULT_INDEX_DIR: &str = ".traverze-index";
26
/// Tokenization strategy applied to the indexed `contents` field.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizerMode {
    /// Character n-gram tokenization; always compiled in.
    Ngram,
    /// Lindera tokenization with the IPADIC dictionary; requires the
    /// `tokenizer-lindera-ipadic` cargo feature.
    LinderaIpadic,
}

/// Returns the tokenizer mode used when the caller does not pick one.
///
/// Lindera is preferred whenever the `tokenizer-lindera-ipadic` feature is
/// compiled in; otherwise the n-gram tokenizer is used.
pub fn default_tokenizer_mode() -> TokenizerMode {
    if cfg!(feature = "tokenizer-lindera-ipadic") {
        TokenizerMode::LinderaIpadic
    } else {
        TokenizerMode::Ngram
    }
}
43
/// One document returned by a search.
#[derive(Debug, Clone)]
pub struct SearchHit {
    /// Value of the stored `path` field of the matching document.
    pub path: String,
    /// Score reported by the search for this document.
    pub score: f32,
    /// Snippet of the matching contents; `None` unless snippet options were
    /// supplied with the search.
    pub snippet: Option<String>,
}
50
/// Output format of a generated snippet.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SnippetFormat {
    /// The raw snippet fragment.
    Text,
    /// The snippet rendered through the snippet's HTML conversion.
    Html,
}

/// Controls how snippets are produced for search hits.
#[derive(Debug, Clone, Copy)]
pub struct SnippetOptions {
    /// Maximum snippet length handed to the snippet generator.
    pub max_num_chars: usize,
    /// Rendering applied to each snippet.
    pub format: SnippetFormat,
}

impl Default for SnippetOptions {
    /// Plain-text snippets capped at 150 characters.
    fn default() -> Self {
        SnippetOptions {
            format: SnippetFormat::Text,
            max_num_chars: 150,
        }
    }
}
71
72#[derive(Debug, Clone, Copy)]
73pub struct SearchOptions {
74    pub limit: usize,
75    pub snippet: Option<SnippetOptions>,
76}
77
78impl SearchOptions {
79    pub fn with_limit(limit: usize) -> Self {
80        Self {
81            limit,
82            snippet: None,
83        }
84    }
85}
86
87impl Default for SearchOptions {
88    fn default() -> Self {
89        Self::with_limit(20)
90    }
91}
92
/// Search engine handle wrapping a tantivy index together with the schema
/// fields this crate reads and writes.
#[derive(Clone)]
pub struct Traverze {
    /// Underlying tantivy index.
    index: Index,
    /// Schema field named `path`.
    path_field: Field,
    /// Schema field named `contents`.
    contents_field: Field,
    /// Whether the `contents` field is stored in the opened index; snippets
    /// are only offered when it is.
    contents_is_stored: bool,
}
100
101impl Traverze {
102    pub fn new() -> Result<Self> {
103        Self::new_in_dir(Path::new(DEFAULT_INDEX_DIR))
104    }
105
106    pub fn new_in_dir(index_dir: &Path) -> Result<Self> {
107        Self::new_in_dir_with_mode(index_dir, default_tokenizer_mode())
108    }
109
110    pub fn new_in_dir_with_mode(index_dir: &Path, mode: TokenizerMode) -> Result<Self> {
111        Self::open_or_create(index_dir, mode, build_schema(false))
112    }
113
114    pub fn new_in_dir_for_indexing(
115        index_dir: &Path,
116        mode: TokenizerMode,
117        with_snippet: bool,
118    ) -> Result<Self> {
119        let engine = Self::open_or_create(index_dir, mode, build_schema(with_snippet))?;
120        if engine.supports_snippet() != with_snippet {
121            let expected = if with_snippet { "enabled" } else { "disabled" };
122            let actual = if engine.supports_snippet() {
123                "enabled"
124            } else {
125                "disabled"
126            };
127            return Err(anyhow!(
128                "index snippet support mismatch: expected {expected}, but existing index is {actual}"
129            ));
130        }
131        Ok(engine)
132    }
133
134    fn open_or_create(index_dir: &Path, mode: TokenizerMode, schema: Schema) -> Result<Self> {
135        fs::create_dir_all(index_dir)
136            .with_context(|| format!("failed to create index dir: {}", index_dir.display()))?;
137
138        let index = match Index::open_in_dir(index_dir) {
139            Ok(index) => index,
140            Err(_) => Index::create_in_dir(index_dir, schema)
141                .with_context(|| format!("failed to create index: {}", index_dir.display()))?,
142        };
143
144        register_tokenizer(&index, mode)?;
145        let schema = index.schema();
146        let path_field = schema
147            .get_field("path")
148            .map_err(|_| anyhow!("`path` field is missing in schema"))?;
149        let contents_field = schema
150            .get_field("contents")
151            .map_err(|_| anyhow!("`contents` field is missing in schema"))?;
152        let contents_is_stored = schema.get_field_entry(contents_field).is_stored();
153
154        Ok(Self {
155            index,
156            path_field,
157            contents_field,
158            contents_is_stored,
159        })
160    }
161
162    pub fn index_files(&self, files: &[PathBuf]) -> Result<usize> {
163        let mut writer = self
164            .index
165            .writer::<tantivy::schema::TantivyDocument>(50_000_000)
166            .context("failed to create index writer")?;
167
168        let mut count = 0usize;
169        for file in files {
170            if !file.is_file() {
171                continue;
172            }
173            let abs = normalize_path(file);
174            let content = fs::read_to_string(&abs)
175                .or_else(|_| fs::read(&abs).map(|b| String::from_utf8_lossy(&b).into_owned()))
176                .with_context(|| format!("failed to read file: {}", abs.display()))?;
177
178            let path_text = abs.to_string_lossy().to_string();
179            writer.delete_term(Term::from_field_text(self.path_field, &path_text));
180            writer
181                .add_document(doc!(
182                    self.path_field => path_text,
183                    self.contents_field => content,
184                ))
185                .context("failed to add document")?;
186            count += 1;
187        }
188
189        writer.commit().context("failed to commit index")?;
190        Ok(count)
191    }
192
193    pub fn remove_files(&self, files: &[PathBuf]) -> Result<usize> {
194        let mut writer = self
195            .index
196            .writer::<tantivy::schema::TantivyDocument>(50_000_000)
197            .context("failed to create index writer")?;
198
199        let mut count = 0usize;
200        for file in files {
201            let abs = normalize_path(file);
202            let path_text = abs.to_string_lossy().to_string();
203            writer.delete_term(Term::from_field_text(self.path_field, &path_text));
204            count += 1;
205        }
206
207        writer.commit().context("failed to commit index")?;
208        Ok(count)
209    }
210
211    pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>> {
212        self.search_with_options(query, SearchOptions::with_limit(limit))
213    }
214
215    pub fn search_with_options(
216        &self,
217        query: &str,
218        options: SearchOptions,
219    ) -> Result<Vec<SearchHit>> {
220        let reader = self
221            .index
222            .reader_builder()
223            .reload_policy(ReloadPolicy::OnCommitWithDelay)
224            .try_into()
225            .context("failed to build index reader")?;
226        let searcher = reader.searcher();
227
228        let query_parser = QueryParser::for_index(&self.index, vec![self.contents_field]);
229        let parsed_query = query_parser
230            .parse_query(query)
231            .context("failed to parse query")?;
232
233        let top_docs = searcher
234            .search(&parsed_query, &TopDocs::with_limit(options.limit))
235            .context("failed to run search")?;
236
237        let mut snippet_generator = if let Some(snippet_options) = options.snippet {
238            if !self.contents_is_stored {
239                return Err(anyhow!(
240                    "snippet is not available for this index. recreate index with snippet storage enabled"
241                ));
242            }
243            let mut generator =
244                SnippetGenerator::create(&searcher, &*parsed_query, self.contents_field)
245                    .context("failed to create snippet generator")?;
246            generator.set_max_num_chars(snippet_options.max_num_chars);
247            Some((generator, snippet_options.format))
248        } else {
249            None
250        };
251
252        let mut hits = Vec::with_capacity(top_docs.len());
253        for (score, doc_addr) in top_docs {
254            let retrieved = searcher
255                .doc::<tantivy::schema::TantivyDocument>(doc_addr)
256                .context("failed to load document")?;
257            let path = retrieved
258                .get_first(self.path_field)
259                .and_then(|v| v.as_str())
260                .unwrap_or("")
261                .to_string();
262            if !path.is_empty() {
263                let snippet = snippet_generator.as_mut().map(|(generator, format)| {
264                    let snippet = generator.snippet_from_doc(&retrieved);
265                    match format {
266                        SnippetFormat::Text => snippet.fragment().to_string(),
267                        SnippetFormat::Html => snippet.to_html(),
268                    }
269                });
270                hits.push(SearchHit {
271                    path,
272                    score,
273                    snippet,
274                });
275            }
276        }
277
278        Ok(hits)
279    }
280
    /// Returns `true` when the opened index stores the `contents` field,
    /// which is what `search_with_options` requires for snippets.
    pub fn supports_snippet(&self) -> bool {
        self.contents_is_stored
    }
284}
285
/// Converts `path` into the absolute form used as the document key.
///
/// Resolution order:
/// 1. the canonical path, when the file exists;
/// 2. otherwise the canonical parent directory joined with the file name, so
///    that a file which has already been deleted still maps to the same key
///    it was indexed under;
/// 3. otherwise the path made absolute against the current directory (or the
///    path unchanged if the current directory is unavailable).
fn normalize_path(path: &Path) -> PathBuf {
    if let Ok(canonical) = fs::canonicalize(path) {
        return canonical;
    }

    let absolute = if path.is_absolute() {
        path.to_path_buf()
    } else {
        match std::env::current_dir() {
            Ok(cwd) => cwd.join(path),
            Err(_) => return path.to_path_buf(),
        }
    };

    // The file itself is missing, but its directory usually still exists:
    // canonicalize that and re-attach the file name.
    let via_parent = absolute
        .parent()
        .zip(absolute.file_name())
        .and_then(|(parent, name)| fs::canonicalize(parent).ok().map(|dir| dir.join(name)));

    via_parent.unwrap_or(absolute)
}
297
298fn build_schema(with_snippet: bool) -> Schema {
299    let mut builder = Schema::builder();
300    builder.add_text_field("path", STRING | STORED);
301    let text_indexing = TextFieldIndexing::default()
302        .set_tokenizer(TOKENIZER_NAME)
303        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
304    let contents_options = if with_snippet {
305        TextOptions::default()
306            .set_stored()
307            .set_indexing_options(text_indexing)
308    } else {
309        TextOptions::default().set_indexing_options(text_indexing)
310    };
311    builder.add_text_field("contents", contents_options);
312    builder.build()
313}
314
315fn register_tokenizer(index: &Index, mode: TokenizerMode) -> Result<()> {
316    match mode {
317        TokenizerMode::Ngram => {
318            let analyzer = TextAnalyzer::builder(NgramTokenizer::new(2, 3, false)?)
319                .filter(RemoveLongFilter::limit(40))
320                .filter(LowerCaser)
321                .build();
322            index.tokenizers().register(TOKENIZER_NAME, analyzer);
323            Ok(())
324        }
325        TokenizerMode::LinderaIpadic => {
326            #[cfg(feature = "tokenizer-lindera-ipadic")]
327            {
328                let dictionary = load_dictionary("embedded://ipadic")
329                    .context("failed to load Lindera IPADIC dictionary")?;
330                let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
331                let tokenizer = LinderaTokenizer::from_segmenter(segmenter);
332                index.tokenizers().register(TOKENIZER_NAME, tokenizer);
333                Ok(())
334            }
335            #[cfg(not(feature = "tokenizer-lindera-ipadic"))]
336            {
337                bail!(
338                    "Lindera tokenizer is not enabled. Build with `--features tokenizer-lindera-ipadic`."
339                )
340            }
341        }
342    }
343}
344
#[cfg(test)]
mod tests {
    use super::{TokenizerMode, default_tokenizer_mode};

    /// Without the Lindera feature the n-gram tokenizer is the default.
    #[cfg(not(feature = "tokenizer-lindera-ipadic"))]
    #[test]
    fn default_mode_is_ngram_without_lindera_feature() {
        let mode = default_tokenizer_mode();
        assert_eq!(mode, TokenizerMode::Ngram);
    }

    /// With the Lindera feature compiled in, Lindera is the default.
    #[cfg(feature = "tokenizer-lindera-ipadic")]
    #[test]
    fn default_mode_is_lindera_with_feature() {
        let mode = default_tokenizer_mode();
        assert_eq!(mode, TokenizerMode::LinderaIpadic);
    }
}