ygrep_core/index/schema.rs

use std::collections::VecDeque;
use tantivy::schema::{
    IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, STORED, STRING,
};
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, TokenizerManager};

/// Current version of the index schema.
pub const SCHEMA_VERSION: u32 = 4;

/// Name under which the code-aware tokenizer is registered.
pub const CODE_TOKENIZER: &str = "code";

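/// Registers the code tokenizer, wrapped with lowercasing and a
/// 100-character length limit, with the given `TokenizerManager`.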
pub fn register_tokenizers(tokenizer_manager: &TokenizerManager) {
    let code_tokenizer = TextAnalyzer::builder(CodeTokenizer)
        .filter(LowerCaser)
        .filter(RemoveLongFilter::limit(100))
        .build();

    tokenizer_manager.register(CODE_TOKENIZER, code_tokenizer);
}

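/// Tokenizer tuned for source code: tokens are split on whitespace, keep
/// code-significant characters (`_`, `$`, `@`, `#`, `-`), and compound
/// identifiers additionally emit their camelCase / snake_case parts as
/// subtokens at the same position as the full token.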
#[derive(Clone)]
struct CodeTokenizer;

impl tantivy::tokenizer::Tokenizer for CodeTokenizer {
    type TokenStream<'a> = CodeTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CodeTokenStream {
            text,
            chars: text.char_indices().peekable(),
            token: tantivy::tokenizer::Token::default(),
            subtoken_buffer: VecDeque::new(),
            subtoken_position: 0,
        }
    }
}

struct CodeTokenStream<'a> {
    text: &'a str,
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    token: tantivy::tokenizer::Token,
    /// Pending camelCase / snake_case subtokens of the token just emitted.
    subtoken_buffer: VecDeque<String>,
    /// Position of the parent token, shared by its subtokens.
    subtoken_position: usize,
}

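/// Splits an identifier into its snake_case and camelCase parts, without
/// lowercasing them (the `LowerCaser` filter runs afterwards):
/// `"send_campaign"` -> `["send", "campaign"]`,
/// `"sendCampaign"` -> `["send", "Campaign"]`.
/// Single-part identifiers such as `"hello"` return an empty `Vec`,
/// meaning no subtokens are emitted.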
fn split_subtokens(text: &str) -> Vec<String> {
    let mut parts = Vec::new();

    // Split on underscores first (snake_case).
    let segments: Vec<&str> = text.split('_').filter(|s| !s.is_empty()).collect();

    for segment in &segments {
        let chars: Vec<char> = segment.chars().collect();
        let mut part_start = 0;

        // Then split each segment at lowercase-to-uppercase boundaries (camelCase).
        for i in 1..chars.len() {
            if chars[i - 1].is_lowercase() && chars[i].is_uppercase() {
                let part: String = chars[part_start..i].iter().collect();
                if !part.is_empty() {
                    parts.push(part);
                }
                part_start = i;
            }
        }
        let part: String = chars[part_start..].iter().collect();
        if !part.is_empty() {
            parts.push(part);
        }
    }

    // A single part means the identifier is not compound; emit no subtokens.
    if parts.len() <= 1 {
        return Vec::new();
    }

    parts
}

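// The stream first drains any buffered subtokens, then scans forward for the
// next whitespace-separated token; subtokens are indexed at the same position
// as the full identifier they came from.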
impl<'a> tantivy::tokenizer::TokenStream for CodeTokenStream<'a> {
    fn advance(&mut self) -> bool {
        // Emit any pending subtokens of the previous token first, at the same
        // position as their parent.
        if let Some(subtoken) = self.subtoken_buffer.pop_front() {
            self.token.text.clear();
            self.token.text.push_str(&subtoken);
            self.token.position = self.subtoken_position;
            return true;
        }

        self.token.text.clear();
        self.token.position = self.token.position.wrapping_add(1);

        // Skip leading whitespace.
        while let Some(&(_, c)) = self.chars.peek() {
            if !c.is_whitespace() {
                break;
            }
            self.chars.next();
        }

        let start = match self.chars.peek() {
            Some(&(pos, _)) => pos,
            None => return false,
        };

        // Accumulate identifier characters, keeping code-significant symbols.
        let mut end = start;
        while let Some(&(pos, c)) = self.chars.peek() {
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' || c == '#' || c == '-' {
                end = pos + c.len_utf8();
                self.chars.next();
            } else if c.is_whitespace() {
                break;
            } else {
                // Other punctuation terminates the token. If nothing has been
                // accumulated yet, skip the character and look for the next token.
                self.chars.next();
                if start == pos {
                    return self.advance();
                }
                break;
            }
        }

        if end > start {
            self.token.offset_from = start;
            self.token.offset_to = end;
            let token_text = &self.text[start..end];
            self.token.text.push_str(token_text);

            // Queue camelCase / snake_case parts to be emitted on later calls.
            let subtokens = split_subtokens(token_text);
            if !subtokens.is_empty() {
                self.subtoken_position = self.token.position;
                for sub in subtokens {
                    self.subtoken_buffer.push_back(sub);
                }
            }

            true
        } else {
            false
        }
    }

    fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

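/// Names of the fields stored in the tantivy index schema.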
pub mod fields {
    pub const DOC_ID: &str = "doc_id";
    pub const PATH: &str = "path";
    pub const WORKSPACE: &str = "workspace";
    pub const CONTENT: &str = "content";
    pub const MTIME: &str = "mtime";
    pub const SIZE: &str = "size";
    pub const EXTENSION: &str = "extension";
    pub const LINE_START: &str = "line_start";
    pub const LINE_END: &str = "line_end";
    pub const CHUNK_ID: &str = "chunk_id";
    pub const PARENT_DOC: &str = "parent_doc";
    pub const FILEPATH: &str = "filepath";
}

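/// Builds the document schema. `content` and `filepath` are indexed with the
/// code tokenizer (`content` additionally records frequencies and positions);
/// `doc_id`, `path`, and `chunk_id` are raw-tokenized stored fast fields;
/// `workspace`, `extension`, and `parent_doc` are plain string fields; and
/// `mtime`, `size`, `line_start`, and `line_end` are stored fast u64 fields.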
pub fn build_document_schema() -> Schema {
    let mut schema_builder = Schema::builder();

    // Full-text options: code tokenizer with term frequencies and positions, stored.
    let text_options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer(CODE_TOKENIZER)
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        )
        .set_stored();

    // Raw-tokenized identifiers, stored and available as fast fields.
    let string_stored_fast = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("raw")
                .set_index_option(IndexRecordOption::Basic),
        )
        .set_stored()
        .set_fast(None);

    schema_builder.add_text_field(fields::DOC_ID, string_stored_fast.clone());
    schema_builder.add_text_field(fields::PATH, string_stored_fast.clone());
    schema_builder.add_text_field(fields::WORKSPACE, STRING | STORED);

    schema_builder.add_u64_field(fields::MTIME, FAST | STORED);
    schema_builder.add_u64_field(fields::SIZE, FAST | STORED);
    schema_builder.add_text_field(fields::EXTENSION, STRING | STORED);

    // File path indexed with the code tokenizer so path components are searchable.
    let filepath_options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer(CODE_TOKENIZER)
                .set_index_option(IndexRecordOption::Basic),
        )
        .set_stored();
    schema_builder.add_text_field(fields::FILEPATH, filepath_options);

    schema_builder.add_text_field(fields::CONTENT, text_options);

    schema_builder.add_u64_field(fields::LINE_START, FAST | STORED);
    schema_builder.add_u64_field(fields::LINE_END, FAST | STORED);

    schema_builder.add_text_field(fields::CHUNK_ID, string_stored_fast);
    schema_builder.add_text_field(fields::PARENT_DOC, STRING | STORED);

    schema_builder.build()
}

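/// Resolved `Field` handles for every schema field. `filepath` is resolved
/// with `.ok()`, so schemas that lack the field still load.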
#[derive(Clone)]
pub struct SchemaFields {
    pub doc_id: tantivy::schema::Field,
    pub path: tantivy::schema::Field,
    pub filepath: Option<tantivy::schema::Field>,
    pub workspace: tantivy::schema::Field,
    pub content: tantivy::schema::Field,
    pub mtime: tantivy::schema::Field,
    pub size: tantivy::schema::Field,
    pub extension: tantivy::schema::Field,
    pub line_start: tantivy::schema::Field,
    pub line_end: tantivy::schema::Field,
    pub chunk_id: tantivy::schema::Field,
    pub parent_doc: tantivy::schema::Field,
}

impl SchemaFields {
    pub fn new(schema: &Schema) -> Self {
        Self {
            doc_id: schema.get_field(fields::DOC_ID).unwrap(),
            path: schema.get_field(fields::PATH).unwrap(),
            filepath: schema.get_field(fields::FILEPATH).ok(),
            workspace: schema.get_field(fields::WORKSPACE).unwrap(),
            content: schema.get_field(fields::CONTENT).unwrap(),
            mtime: schema.get_field(fields::MTIME).unwrap(),
            size: schema.get_field(fields::SIZE).unwrap(),
            extension: schema.get_field(fields::EXTENSION).unwrap(),
            line_start: schema.get_field(fields::LINE_START).unwrap(),
            line_end: schema.get_field(fields::LINE_END).unwrap(),
            chunk_id: schema.get_field(fields::CHUNK_ID).unwrap(),
            parent_doc: schema.get_field(fields::PARENT_DOC).unwrap(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use tantivy::tokenizer::TokenStream;

    fn tokenize(text: &str) -> Vec<String> {
        let mut tokenizer = TextAnalyzer::builder(CodeTokenizer)
            .filter(LowerCaser)
            .filter(RemoveLongFilter::limit(100))
            .build();
        let mut stream = tokenizer.token_stream(text);
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        tokens
    }

    #[test]
    fn test_schema_creation() {
        let schema = build_document_schema();
        let schema_fields = SchemaFields::new(&schema);

        assert!(schema.get_field(fields::DOC_ID).is_ok());
        assert!(schema.get_field(fields::PATH).is_ok());
        assert!(schema.get_field(fields::CONTENT).is_ok());

        let _ = schema_fields.doc_id;
        let _ = schema_fields.content;
    }

    #[test]
    fn test_tokenizer_preserves_code_chars() {
        let tokens = tokenize("$variable @decorator #include");
        assert!(tokens.contains(&"$variable".to_string()));
        assert!(tokens.contains(&"@decorator".to_string()));
        assert!(tokens.contains(&"#include".to_string()));

        let tokens = tokenize("my-class foo-bar");
        assert!(tokens.contains(&"my-class".to_string()));
        assert!(tokens.contains(&"foo-bar".to_string()));

        let tokens = tokenize("hello_world some_func");
        assert!(tokens.contains(&"hello_world".to_string()));
        assert!(tokens.contains(&"some_func".to_string()));
    }

    #[test]
    fn test_tokenizer_lowercases() {
        let tokens = tokenize("FnMain HelloWorld UPPER");
        assert!(tokens.contains(&"fnmain".to_string()));
        assert!(tokens.contains(&"helloworld".to_string()));
        assert!(tokens.contains(&"upper".to_string()));
    }

    #[test]
    fn test_tokenizer_camelcase_subtokens() {
        let tokens = tokenize("sendCampaign");
        assert!(tokens.contains(&"sendcampaign".to_string()));
        assert!(tokens.contains(&"send".to_string()));
        assert!(tokens.contains(&"campaign".to_string()));
    }

    #[test]
    fn test_tokenizer_snake_case_subtokens() {
        let tokens = tokenize("send_campaign");
        assert!(tokens.contains(&"send_campaign".to_string()));
        assert!(tokens.contains(&"send".to_string()));
        assert!(tokens.contains(&"campaign".to_string()));
    }

    #[test]
    fn test_tokenizer_mixed_case_subtokens() {
        let tokens = tokenize("myQueue_sendCampaign");
        assert!(tokens.contains(&"myqueue_sendcampaign".to_string()));
        assert!(tokens.contains(&"my".to_string()));
        assert!(tokens.contains(&"queue".to_string()));
        assert!(tokens.contains(&"send".to_string()));
        assert!(tokens.contains(&"campaign".to_string()));
    }

    #[test]
    fn test_tokenizer_no_subtokens_for_simple() {
        let tokens = tokenize("hello");
        assert_eq!(tokens, vec!["hello".to_string()]);
    }

    #[test]
    fn test_tokenizer_removes_long_tokens() {
        let long_token = "a".repeat(101);
        let text = format!("short {} end", long_token);
        let tokens = tokenize(&text);
        assert!(tokens.contains(&"short".to_string()));
        assert!(tokens.contains(&"end".to_string()));
        assert!(!tokens.iter().any(|t| t.len() > 100));
    }
}
389}