ygrep_core/index/schema.rs

use std::collections::VecDeque;
use tantivy::schema::{
    IndexRecordOption, Schema, TextFieldIndexing, TextOptions, FAST, STORED, STRING,
};
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, TextAnalyzer, TokenizerManager};

/// Schema version - increment when schema changes require reindexing
pub const SCHEMA_VERSION: u32 = 4;

/// Name of our custom code tokenizer
pub const CODE_TOKENIZER: &str = "code";

/// Register the code-aware tokenizer with an index
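///
/// Usage sketch (assumed call site, not defined in this file): call this on
/// `index.tokenizers()` right after creating or opening the index, before any
/// indexing or searching; custom tokenizers are not persisted with the index
/// and must be re-registered on every `Index` handle.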
pub fn register_tokenizers(tokenizer_manager: &TokenizerManager) {
    // Code tokenizer: keeps $, @, #, _ and - as part of tokens.
    // Built on the custom CodeTokenizer below, then lowercased, with overly
    // long tokens removed.
    let code_tokenizer = TextAnalyzer::builder(CodeTokenizer)
        .filter(LowerCaser)
        .filter(RemoveLongFilter::limit(100))
        .build();

    tokenizer_manager.register(CODE_TOKENIZER, code_tokenizer);
}

/// Custom tokenizer for code that preserves $, @, #, etc.
#[derive(Clone)]
struct CodeTokenizer;

impl tantivy::tokenizer::Tokenizer for CodeTokenizer {
    type TokenStream<'a> = CodeTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        CodeTokenStream {
            text,
            chars: text.char_indices().peekable(),
            token: tantivy::tokenizer::Token::default(),
            subtoken_buffer: VecDeque::new(),
            subtoken_position: 0,
        }
    }
}

struct CodeTokenStream<'a> {
    text: &'a str,
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
    token: tantivy::tokenizer::Token,
    /// Buffered subtokens to emit at the same position as the parent token
    subtoken_buffer: VecDeque<String>,
    /// The position value to use for buffered subtokens
    subtoken_position: usize,
}

/// Split a token into subtokens at camelCase and snake_case boundaries.
/// Returns subtokens only if there are 2+ parts; returns empty vec for simple tokens.
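/// For example (illustrative), "sendCampaign_v2" yields ["send", "Campaign", "v2"],
/// while "hello" yields an empty vec; lowercasing happens later in the analyzer.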
fn split_subtokens(text: &str) -> Vec<String> {
    let mut parts = Vec::new();

    // First handle snake_case: split on underscores
    let segments: Vec<&str> = text.split('_').filter(|s| !s.is_empty()).collect();

    // Process every segment (even when there were no underscores) for camelCase too
    for segment in &segments {
        // Split on camelCase boundaries within each segment
        let chars: Vec<char> = segment.chars().collect();
        let mut part_start = 0;

        for i in 1..chars.len() {
            // camelCase boundary: lowercase followed by uppercase
            if chars[i - 1].is_lowercase() && chars[i].is_uppercase() {
                let part: String = chars[part_start..i].iter().collect();
                if !part.is_empty() {
                    parts.push(part);
                }
                part_start = i;
            }
        }
        // Push the remaining part
        let part: String = chars[part_start..].iter().collect();
        if !part.is_empty() {
            parts.push(part);
        }
    }

    // Only return subtokens if we actually split the token
    if parts.len() <= 1 {
        return Vec::new();
    }

    parts
}

impl<'a> tantivy::tokenizer::TokenStream for CodeTokenStream<'a> {
    fn advance(&mut self) -> bool {
        // First, check if we have buffered subtokens to emit
        if let Some(subtoken) = self.subtoken_buffer.pop_front() {
            self.token.text.clear();
            self.token.text.push_str(&subtoken);
            // Keep the same position as the parent token
            self.token.position = self.subtoken_position;
            return true;
        }

        self.token.text.clear();
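        // Token::default() starts `position` at usize::MAX, so this wrapping
        // increment makes the very first token land at position 0.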
        self.token.position = self.token.position.wrapping_add(1);

        // Skip whitespace
        while let Some(&(_, c)) = self.chars.peek() {
            if !c.is_whitespace() {
                break;
            }
            self.chars.next();
        }

        let start = match self.chars.peek() {
            Some(&(pos, _)) => pos,
            None => return false,
        };

        // Collect token: alphanumeric + code chars ($, @, #, _, -)
        let mut end = start;
        while let Some(&(pos, c)) = self.chars.peek() {
            if c.is_alphanumeric() || c == '_' || c == '$' || c == '@' || c == '#' || c == '-' {
                end = pos + c.len_utf8();
                self.chars.next();
            } else if c.is_whitespace() {
                break;
            } else {
                // Other punctuation: consume it and end the current token here
                self.chars.next();
                if start == pos {
                    // Started with punctuation, skip and try again
                    return self.advance();
                }
                break;
            }
        }

        if end > start {
            self.token.offset_from = start;
            self.token.offset_to = end;
            let token_text = &self.text[start..end];
            self.token.text.push_str(token_text);

            // Check for camelCase/snake_case subtokens
            let subtokens = split_subtokens(token_text);
            if !subtokens.is_empty() {
                self.subtoken_position = self.token.position;
                for sub in subtokens {
                    self.subtoken_buffer.push_back(sub);
                }
            }

            true
        } else {
            false
        }
    }

    fn token(&self) -> &tantivy::tokenizer::Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
        &mut self.token
    }
}

/// Field names for the document index
pub mod fields {
    pub const DOC_ID: &str = "doc_id";
    pub const PATH: &str = "path";
    pub const WORKSPACE: &str = "workspace";
    pub const CONTENT: &str = "content";
    pub const MTIME: &str = "mtime";
    pub const SIZE: &str = "size";
    pub const EXTENSION: &str = "extension";
    pub const LINE_START: &str = "line_start";
    pub const LINE_END: &str = "line_end";
    pub const CHUNK_ID: &str = "chunk_id";
    pub const PARENT_DOC: &str = "parent_doc";
    pub const FILEPATH: &str = "filepath";
}

/// Build the Tantivy schema for document indexing
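///
/// Sketch of how the schema, tokenizer registration, and field handles fit
/// together (illustrative only):
///
/// ```ignore
/// let schema = build_document_schema();
/// let index = tantivy::Index::create_in_ram(schema.clone());
/// register_tokenizers(index.tokenizers());
/// let fields = SchemaFields::new(&schema);
/// ```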
pub fn build_document_schema() -> Schema {
    let mut schema_builder = Schema::builder();

    // Content field with positions for phrase queries
    // Uses our custom "code" tokenizer that preserves $, @, #, etc.
    let text_options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer(CODE_TOKENIZER)
                .set_index_option(IndexRecordOption::WithFreqsAndPositions),
        )
        .set_stored();

    // STRING + STORED + FAST for fields used in incremental indexing lookups
    let string_stored_fast = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer("raw")
                .set_index_option(IndexRecordOption::Basic),
        )
        .set_stored()
        .set_fast(None);

    // Document identification (fast for incremental index map building)
    schema_builder.add_text_field(fields::DOC_ID, string_stored_fast.clone());
    schema_builder.add_text_field(fields::PATH, string_stored_fast.clone());
    schema_builder.add_text_field(fields::WORKSPACE, STRING | STORED);

    // File metadata
    schema_builder.add_u64_field(fields::MTIME, FAST | STORED);
    schema_builder.add_u64_field(fields::SIZE, FAST | STORED);
    schema_builder.add_text_field(fields::EXTENSION, STRING | STORED);

    // Searchable file path (uses code tokenizer so path segments are searchable)
    let filepath_options = TextOptions::default()
        .set_indexing_options(
            TextFieldIndexing::default()
                .set_tokenizer(CODE_TOKENIZER)
                .set_index_option(IndexRecordOption::Basic),
        )
        .set_stored();
    schema_builder.add_text_field(fields::FILEPATH, filepath_options);

    // Content for full-text search
    schema_builder.add_text_field(fields::CONTENT, text_options);

    // Line range for the document/chunk
    schema_builder.add_u64_field(fields::LINE_START, FAST | STORED);
    schema_builder.add_u64_field(fields::LINE_END, FAST | STORED);

    // Chunk-specific fields (CHUNK_ID is fast for incremental index filtering)
    schema_builder.add_text_field(fields::CHUNK_ID, string_stored_fast);
    schema_builder.add_text_field(fields::PARENT_DOC, STRING | STORED);

    schema_builder.build()
}

/// Schema field handles for efficient access
#[derive(Clone)]
pub struct SchemaFields {
    pub doc_id: tantivy::schema::Field,
    pub path: tantivy::schema::Field,
    pub filepath: Option<tantivy::schema::Field>,
    pub workspace: tantivy::schema::Field,
    pub content: tantivy::schema::Field,
    pub mtime: tantivy::schema::Field,
    pub size: tantivy::schema::Field,
    pub extension: tantivy::schema::Field,
    pub line_start: tantivy::schema::Field,
    pub line_end: tantivy::schema::Field,
    pub chunk_id: tantivy::schema::Field,
    pub parent_doc: tantivy::schema::Field,
}

impl SchemaFields {
    pub fn new(schema: &Schema) -> Self {
        Self {
            doc_id: schema.get_field(fields::DOC_ID).unwrap(),
            path: schema.get_field(fields::PATH).unwrap(),
            filepath: schema.get_field(fields::FILEPATH).ok(),
            workspace: schema.get_field(fields::WORKSPACE).unwrap(),
            content: schema.get_field(fields::CONTENT).unwrap(),
            mtime: schema.get_field(fields::MTIME).unwrap(),
            size: schema.get_field(fields::SIZE).unwrap(),
            extension: schema.get_field(fields::EXTENSION).unwrap(),
            line_start: schema.get_field(fields::LINE_START).unwrap(),
            line_end: schema.get_field(fields::LINE_END).unwrap(),
            chunk_id: schema.get_field(fields::CHUNK_ID).unwrap(),
            parent_doc: schema.get_field(fields::PARENT_DOC).unwrap(),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use tantivy::tokenizer::TokenStream;

    /// Helper: tokenize text with the code tokenizer and return token strings
    fn tokenize(text: &str) -> Vec<String> {
        let mut tokenizer = TextAnalyzer::builder(CodeTokenizer)
            .filter(LowerCaser)
            .filter(RemoveLongFilter::limit(100))
            .build();
        let mut stream = tokenizer.token_stream(text);
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        tokens
    }

    #[test]
    fn test_schema_creation() {
        let schema = build_document_schema();
        let fields = SchemaFields::new(&schema);

        // Verify all fields are accessible
        assert!(schema.get_field(fields::DOC_ID).is_ok());
        assert!(schema.get_field(fields::PATH).is_ok());
        assert!(schema.get_field(fields::CONTENT).is_ok());

        // Verify field handles work
        let _ = fields.doc_id;
        let _ = fields.content;
    }

    #[test]
    fn test_tokenizer_preserves_code_chars() {
        // $variable, @decorator, #include should be preserved as tokens
        let tokens = tokenize("$variable @decorator #include");
        assert!(tokens.contains(&"$variable".to_string()));
        assert!(tokens.contains(&"@decorator".to_string()));
        assert!(tokens.contains(&"#include".to_string()));

        // Hyphen is kept (e.g., CSS class names like "my-class")
        let tokens = tokenize("my-class foo-bar");
        assert!(tokens.contains(&"my-class".to_string()));
        assert!(tokens.contains(&"foo-bar".to_string()));

        // Underscore is kept (identifiers like "hello_world")
        let tokens = tokenize("hello_world some_func");
        assert!(tokens.contains(&"hello_world".to_string()));
        assert!(tokens.contains(&"some_func".to_string()));
    }

    #[test]
    fn test_tokenizer_lowercases() {
        let tokens = tokenize("FnMain HelloWorld UPPER");
        // Full tokens are lowercased
        assert!(tokens.contains(&"fnmain".to_string()));
        assert!(tokens.contains(&"helloworld".to_string()));
        assert!(tokens.contains(&"upper".to_string()));
    }

    #[test]
    fn test_tokenizer_camelcase_subtokens() {
        let tokens = tokenize("sendCampaign");
        // Full token
        assert!(tokens.contains(&"sendcampaign".to_string()));
        // Subtokens from camelCase split
        assert!(tokens.contains(&"send".to_string()));
        assert!(tokens.contains(&"campaign".to_string()));
    }

    #[test]
    fn test_tokenizer_snake_case_subtokens() {
        let tokens = tokenize("send_campaign");
        // Full token
        assert!(tokens.contains(&"send_campaign".to_string()));
        // Subtokens from snake_case split
        assert!(tokens.contains(&"send".to_string()));
        assert!(tokens.contains(&"campaign".to_string()));
    }

    #[test]
    fn test_tokenizer_mixed_case_subtokens() {
        // camelCase within snake_case segments
        let tokens = tokenize("myQueue_sendCampaign");
        assert!(tokens.contains(&"myqueue_sendcampaign".to_string()));
        // Subtokens from both the snake_case and camelCase splits
        assert!(tokens.contains(&"my".to_string()));
        assert!(tokens.contains(&"queue".to_string()));
        assert!(tokens.contains(&"send".to_string()));
        assert!(tokens.contains(&"campaign".to_string()));
    }

    #[test]
    fn test_tokenizer_no_subtokens_for_simple() {
        // Single word should not produce subtokens
        let tokens = tokenize("hello");
        assert_eq!(tokens, vec!["hello".to_string()]);
    }

    #[test]
    fn test_tokenizer_removes_long_tokens() {
        let long_token = "a".repeat(101);
        let text = format!("short {} end", long_token);
        let tokens = tokenize(&text);
        assert!(tokens.contains(&"short".to_string()));
        assert!(tokens.contains(&"end".to_string()));
        // The 101-char token should be removed by RemoveLongFilter
        assert!(!tokens.iter().any(|t| t.len() > 100));
    }
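
    /// Illustrative check: subtokens share the parent token's position, so the
    /// extra subtokens do not shift the positions of later tokens.
    #[test]
    fn test_subtokens_share_parent_position() {
        let mut tokenizer = TextAnalyzer::builder(CodeTokenizer)
            .filter(LowerCaser)
            .build();
        let mut stream = tokenizer.token_stream("sendCampaign next");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push((stream.token().text.clone(), stream.token().position));
        }
        // Expected order: full token, its subtokens, then the following word.
        assert_eq!(tokens[0].0, "sendcampaign");
        assert_eq!(tokens[1].0, "send");
        assert_eq!(tokens[2].0, "campaign");
        assert_eq!(tokens[3].0, "next");
        // Subtokens share the parent's position; "next" comes one position later.
        assert_eq!(tokens[0].1, tokens[1].1);
        assert_eq!(tokens[1].1, tokens[2].1);
        assert_eq!(tokens[3].1, tokens[0].1 + 1);
    }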
}