siftdb_core/
ingest.rs

1use std::path::Path;
2use std::fs;
3use std::io::Read;
4use std::collections::HashMap;
5use ignore::WalkBuilder;
6use anyhow::Result;
7use crate::types::*;
8use crate::storage::{SegmentWriter, generate_line_table};
9use crate::index::{PathIndex, HandlesMap};
10
/// Tunable filters applied to every file considered during ingestion.
pub struct IngestOptions {
    /// Glob patterns a file path must match (at least one) to be ingested.
    pub include_patterns: Vec<String>,
    /// Glob patterns that reject a file path; consulted before the include list.
    pub exclude_patterns: Vec<String>,
    /// Files whose content is larger than this many bytes are skipped.
    pub max_file_bytes: u64,
    /// Fraction of control bytes in the sampled prefix above which a file
    /// is treated as binary and skipped.
    pub binary_ratio_threshold: f32,
}

impl Default for IngestOptions {
    /// Accepts every path, rejects common build/VCS directories, caps files
    /// at 10 MiB and uses a 30% control-byte ratio for binary detection.
    fn default() -> Self {
        const MEBIBYTE: u64 = 1024 * 1024;
        const DEFAULT_EXCLUDES: [&str; 5] = [
            "**/target/**",
            "**/node_modules/**",
            "**/.git/**",
            "**/build/**",
            "**/dist/**",
        ];

        let include_patterns = vec![String::from("**/*")];
        let exclude_patterns = DEFAULT_EXCLUDES
            .iter()
            .map(|glob| glob.to_string())
            .collect();

        Self {
            include_patterns,
            exclude_patterns,
            max_file_bytes: 10 * MEBIBYTE,
            binary_ratio_threshold: 0.3,
        }
    }
}
34
35pub struct Ingester {
36    collection_path: std::path::PathBuf,
37    options: IngestOptions,
38}
39
40impl Ingester {
41    pub fn new(collection_path: std::path::PathBuf, options: IngestOptions) -> Self {
42        Self {
43            collection_path,
44            options,
45        }
46    }
47    
48    pub fn ingest_from_fs(&mut self, source_path: &Path) -> Result<IngestStats> {
49        let mut stats = IngestStats::default();
50        let mut ingested_content = HashMap::new();
51        let mut path_mappings = HashMap::new();
52        let mut handle_metadata = HashMap::new();
53        
54        // Load existing indexes or create new ones
55        let mut path_index = if self.collection_path.join("index/path.json").exists() {
56            PathIndex::read_from_file(&self.collection_path.join("index/path.json"))?
57        } else {
58            PathIndex::new()
59        };
60        let mut handles_map = if self.collection_path.join("index/handles.json").exists() {
61            HandlesMap::read_from_file(&self.collection_path.join("index/handles.json"))?
62        } else {
63            HandlesMap::new()
64        };
65        
66        // Create segment writer
67        let store_path = self.collection_path.join("store");
68        let seg_id = self.find_next_segment_id(&store_path)?;
69        let mut writer = SegmentWriter::new(&store_path, seg_id)?;
70        
71        // Walk directory with ignore patterns
72        let walker = WalkBuilder::new(source_path)
73            .hidden(false) // Include hidden files
74            .git_ignore(true)
75            .git_global(true)
76            .git_exclude(true)
77            .build();
78        
79        for entry in walker {
80            let entry = entry?;
81            let path = entry.path();
82            
83            // Skip directories
84            if path.is_dir() {
85                continue;
86            }
87            
88            // Apply filters
89            if !self.should_include_file(path)? {
90                stats.skipped += 1;
91                continue;
92            }
93            
94            // Read file content
95            let content = match fs::read(path) {
96                Ok(content) => content,
97                Err(e) => {
98                    eprintln!("Warning: Failed to read {}: {}", path.display(), e);
99                    stats.errors += 1;
100                    continue;
101                }
102            };
103            
104            // Check file size
105            if content.len() > self.options.max_file_bytes as usize {
106                stats.skipped += 1;
107                continue;
108            }
109            
110            // Check if binary
111            if self.is_binary(&content) {
112                stats.skipped += 1;
113                continue;
114            }
115            
116            // Generate relative path from source
117            let relative_path = path.strip_prefix(source_path)
118                .unwrap_or(path)
119                .to_string_lossy()
120                .to_string();
121            
122            // Detect language
123            let lang = detect_language(path);
124            
125            // Generate line table
126            let line_table = generate_line_table(&content);
127            
128            // Create frame
129            let header = FileHeader::new(&content, &line_table, lang);
130            let frame = Frame {
131                header,
132                content: content.clone(),
133                line_table,
134            };
135            
136            // Write frame and get handle
137            let handle = path_index.add_path(relative_path.clone());
138            let metadata = writer.write_frame(&frame)?;
139            handles_map.add_handle(handle, metadata.clone());
140            
141            ingested_content.insert(handle, content);
142            path_mappings.insert(relative_path, handle);
143            handle_metadata.insert(handle, metadata);
144            
145            stats.ingested += 1;
146            
147            if stats.ingested % 100 == 0 {
148                println!("Ingested {} files...", stats.ingested);
149            }
150        }
151        
152        // Build and save indexes
153        let mut path_index = PathIndex::new();
154        for (path, handle) in path_mappings {
155            path_index.paths.insert(path, handle);
156        }
157        
158        let mut handles_map = HandlesMap::new();
159        for (handle, metadata) in handle_metadata {
160            handles_map.add_handle(handle, metadata);
161        }
162        
163        path_index.write_to_file(&self.collection_path.join("index/path.json"))?;
164        handles_map.write_to_file(&self.collection_path.join("index/handles.json"))?;
165        
166        // Build inverted index from file contents for O(1) search
167        println!("Building inverted index for O(1) search...");
168        let mut file_contents = HashMap::new();
169        
170        for (file_handle, content) in &ingested_content {
171            if let Ok(content_str) = String::from_utf8(content.clone()) {
172                file_contents.insert(*file_handle as u32, content_str);
173            }
174        }
175        
176        if !file_contents.is_empty() {
177            let inverted_index = crate::inverted_index::InvertedIndex::build_from_content(
178                file_contents,
179                &self.collection_path.join("index/terms.fst"),
180                &self.collection_path.join("index/posting_lists.json")
181            )?;
182            println!("✓ Inverted index built with {} terms", inverted_index.term_count());
183        }
184        
185        println!(
186            "Ingestion complete: {} files ingested, {} skipped, {} errors",
187            stats.ingested, stats.skipped, stats.errors
188        );
189        
190        Ok(stats)
191    }
192    
193    fn should_include_file(&self, path: &Path) -> Result<bool> {
194        let path_str = path.to_string_lossy();
195        
196        // Check exclude patterns first
197        for pattern in &self.options.exclude_patterns {
198            if self.glob_match(pattern, &path_str) {
199                return Ok(false);
200            }
201        }
202        
203        // Check include patterns
204        for pattern in &self.options.include_patterns {
205            if self.glob_match(pattern, &path_str) {
206                return Ok(true);
207            }
208        }
209        
210        Ok(false)
211    }
212    
213    fn is_binary(&self, content: &[u8]) -> bool {
214        if content.is_empty() {
215            return false;
216        }
217        
218        let mut non_printable = 0;
219        for &byte in content.iter().take(1024) { // Check first 1KB
220            if byte < 32 && byte != 9 && byte != 10 && byte != 13 {
221                non_printable += 1;
222            }
223        }
224        
225        let ratio = non_printable as f32 / content.len().min(1024) as f32;
226        ratio > self.options.binary_ratio_threshold
227    }
228    
229    fn glob_match(&self, pattern: &str, text: &str) -> bool {
230        // Simple glob matching for MVP
231        if pattern == "**/*" {
232            return true;
233        }
234        
235        // Handle **/*.ext patterns (recursive file extension matching)
236        if pattern.starts_with("**/") {
237            let suffix = &pattern[3..];
238            if suffix.starts_with("*.") {
239                // Extract extension from pattern like "*.rs"
240                let ext = &suffix[1..]; // Include the dot
241                return text.ends_with(ext);
242            } else {
243                return text.ends_with(suffix);
244            }
245        }
246        
247        if pattern.starts_with("**/") && pattern.ends_with("/**") {
248            let dir_name = &pattern[3..pattern.len()-3];
249            return text.contains(&format!("/{}/", dir_name)) || 
250                   text.starts_with(&format!("{}/", dir_name));
251        }
252        
253        if pattern.ends_with("/**") {
254            let prefix = &pattern[..pattern.len()-3];
255            return text.starts_with(prefix);
256        }
257        
258        // Handle simple extension patterns like "*.rs"
259        if pattern.starts_with("*.") {
260            let ext = &pattern[1..]; // Include the dot
261            return text.ends_with(ext);
262        }
263        
264        // Handle exact filename matches (check if pattern matches just the filename)
265        if !pattern.contains('/') && !pattern.contains('*') {
266            if let Some(filename) = text.split('/').last() {
267                if filename == pattern {
268                    return true;
269                }
270            }
271        }
272        
273        // Wildcard matching
274        if pattern.contains('*') {
275            return self.wildcard_match(pattern, text);
276        }
277        
278        pattern == text
279    }
280    
281    fn wildcard_match(&self, pattern: &str, text: &str) -> bool {
282        let pattern_chars: Vec<char> = pattern.chars().collect();
283        let text_chars: Vec<char> = text.chars().collect();
284        
285        self.match_recursive(&pattern_chars, &text_chars, 0, 0)
286    }
287    
288    fn match_recursive(&self, pattern: &[char], text: &[char], p_idx: usize, t_idx: usize) -> bool {
289        if p_idx == pattern.len() {
290            return t_idx == text.len();
291        }
292        
293        if pattern[p_idx] == '*' {
294            // Try matching 0 or more characters
295            for i in t_idx..=text.len() {
296                if self.match_recursive(pattern, text, p_idx + 1, i) {
297                    return true;
298                }
299            }
300            false
301        } else if t_idx < text.len() && (pattern[p_idx] == text[t_idx] || pattern[p_idx] == '?') {
302            self.match_recursive(pattern, text, p_idx + 1, t_idx + 1)
303        } else {
304            false
305        }
306    }
307    
308    fn find_next_segment_id(&self, store_path: &Path) -> Result<u32> {
309        let mut max_id = 0;
310        
311        if store_path.exists() {
312            for entry in fs::read_dir(store_path)? {
313                let entry = entry?;
314                let name = entry.file_name();
315                let name_str = name.to_string_lossy();
316                
317                if name_str.starts_with("seg-") && name_str.ends_with(".sift") {
318                    if let Some(id_str) = name_str.strip_prefix("seg-").and_then(|s| s.strip_suffix(".sift")) {
319                        if let Ok(id) = id_str.parse::<u32>() {
320                            max_id = max_id.max(id);
321                        }
322                    }
323                }
324            }
325        }
326        
327        Ok(max_id + 1)
328    }
329}
330
/// Counters describing the outcome of one ingestion run.
///
/// `Default` is derived (all counters zero) rather than hand-written, and
/// `PartialEq`/`Eq` allow two results to be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct IngestStats {
    /// Files stored during the run.
    pub ingested: u64,
    /// Files rejected by the pattern filters, the size limit or the binary check.
    pub skipped: u64,
    /// Files that could not be read.
    pub errors: u64,
}

impl IngestStats {
    /// Creates a record with every counter at zero.
    pub fn new() -> Self {
        Self::default()
    }
}