// siftdb_core/ingest.rs

use std::fs;
use std::io;
use std::path::Path;

use anyhow::Result;
use ignore::WalkBuilder;

use crate::index::{HandlesMap, PathIndex};
use crate::storage::{generate_line_table, SegmentWriter};
use crate::types::{detect_language, FileHeader, Frame};

/// Filters that decide which files get ingested.
#[derive(Debug, Clone)]
pub struct IngestOptions {
    /// Glob patterns; a path must match at least one of them to be ingested.
    pub include_patterns: Vec<String>,
    /// Glob patterns that reject a path. They are checked before
    /// `include_patterns`, so an exclusion always wins.
    pub exclude_patterns: Vec<String>,
    /// Files larger than this many bytes are skipped.
    pub max_file_bytes: u64,
    /// Fraction of control bytes (sampled from the first 1024 bytes of a file)
    /// above which the file is treated as binary and skipped.
    pub binary_ratio_threshold: f32,
}

16impl Default for IngestOptions {
17    fn default() -> Self {
18        Self {
19            include_patterns: vec!["**/*".to_string()],
20            exclude_patterns: vec![
21                "**/target/**".to_string(),
22                "**/node_modules/**".to_string(),
23                "**/.git/**".to_string(),
24                "**/build/**".to_string(),
25                "**/dist/**".to_string(),
26            ],
27            max_file_bytes: 10 * 1024 * 1024, // 10MB
28            binary_ratio_threshold: 0.3, // 30% non-printable chars = binary
29        }
30    }
31}
32
33pub struct Ingester {
34    collection_path: std::path::PathBuf,
35    options: IngestOptions,
36}
37
38impl Ingester {
39    pub fn new(collection_path: std::path::PathBuf, options: IngestOptions) -> Self {
40        Self {
41            collection_path,
42            options,
43        }
44    }
45    
46    pub fn ingest_from_fs(&mut self, source_path: &Path) -> Result<IngestStats> {
47        let mut stats = IngestStats::new();
48        
49        // Load existing indexes
50        let mut path_index = PathIndex::read_from_file(
51            &self.collection_path.join("index/path.json")
52        )?;
53        let mut handles_map = HandlesMap::read_from_file(
54            &self.collection_path.join("index/handles.json")
55        )?;
56        
57        // Create segment writer
58        let store_path = self.collection_path.join("store");
59        let seg_id = self.find_next_segment_id(&store_path)?;
60        let mut writer = SegmentWriter::new(&store_path, seg_id)?;
61        
62        // Walk directory with ignore patterns
63        let walker = WalkBuilder::new(source_path)
64            .hidden(false) // Include hidden files
65            .git_ignore(true)
66            .git_global(true)
67            .git_exclude(true)
68            .build();
69        
70        for entry in walker {
71            let entry = entry?;
72            let path = entry.path();
73            
74            // Skip directories
75            if path.is_dir() {
76                continue;
77            }
78            
79            // Apply filters
80            if !self.should_include_file(path)? {
81                stats.skipped += 1;
82                continue;
83            }
84            
85            // Read file content
86            let content = match fs::read(path) {
87                Ok(content) => content,
88                Err(e) => {
89                    eprintln!("Warning: Failed to read {}: {}", path.display(), e);
90                    stats.errors += 1;
91                    continue;
92                }
93            };
94            
95            // Check file size
96            if content.len() > self.options.max_file_bytes as usize {
97                stats.skipped += 1;
98                continue;
99            }
100            
101            // Check if binary
102            if self.is_binary(&content) {
103                stats.skipped += 1;
104                continue;
105            }
106            
107            // Generate relative path from source
108            let relative_path = path.strip_prefix(source_path)
109                .unwrap_or(path)
110                .to_string_lossy()
111                .to_string();
112            
113            // Detect language
114            let lang = detect_language(path);
115            
116            // Generate line table
117            let line_table = generate_line_table(&content);
118            
119            // Create frame
120            let header = FileHeader::new(&content, &line_table, lang);
121            let frame = Frame {
122                header,
123                content,
124                line_table,
125            };
126            
127            // Write frame and get handle
128            let handle = path_index.add_path(relative_path.clone());
129            let metadata = writer.write_frame(&frame)?;
130            handles_map.add_handle(handle, metadata);
131            
132            stats.ingested += 1;
133            
134            if stats.ingested % 100 == 0 {
135                println!("Ingested {} files...", stats.ingested);
136            }
137        }
138        
139        // Write updated indexes
140        path_index.write_to_file(&self.collection_path.join("index/path.json"))?;
141        handles_map.write_to_file(&self.collection_path.join("index/handles.json"))?;
142        
143        println!(
144            "Ingestion complete: {} files ingested, {} skipped, {} errors",
145            stats.ingested, stats.skipped, stats.errors
146        );
147        
148        Ok(stats)
149    }
150    
151    fn should_include_file(&self, path: &Path) -> Result<bool> {
152        let path_str = path.to_string_lossy();
153        
154        // Check exclude patterns first
155        for pattern in &self.options.exclude_patterns {
156            if self.glob_match(pattern, &path_str) {
157                return Ok(false);
158            }
159        }
160        
161        // Check include patterns
162        for pattern in &self.options.include_patterns {
163            if self.glob_match(pattern, &path_str) {
164                return Ok(true);
165            }
166        }
167        
168        Ok(false)
169    }
170    
171    fn is_binary(&self, content: &[u8]) -> bool {
172        if content.is_empty() {
173            return false;
174        }
175        
176        let mut non_printable = 0;
177        for &byte in content.iter().take(1024) { // Check first 1KB
178            if byte < 32 && byte != 9 && byte != 10 && byte != 13 {
179                non_printable += 1;
180            }
181        }
182        
183        let ratio = non_printable as f32 / content.len().min(1024) as f32;
184        ratio > self.options.binary_ratio_threshold
185    }
186    
187    fn glob_match(&self, pattern: &str, text: &str) -> bool {
188        // Simple glob matching for MVP
189        if pattern == "**/*" {
190            return true;
191        }
192        
193        if pattern.starts_with("**/") && pattern.ends_with("/**") {
194            let dir_name = &pattern[3..pattern.len()-3];
195            return text.contains(&format!("/{}/", dir_name)) || 
196                   text.starts_with(&format!("{}/", dir_name));
197        }
198        
199        if pattern.starts_with("**/") {
200            let suffix = &pattern[3..];
201            return text.ends_with(suffix);
202        }
203        
204        if pattern.ends_with("/**") {
205            let prefix = &pattern[..pattern.len()-3];
206            return text.starts_with(prefix);
207        }
208        
209        // Handle simple extension patterns like "*.rs"
210        if pattern.starts_with("*.") {
211            let ext = &pattern[1..]; // Include the dot
212            return text.ends_with(ext);
213        }
214        
215        // Handle exact filename matches (check if pattern matches just the filename)
216        if !pattern.contains('/') && !pattern.contains('*') {
217            if let Some(filename) = text.split('/').last() {
218                if filename == pattern {
219                    return true;
220                }
221            }
222        }
223        
224        // Wildcard matching
225        if pattern.contains('*') {
226            return self.wildcard_match(pattern, text);
227        }
228        
229        pattern == text
230    }
231    
232    fn wildcard_match(&self, pattern: &str, text: &str) -> bool {
233        let pattern_chars: Vec<char> = pattern.chars().collect();
234        let text_chars: Vec<char> = text.chars().collect();
235        
236        self.match_recursive(&pattern_chars, &text_chars, 0, 0)
237    }
238    
239    fn match_recursive(&self, pattern: &[char], text: &[char], p_idx: usize, t_idx: usize) -> bool {
240        if p_idx == pattern.len() {
241            return t_idx == text.len();
242        }
243        
244        if pattern[p_idx] == '*' {
245            // Try matching 0 or more characters
246            for i in t_idx..=text.len() {
247                if self.match_recursive(pattern, text, p_idx + 1, i) {
248                    return true;
249                }
250            }
251            false
252        } else if t_idx < text.len() && (pattern[p_idx] == text[t_idx] || pattern[p_idx] == '?') {
253            self.match_recursive(pattern, text, p_idx + 1, t_idx + 1)
254        } else {
255            false
256        }
257    }
258    
259    fn find_next_segment_id(&self, store_path: &Path) -> Result<u32> {
260        let mut max_id = 0;
261        
262        if store_path.exists() {
263            for entry in fs::read_dir(store_path)? {
264                let entry = entry?;
265                let name = entry.file_name();
266                let name_str = name.to_string_lossy();
267                
268                if name_str.starts_with("seg-") && name_str.ends_with(".sift") {
269                    if let Some(id_str) = name_str.strip_prefix("seg-").and_then(|s| s.strip_suffix(".sift")) {
270                        if let Ok(id) = id_str.parse::<u32>() {
271                            max_id = max_id.max(id);
272                        }
273                    }
274                }
275            }
276        }
277        
278        Ok(max_id + 1)
279    }
280}
281
/// Counters describing the outcome of one ingestion run.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IngestStats {
    /// Files that were written to the store.
    pub ingested: u64,
    /// Files rejected by a filter (patterns, size limit or binary check).
    pub skipped: u64,
    /// Entries that could not be read.
    pub errors: u64,
}

289impl IngestStats {
290    pub fn new() -> Self {
291        Self {
292            ingested: 0,
293            skipped: 0,
294            errors: 0,
295        }
296    }
297}
298
299impl Default for IngestStats {
300    fn default() -> Self {
301        Self::new()
302    }
303}