1use std::path::Path;
2use std::fs;
3use std::io::Read;
4use std::collections::HashMap;
5use ignore::WalkBuilder;
6use anyhow::Result;
7use crate::types::*;
8use crate::storage::{SegmentWriter, generate_line_table};
9use crate::index::{PathIndex, HandlesMap};
10
/// Filtering rules applied to every file considered for ingestion.
#[derive(Debug, Clone)]
pub struct IngestOptions {
    /// Glob patterns a file must match (at least one) to be ingested.
    pub include_patterns: Vec<String>,
    /// Glob patterns that reject a file; these are checked before the include patterns.
    pub exclude_patterns: Vec<String>,
    /// Files larger than this many bytes are skipped.
    pub max_file_bytes: u64,
    /// Fraction of control bytes (excluding tab, LF and CR) in the sampled prefix of a
    /// file above which the file is treated as binary and skipped.
    pub binary_ratio_threshold: f32,
}
17
18impl Default for IngestOptions {
19 fn default() -> Self {
20 Self {
21 include_patterns: vec!["**/*".to_string()],
22 exclude_patterns: vec![
23 "**/target/**".to_string(),
24 "**/node_modules/**".to_string(),
25 "**/.git/**".to_string(),
26 "**/build/**".to_string(),
27 "**/dist/**".to_string(),
28 ],
29 max_file_bytes: 10 * 1024 * 1024, binary_ratio_threshold: 0.3, }
32 }
33}
34
35pub struct Ingester {
36 collection_path: std::path::PathBuf,
37 options: IngestOptions,
38}
39
40impl Ingester {
41 pub fn new(collection_path: std::path::PathBuf, options: IngestOptions) -> Self {
42 Self {
43 collection_path,
44 options,
45 }
46 }
47
48 pub fn ingest_from_fs(&mut self, source_path: &Path) -> Result<IngestStats> {
49 let mut stats = IngestStats::default();
50 let mut ingested_content = HashMap::new();
51 let mut path_mappings = HashMap::new();
52 let mut handle_metadata = HashMap::new();
53
54 let mut path_index = if self.collection_path.join("index/path.json").exists() {
56 PathIndex::read_from_file(&self.collection_path.join("index/path.json"))?
57 } else {
58 PathIndex::new()
59 };
60 let mut handles_map = if self.collection_path.join("index/handles.json").exists() {
61 HandlesMap::read_from_file(&self.collection_path.join("index/handles.json"))?
62 } else {
63 HandlesMap::new()
64 };
65
66 let store_path = self.collection_path.join("store");
68 let seg_id = self.find_next_segment_id(&store_path)?;
69 let mut writer = SegmentWriter::new(&store_path, seg_id)?;
70
71 let walker = WalkBuilder::new(source_path)
73 .hidden(false) .git_ignore(true)
75 .git_global(true)
76 .git_exclude(true)
77 .build();
78
79 for entry in walker {
80 let entry = entry?;
81 let path = entry.path();
82
83 if path.is_dir() {
85 continue;
86 }
87
88 if !self.should_include_file(path)? {
90 stats.skipped += 1;
91 continue;
92 }
93
94 let content = match fs::read(path) {
96 Ok(content) => content,
97 Err(e) => {
98 eprintln!("Warning: Failed to read {}: {}", path.display(), e);
99 stats.errors += 1;
100 continue;
101 }
102 };
103
104 if content.len() > self.options.max_file_bytes as usize {
106 stats.skipped += 1;
107 continue;
108 }
109
110 if self.is_binary(&content) {
112 stats.skipped += 1;
113 continue;
114 }
115
116 let relative_path = path.strip_prefix(source_path)
118 .unwrap_or(path)
119 .to_string_lossy()
120 .to_string();
121
122 let lang = detect_language(path);
124
125 let line_table = generate_line_table(&content);
127
128 let header = FileHeader::new(&content, &line_table, lang);
130 let frame = Frame {
131 header,
132 content: content.clone(),
133 line_table,
134 };
135
136 let handle = path_index.add_path(relative_path.clone());
138 let metadata = writer.write_frame(&frame)?;
139 handles_map.add_handle(handle, metadata.clone());
140
141 ingested_content.insert(handle, content);
142 path_mappings.insert(relative_path, handle);
143 handle_metadata.insert(handle, metadata);
144
145 stats.ingested += 1;
146
147 if stats.ingested % 100 == 0 {
148 println!("Ingested {} files...", stats.ingested);
149 }
150 }
151
152 let mut path_index = PathIndex::new();
154 for (path, handle) in path_mappings {
155 path_index.paths.insert(path, handle);
156 }
157
158 let mut handles_map = HandlesMap::new();
159 for (handle, metadata) in handle_metadata {
160 handles_map.add_handle(handle, metadata);
161 }
162
163 path_index.write_to_file(&self.collection_path.join("index/path.json"))?;
164 handles_map.write_to_file(&self.collection_path.join("index/handles.json"))?;
165
166 println!("Building inverted index for O(1) search...");
168 let mut file_contents = HashMap::new();
169
170 for (file_handle, content) in &ingested_content {
171 if let Ok(content_str) = String::from_utf8(content.clone()) {
172 file_contents.insert(*file_handle as u32, content_str);
173 }
174 }
175
176 if !file_contents.is_empty() {
177 let inverted_index = crate::inverted_index::InvertedIndex::build_from_content(
178 file_contents,
179 &self.collection_path.join("index/terms.fst"),
180 &self.collection_path.join("index/posting_lists.json")
181 )?;
182 println!("✓ Inverted index built with {} terms", inverted_index.term_count());
183 }
184
185 println!(
186 "Ingestion complete: {} files ingested, {} skipped, {} errors",
187 stats.ingested, stats.skipped, stats.errors
188 );
189
190 Ok(stats)
191 }
192
193 fn should_include_file(&self, path: &Path) -> Result<bool> {
194 let path_str = path.to_string_lossy();
195
196 for pattern in &self.options.exclude_patterns {
198 if self.glob_match(pattern, &path_str) {
199 return Ok(false);
200 }
201 }
202
203 for pattern in &self.options.include_patterns {
205 if self.glob_match(pattern, &path_str) {
206 return Ok(true);
207 }
208 }
209
210 Ok(false)
211 }
212
213 fn is_binary(&self, content: &[u8]) -> bool {
214 if content.is_empty() {
215 return false;
216 }
217
218 let mut non_printable = 0;
219 for &byte in content.iter().take(1024) { if byte < 32 && byte != 9 && byte != 10 && byte != 13 {
221 non_printable += 1;
222 }
223 }
224
225 let ratio = non_printable as f32 / content.len().min(1024) as f32;
226 ratio > self.options.binary_ratio_threshold
227 }
228
229 fn glob_match(&self, pattern: &str, text: &str) -> bool {
230 if pattern == "**/*" {
232 return true;
233 }
234
235 if pattern.starts_with("**/") {
237 let suffix = &pattern[3..];
238 if suffix.starts_with("*.") {
239 let ext = &suffix[1..]; return text.ends_with(ext);
242 } else {
243 return text.ends_with(suffix);
244 }
245 }
246
247 if pattern.starts_with("**/") && pattern.ends_with("/**") {
248 let dir_name = &pattern[3..pattern.len()-3];
249 return text.contains(&format!("/{}/", dir_name)) ||
250 text.starts_with(&format!("{}/", dir_name));
251 }
252
253 if pattern.ends_with("/**") {
254 let prefix = &pattern[..pattern.len()-3];
255 return text.starts_with(prefix);
256 }
257
258 if pattern.starts_with("*.") {
260 let ext = &pattern[1..]; return text.ends_with(ext);
262 }
263
264 if !pattern.contains('/') && !pattern.contains('*') {
266 if let Some(filename) = text.split('/').last() {
267 if filename == pattern {
268 return true;
269 }
270 }
271 }
272
273 if pattern.contains('*') {
275 return self.wildcard_match(pattern, text);
276 }
277
278 pattern == text
279 }
280
281 fn wildcard_match(&self, pattern: &str, text: &str) -> bool {
282 let pattern_chars: Vec<char> = pattern.chars().collect();
283 let text_chars: Vec<char> = text.chars().collect();
284
285 self.match_recursive(&pattern_chars, &text_chars, 0, 0)
286 }
287
288 fn match_recursive(&self, pattern: &[char], text: &[char], p_idx: usize, t_idx: usize) -> bool {
289 if p_idx == pattern.len() {
290 return t_idx == text.len();
291 }
292
293 if pattern[p_idx] == '*' {
294 for i in t_idx..=text.len() {
296 if self.match_recursive(pattern, text, p_idx + 1, i) {
297 return true;
298 }
299 }
300 false
301 } else if t_idx < text.len() && (pattern[p_idx] == text[t_idx] || pattern[p_idx] == '?') {
302 self.match_recursive(pattern, text, p_idx + 1, t_idx + 1)
303 } else {
304 false
305 }
306 }
307
308 fn find_next_segment_id(&self, store_path: &Path) -> Result<u32> {
309 let mut max_id = 0;
310
311 if store_path.exists() {
312 for entry in fs::read_dir(store_path)? {
313 let entry = entry?;
314 let name = entry.file_name();
315 let name_str = name.to_string_lossy();
316
317 if name_str.starts_with("seg-") && name_str.ends_with(".sift") {
318 if let Some(id_str) = name_str.strip_prefix("seg-").and_then(|s| s.strip_suffix(".sift")) {
319 if let Ok(id) = id_str.parse::<u32>() {
320 max_id = max_id.max(id);
321 }
322 }
323 }
324 }
325 }
326
327 Ok(max_id + 1)
328 }
329}
330
/// Counters describing the outcome of one ingestion run.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct IngestStats {
    /// Number of files written to the segment store.
    pub ingested: u64,
    /// Number of files rejected by the pattern filters, the size limit or the binary check.
    pub skipped: u64,
    /// Number of files that could not be read.
    pub errors: u64,
}

impl IngestStats {
    /// Returns statistics with every counter at zero (same as `IngestStats::default()`).
    pub fn new() -> Self {
        Self::default()
    }
}