1use crate::types::{Frame, FileHeader, detect_language};
2use crate::storage::{SegmentWriter, generate_line_table};
3use crate::index::{PathIndex, HandlesMap};
4use anyhow::Result;
5use ignore::WalkBuilder;
6use std::fs;
7use std::path::Path;
8
/// Filters that decide which files an ingestion run accepts.
///
/// `Debug` and `Clone` are derived so the options can be logged and reused
/// across runs, matching the derives on `IngestStats`.
#[derive(Debug, Clone)]
pub struct IngestOptions {
    /// Glob patterns selecting files to ingest; a file must match at least one.
    pub include_patterns: Vec<String>,
    /// Glob patterns for files to leave out. These are checked before
    /// `include_patterns`, so an exclude match always wins.
    pub exclude_patterns: Vec<String>,
    /// Files whose content is larger than this many bytes are skipped.
    pub max_file_bytes: u64,
    /// A file is treated as binary (and skipped) when the fraction of control
    /// bytes in its leading sample (up to 1024 bytes) exceeds this value.
    pub binary_ratio_threshold: f32,
}
15
16impl Default for IngestOptions {
17 fn default() -> Self {
18 Self {
19 include_patterns: vec!["**/*".to_string()],
20 exclude_patterns: vec![
21 "**/target/**".to_string(),
22 "**/node_modules/**".to_string(),
23 "**/.git/**".to_string(),
24 "**/build/**".to_string(),
25 "**/dist/**".to_string(),
26 ],
27 max_file_bytes: 10 * 1024 * 1024, binary_ratio_threshold: 0.3, }
30 }
31}
32
33pub struct Ingester {
34 collection_path: std::path::PathBuf,
35 options: IngestOptions,
36}
37
38impl Ingester {
39 pub fn new(collection_path: std::path::PathBuf, options: IngestOptions) -> Self {
40 Self {
41 collection_path,
42 options,
43 }
44 }
45
46 pub fn ingest_from_fs(&mut self, source_path: &Path) -> Result<IngestStats> {
47 let mut stats = IngestStats::new();
48
49 let mut path_index = PathIndex::read_from_file(
51 &self.collection_path.join("index/path.json")
52 )?;
53 let mut handles_map = HandlesMap::read_from_file(
54 &self.collection_path.join("index/handles.json")
55 )?;
56
57 let store_path = self.collection_path.join("store");
59 let seg_id = self.find_next_segment_id(&store_path)?;
60 let mut writer = SegmentWriter::new(&store_path, seg_id)?;
61
62 let walker = WalkBuilder::new(source_path)
64 .hidden(false) .git_ignore(true)
66 .git_global(true)
67 .git_exclude(true)
68 .build();
69
70 for entry in walker {
71 let entry = entry?;
72 let path = entry.path();
73
74 if path.is_dir() {
76 continue;
77 }
78
79 if !self.should_include_file(path)? {
81 stats.skipped += 1;
82 continue;
83 }
84
85 let content = match fs::read(path) {
87 Ok(content) => content,
88 Err(e) => {
89 eprintln!("Warning: Failed to read {}: {}", path.display(), e);
90 stats.errors += 1;
91 continue;
92 }
93 };
94
95 if content.len() > self.options.max_file_bytes as usize {
97 stats.skipped += 1;
98 continue;
99 }
100
101 if self.is_binary(&content) {
103 stats.skipped += 1;
104 continue;
105 }
106
107 let relative_path = path.strip_prefix(source_path)
109 .unwrap_or(path)
110 .to_string_lossy()
111 .to_string();
112
113 let lang = detect_language(path);
115
116 let line_table = generate_line_table(&content);
118
119 let header = FileHeader::new(&content, &line_table, lang);
121 let frame = Frame {
122 header,
123 content,
124 line_table,
125 };
126
127 let handle = path_index.add_path(relative_path.clone());
129 let metadata = writer.write_frame(&frame)?;
130 handles_map.add_handle(handle, metadata);
131
132 stats.ingested += 1;
133
134 if stats.ingested % 100 == 0 {
135 println!("Ingested {} files...", stats.ingested);
136 }
137 }
138
139 path_index.write_to_file(&self.collection_path.join("index/path.json"))?;
141 handles_map.write_to_file(&self.collection_path.join("index/handles.json"))?;
142
143 println!(
144 "Ingestion complete: {} files ingested, {} skipped, {} errors",
145 stats.ingested, stats.skipped, stats.errors
146 );
147
148 Ok(stats)
149 }
150
151 fn should_include_file(&self, path: &Path) -> Result<bool> {
152 let path_str = path.to_string_lossy();
153
154 for pattern in &self.options.exclude_patterns {
156 if self.glob_match(pattern, &path_str) {
157 return Ok(false);
158 }
159 }
160
161 for pattern in &self.options.include_patterns {
163 if self.glob_match(pattern, &path_str) {
164 return Ok(true);
165 }
166 }
167
168 Ok(false)
169 }
170
171 fn is_binary(&self, content: &[u8]) -> bool {
172 if content.is_empty() {
173 return false;
174 }
175
176 let mut non_printable = 0;
177 for &byte in content.iter().take(1024) { if byte < 32 && byte != 9 && byte != 10 && byte != 13 {
179 non_printable += 1;
180 }
181 }
182
183 let ratio = non_printable as f32 / content.len().min(1024) as f32;
184 ratio > self.options.binary_ratio_threshold
185 }
186
187 fn glob_match(&self, pattern: &str, text: &str) -> bool {
188 if pattern == "**/*" {
190 return true;
191 }
192
193 if pattern.starts_with("**/") && pattern.ends_with("/**") {
194 let dir_name = &pattern[3..pattern.len()-3];
195 return text.contains(&format!("/{}/", dir_name)) ||
196 text.starts_with(&format!("{}/", dir_name));
197 }
198
199 if pattern.starts_with("**/") {
200 let suffix = &pattern[3..];
201 return text.ends_with(suffix);
202 }
203
204 if pattern.ends_with("/**") {
205 let prefix = &pattern[..pattern.len()-3];
206 return text.starts_with(prefix);
207 }
208
209 if pattern.starts_with("*.") {
211 let ext = &pattern[1..]; return text.ends_with(ext);
213 }
214
215 if !pattern.contains('/') && !pattern.contains('*') {
217 if let Some(filename) = text.split('/').last() {
218 if filename == pattern {
219 return true;
220 }
221 }
222 }
223
224 if pattern.contains('*') {
226 return self.wildcard_match(pattern, text);
227 }
228
229 pattern == text
230 }
231
232 fn wildcard_match(&self, pattern: &str, text: &str) -> bool {
233 let pattern_chars: Vec<char> = pattern.chars().collect();
234 let text_chars: Vec<char> = text.chars().collect();
235
236 self.match_recursive(&pattern_chars, &text_chars, 0, 0)
237 }
238
239 fn match_recursive(&self, pattern: &[char], text: &[char], p_idx: usize, t_idx: usize) -> bool {
240 if p_idx == pattern.len() {
241 return t_idx == text.len();
242 }
243
244 if pattern[p_idx] == '*' {
245 for i in t_idx..=text.len() {
247 if self.match_recursive(pattern, text, p_idx + 1, i) {
248 return true;
249 }
250 }
251 false
252 } else if t_idx < text.len() && (pattern[p_idx] == text[t_idx] || pattern[p_idx] == '?') {
253 self.match_recursive(pattern, text, p_idx + 1, t_idx + 1)
254 } else {
255 false
256 }
257 }
258
259 fn find_next_segment_id(&self, store_path: &Path) -> Result<u32> {
260 let mut max_id = 0;
261
262 if store_path.exists() {
263 for entry in fs::read_dir(store_path)? {
264 let entry = entry?;
265 let name = entry.file_name();
266 let name_str = name.to_string_lossy();
267
268 if name_str.starts_with("seg-") && name_str.ends_with(".sift") {
269 if let Some(id_str) = name_str.strip_prefix("seg-").and_then(|s| s.strip_suffix(".sift")) {
270 if let Ok(id) = id_str.parse::<u32>() {
271 max_id = max_id.max(id);
272 }
273 }
274 }
275 }
276 }
277
278 Ok(max_id + 1)
279 }
280}
281
/// Counters describing the outcome of one ingestion run.
///
/// `PartialEq`/`Eq` are derived so results can be compared directly, e.g. in
/// assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IngestStats {
    /// Number of files written to the store.
    pub ingested: u64,
    /// Number of files passed over: rejected by the patterns, larger than the
    /// size limit, or detected as binary.
    pub skipped: u64,
    /// Number of files that could not be read.
    pub errors: u64,
}
288
289impl IngestStats {
290 pub fn new() -> Self {
291 Self {
292 ingested: 0,
293 skipped: 0,
294 errors: 0,
295 }
296 }
297}
298
299impl Default for IngestStats {
300 fn default() -> Self {
301 Self::new()
302 }
303}