1use std::path::Path;
14
15use roboticus_core::Result;
16use serde::{Deserialize, Serialize};
17use tracing::warn;
18
19use crate::retrieval::{ChunkConfig, chunk_text};
20
/// Kinds of files the ingestion pipeline recognizes.
///
/// A variant is selected from the file extension by `FileType::from_path`;
/// the file contents are never inspected to decide the type.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileType {
    /// Markdown documents (`.md`, `.markdown`).
    Markdown,
    /// Plain text files (`.txt`, `.text`).
    PlainText,
    /// Rust source files (`.rs`).
    RustSource,
    /// Python source files (`.py`).
    PythonSource,
    /// JavaScript source files (`.js`, `.jsx`, `.mjs`).
    JavaScriptSource,
    /// TypeScript source files (`.ts`, `.tsx`, `.mts`).
    TypeScriptSource,
    /// PDF documents (`.pdf`); text is extracted rather than read verbatim.
    Pdf,
}
33
34impl FileType {
35 pub fn from_path(path: &Path) -> Option<Self> {
37 let ext = path.extension()?.to_str()?.to_lowercase();
38 match ext.as_str() {
39 "md" | "markdown" => Some(Self::Markdown),
40 "txt" | "text" => Some(Self::PlainText),
41 "rs" => Some(Self::RustSource),
42 "py" => Some(Self::PythonSource),
43 "js" | "jsx" | "mjs" => Some(Self::JavaScriptSource),
44 "ts" | "tsx" | "mts" => Some(Self::TypeScriptSource),
45 "pdf" => Some(Self::Pdf),
46 _ => None,
47 }
48 }
49
50 pub fn is_code(&self) -> bool {
51 matches!(
52 self,
53 Self::RustSource | Self::PythonSource | Self::JavaScriptSource | Self::TypeScriptSource
54 )
55 }
56
57 pub fn label(&self) -> &'static str {
58 match self {
59 Self::Markdown => "markdown",
60 Self::PlainText => "plain_text",
61 Self::RustSource => "rust",
62 Self::PythonSource => "python",
63 Self::JavaScriptSource => "javascript",
64 Self::TypeScriptSource => "typescript",
65 Self::Pdf => "pdf",
66 }
67 }
68}
69
70pub fn extract_text(path: &Path, file_type: FileType) -> Result<String> {
75 match file_type {
76 FileType::Pdf => extract_pdf_text(path),
77 _ => {
78 let content = std::fs::read_to_string(path).map_err(|e| {
79 roboticus_core::RoboticusError::Config(format!(
80 "failed to read {}: {e}",
81 path.display()
82 ))
83 })?;
84 Ok(content)
85 }
86 }
87}
88
89fn extract_pdf_text(path: &Path) -> Result<String> {
90 let bytes = std::fs::read(path).map_err(|e| {
91 roboticus_core::RoboticusError::Config(format!(
92 "failed to read PDF {}: {e}",
93 path.display()
94 ))
95 })?;
96 let text = pdf_extract::extract_text_from_mem(&bytes).map_err(|e| {
97 roboticus_core::RoboticusError::Config(format!(
98 "failed to extract text from PDF {}: {e}",
99 path.display()
100 ))
101 })?;
102 Ok(text)
103}
104
/// Summary of a single file ingestion, as returned by `ingest_file`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IngestResult {
    /// The path that was passed to `ingest_file`, rendered via `Path::display`.
    pub file_path: String,
    /// File type detected from the path's extension.
    pub file_type: FileType,
    /// Number of chunks for which both the semantic-memory entry and the
    /// embedding entry were stored successfully.
    pub chunks_stored: usize,
    /// Size of the text extracted from the file.
    pub total_chars: usize,
    /// Identifier shared by all embeddings of this file: `ingest:` followed
    /// by the canonicalized path with `/` and `\` replaced by `:`.
    pub source_id: String,
}
115
/// Largest file size, in bytes, that `ingest_file` accepts (10 MiB).
/// Files whose metadata reports a larger size are rejected before being read.
const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
120
121pub fn ingest_file(db: &roboticus_db::Database, path: &Path) -> Result<IngestResult> {
131 let metadata = std::fs::metadata(path).map_err(|e| {
133 roboticus_core::RoboticusError::Config(format!("cannot access {}: {e}", path.display()))
134 })?;
135
136 if !metadata.is_file() {
137 return Err(roboticus_core::RoboticusError::Config(format!(
138 "{} is not a regular file",
139 path.display()
140 )));
141 }
142
143 if metadata.len() > MAX_FILE_SIZE {
144 return Err(roboticus_core::RoboticusError::Config(format!(
145 "{} exceeds maximum file size ({} bytes > {} bytes)",
146 path.display(),
147 metadata.len(),
148 MAX_FILE_SIZE
149 )));
150 }
151
152 let file_type = FileType::from_path(path).ok_or_else(|| {
153 roboticus_core::RoboticusError::Config(format!("unsupported file type: {}", path.display()))
154 })?;
155
156 let text = extract_text(path, file_type)?;
158 let total_chars = text.len();
159
160 if text.trim().is_empty() {
161 return Err(roboticus_core::RoboticusError::Config(format!(
162 "{} contains no extractable text",
163 path.display()
164 )));
165 }
166
167 let config = ChunkConfig::default(); let chunks = chunk_text(&text, &config);
170
171 let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
173 let source_id = format!(
174 "ingest:{}",
175 canonical.to_string_lossy().replace(['/', '\\'], ":")
176 );
177
178 let file_name = path
179 .file_name()
180 .and_then(|n| n.to_str())
181 .unwrap_or("unknown");
182
183 let mut stored = 0;
185 for chunk in &chunks {
186 let chunk_id = format!("{}:chunk:{}", source_id, chunk.index);
187 let preview = if chunk.text.len() > 200 {
188 format!("{}...", &chunk.text[..chunk.text.floor_char_boundary(200)])
189 } else {
190 chunk.text.clone()
191 };
192
193 let category = if file_type.is_code() {
195 "ingested_code"
196 } else {
197 "ingested_document"
198 };
199 let key = format!("{}:{}", file_name, chunk.index);
200
201 if let Err(e) = roboticus_db::memory::store_semantic(db, category, &key, &chunk.text, 0.8) {
202 warn!(error = %e, chunk = chunk.index, "failed to store semantic memory for chunk");
203 continue;
204 }
205
206 let embedding = roboticus_llm::fallback_embedding(&chunk.text);
210 if let Err(e) = roboticus_db::embeddings::store_embedding(
211 db,
212 &chunk_id,
213 "ingested_knowledge",
214 &source_id,
215 &preview,
216 &embedding,
217 ) {
218 warn!(error = %e, chunk = chunk.index, "failed to store embedding entry for chunk");
219 continue;
220 }
221
222 stored += 1;
223 }
224
225 let description = format!(
227 "Ingested {} ({}, {} chunks)",
228 file_name,
229 file_type.label(),
230 stored
231 );
232 if let Err(e) = roboticus_db::hippocampus::register_table(
233 db,
234 &format!("knowledge:{}", file_name),
235 &description,
236 &[], "system", false, "read", stored as i64,
241 ) {
242 warn!(error = %e, "failed to register ingested document in hippocampus");
243 }
244
245 Ok(IngestResult {
246 file_path: path.display().to_string(),
247 file_type,
248 chunks_stored: stored,
249 total_chars,
250 source_id,
251 })
252}
253
254pub fn ingest_directory(db: &roboticus_db::Database, dir: &Path) -> Result<Vec<IngestResult>> {
256 if !dir.is_dir() {
257 return Err(roboticus_core::RoboticusError::Config(format!(
258 "{} is not a directory",
259 dir.display()
260 )));
261 }
262
263 let mut results = Vec::new();
264 let entries = std::fs::read_dir(dir).map_err(|e| {
265 roboticus_core::RoboticusError::Config(format!(
266 "cannot read directory {}: {e}",
267 dir.display()
268 ))
269 })?;
270
271 for entry in entries.flatten() {
272 let path = entry.path();
273 if path.is_file() && FileType::from_path(&path).is_some() {
274 match ingest_file(db, &path) {
275 Ok(result) => results.push(result),
276 Err(e) => {
277 warn!(
278 error = %e,
279 file = %path.display(),
280 "skipping file during directory ingestion"
281 );
282 }
283 }
284 }
285 }
286
287 Ok(results)
288}
289
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Builds an in-memory database with the schema initialized.
    fn test_db() -> roboticus_db::Database {
        let database = roboticus_db::Database::new(":memory:").unwrap();
        roboticus_db::schema::initialize_db(&database).unwrap();
        database
    }

    #[test]
    fn file_type_detection() {
        // Each file name is paired with the type its extension should map to.
        let expectations = [
            ("readme.md", Some(FileType::Markdown)),
            ("main.rs", Some(FileType::RustSource)),
            ("app.tsx", Some(FileType::TypeScriptSource)),
            ("doc.pdf", Some(FileType::Pdf)),
            ("image.png", None),
            ("archive.zip", None),
        ];

        for (name, expected) in expectations {
            assert_eq!(FileType::from_path(Path::new(name)), expected);
        }
    }

    #[test]
    fn ingest_markdown_file() {
        let db = test_db();
        let workspace = tempfile::tempdir().unwrap();
        let md_path = workspace.path().join("test.md");

        let mut handle = std::fs::File::create(&md_path).unwrap();
        writeln!(
            handle,
            "# Test Document\n\nThis is a test document with enough content to be meaningful."
        )
        .unwrap();
        writeln!(
            handle,
            "\n## Section Two\n\nMore content here for the chunker to work with."
        )
        .unwrap();
        // Close the file before it is read back by the ingestion code.
        drop(handle);

        let outcome = ingest_file(&db, &md_path).unwrap();
        assert_eq!(outcome.file_type, FileType::Markdown);
        assert!(outcome.chunks_stored > 0);
        assert!(outcome.total_chars > 50);
        assert!(outcome.source_id.starts_with("ingest:"));
    }

    #[test]
    fn ingest_code_file() {
        let db = test_db();
        let workspace = tempfile::tempdir().unwrap();
        let rs_path = workspace.path().join("example.rs");

        let mut handle = std::fs::File::create(&rs_path).unwrap();
        writeln!(handle, "fn main() {{").unwrap();
        writeln!(handle, " println!(\"Hello, world!\");").unwrap();
        writeln!(handle, "}}").unwrap();
        drop(handle);

        let outcome = ingest_file(&db, &rs_path).unwrap();
        assert_eq!(outcome.file_type, FileType::RustSource);
        // A file this small is expected to fit in a single chunk.
        assert_eq!(outcome.chunks_stored, 1);
    }

    #[test]
    fn ingest_empty_file_fails() {
        let db = test_db();
        let workspace = tempfile::tempdir().unwrap();
        let empty_path = workspace.path().join("empty.txt");
        std::fs::File::create(&empty_path).unwrap();

        let failure = ingest_file(&db, &empty_path).unwrap_err();
        assert!(failure.to_string().contains("no extractable text"));
    }

    #[test]
    fn ingest_unsupported_extension_fails() {
        let db = test_db();
        let workspace = tempfile::tempdir().unwrap();
        let png_path = workspace.path().join("photo.png");
        std::fs::write(&png_path, b"fake png data").unwrap();

        let failure = ingest_file(&db, &png_path).unwrap_err();
        assert!(failure.to_string().contains("unsupported file type"));
    }

    #[test]
    fn ingest_directory_collects_supported_files() {
        let db = test_db();
        let workspace = tempfile::tempdir().unwrap();
        let root = workspace.path();

        // Two supported files plus one that must be ignored.
        std::fs::write(root.join("a.md"), "# Doc A\nSome markdown content here.").unwrap();
        std::fs::write(root.join("b.txt"), "Plain text content for ingestion.").unwrap();
        std::fs::write(root.join("c.png"), b"fake image").unwrap();

        let ingested = ingest_directory(&db, root).unwrap();
        assert_eq!(ingested.len(), 2);
    }

    #[test]
    fn hippocampus_registration_after_ingest() {
        let db = test_db();
        let workspace = tempfile::tempdir().unwrap();
        let notes_path = workspace.path().join("notes.md");
        std::fs::write(&notes_path, "# My Notes\nImportant information.").unwrap();

        ingest_file(&db, &notes_path).unwrap();

        let registered = roboticus_db::hippocampus::list_tables(&db).unwrap();
        let is_registered = registered
            .iter()
            .any(|table| table.table_name == "knowledge:notes.md");
        assert!(
            is_registered,
            "ingested document should be registered in hippocampus"
        );
    }

    #[test]
    fn ingest_stores_real_embeddings_for_chunks() {
        let db = test_db();
        let workspace = tempfile::tempdir().unwrap();
        let doc_path = workspace.path().join("embeddings.md");
        std::fs::write(
            &doc_path,
            "# Embeddings\nThis document should create a non-empty deterministic embedding.",
        )
        .unwrap();

        let outcome = ingest_file(&db, &doc_path).unwrap();
        assert!(outcome.chunks_stored > 0);

        // Every stored chunk should have an embedding row with at least one dimension.
        let connection = db.conn();
        let (row_count, smallest_dimensions): (i64, i64) = connection
            .query_row(
                "SELECT COUNT(*), COALESCE(MIN(dimensions), 0)
                 FROM embeddings
                 WHERE source_table = 'ingested_knowledge' AND source_id = ?1",
                [&outcome.source_id],
                |row| Ok((row.get(0)?, row.get(1)?)),
            )
            .unwrap();
        assert_eq!(row_count, outcome.chunks_stored as i64);
        assert!(
            smallest_dimensions > 0,
            "ingested embeddings should not be empty"
        );
    }
}