Skip to main content

roboticus_agent/
ingest.rs

1//! Document ingestion pipeline: file -> parse -> chunk -> embed -> store.
2//!
3//! Supports `.md`, `.txt`, `.rs`, `.py`, `.js`, `.ts`, `.pdf` files.
4//! PDF parsing uses the `pdf-extract` crate (pure Rust, no C dependencies).
5//!
6//! The pipeline:
7//! 1. Detect file type by extension
8//! 2. Extract raw text (plain-text passthrough, or PDF text extraction)
9//! 3. Chunk using existing `ChunkConfig` (512 tokens, 64-token overlap)
10//! 4. Store each chunk as semantic memory + embedding entry
11//! 5. Register the document as a knowledge source in hippocampus
12
13use std::path::Path;
14
15use roboticus_core::Result;
16use serde::{Deserialize, Serialize};
17use tracing::warn;
18
19use crate::retrieval::{ChunkConfig, chunk_text};
20
21// ── File type detection ────────────────────────────────────────
22
/// Kinds of files the ingestion pipeline accepts.
///
/// A value is normally obtained from a path's extension via
/// [`FileType::from_path`]; the extensions listed on each variant are the ones
/// that function recognizes (compared case-insensitively).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileType {
    /// Markdown document (`.md`, `.markdown`).
    Markdown,
    /// Plain-text document (`.txt`, `.text`).
    PlainText,
    /// Rust source file (`.rs`).
    RustSource,
    /// Python source file (`.py`).
    PythonSource,
    /// JavaScript source file (`.js`, `.jsx`, `.mjs`).
    JavaScriptSource,
    /// TypeScript source file (`.ts`, `.tsx`, `.mts`).
    TypeScriptSource,
    /// PDF document (`.pdf`).
    Pdf,
}
33
34impl FileType {
35    /// Detect file type from extension. Returns `None` for unsupported types.
36    pub fn from_path(path: &Path) -> Option<Self> {
37        let ext = path.extension()?.to_str()?.to_lowercase();
38        match ext.as_str() {
39            "md" | "markdown" => Some(Self::Markdown),
40            "txt" | "text" => Some(Self::PlainText),
41            "rs" => Some(Self::RustSource),
42            "py" => Some(Self::PythonSource),
43            "js" | "jsx" | "mjs" => Some(Self::JavaScriptSource),
44            "ts" | "tsx" | "mts" => Some(Self::TypeScriptSource),
45            "pdf" => Some(Self::Pdf),
46            _ => None,
47        }
48    }
49
50    pub fn is_code(&self) -> bool {
51        matches!(
52            self,
53            Self::RustSource | Self::PythonSource | Self::JavaScriptSource | Self::TypeScriptSource
54        )
55    }
56
57    pub fn label(&self) -> &'static str {
58        match self {
59            Self::Markdown => "markdown",
60            Self::PlainText => "plain_text",
61            Self::RustSource => "rust",
62            Self::PythonSource => "python",
63            Self::JavaScriptSource => "javascript",
64            Self::TypeScriptSource => "typescript",
65            Self::Pdf => "pdf",
66        }
67    }
68}
69
70// ── Text extraction ────────────────────────────────────────────
71
72/// Extract raw text from a file. For text-based formats, reads UTF-8 content
73/// directly. For PDF, extracts text using pdf-extract.
74pub fn extract_text(path: &Path, file_type: FileType) -> Result<String> {
75    match file_type {
76        FileType::Pdf => extract_pdf_text(path),
77        _ => {
78            let content = std::fs::read_to_string(path).map_err(|e| {
79                roboticus_core::RoboticusError::Config(format!(
80                    "failed to read {}: {e}",
81                    path.display()
82                ))
83            })?;
84            Ok(content)
85        }
86    }
87}
88
89fn extract_pdf_text(path: &Path) -> Result<String> {
90    let bytes = std::fs::read(path).map_err(|e| {
91        roboticus_core::RoboticusError::Config(format!(
92            "failed to read PDF {}: {e}",
93            path.display()
94        ))
95    })?;
96    let text = pdf_extract::extract_text_from_mem(&bytes).map_err(|e| {
97        roboticus_core::RoboticusError::Config(format!(
98            "failed to extract text from PDF {}: {e}",
99            path.display()
100        ))
101    })?;
102    Ok(text)
103}
104
105// ── Ingestion result ───────────────────────────────────────────
106
/// Summary of one successful `ingest_file` run.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IngestResult {
    /// The ingested path, rendered with `Path::display`.
    pub file_path: String,
    /// File type detected from the path's extension.
    pub file_type: FileType,
    /// Number of chunks for which both the semantic-memory write and the
    /// embedding write succeeded.
    pub chunks_stored: usize,
    /// Size of the extracted text, as measured by `ingest_file`.
    pub total_chars: usize,
    /// `ingest:`-prefixed identifier derived from the (canonicalized, when
    /// possible) path, with `/` and `\` replaced by `:`.
    pub source_id: String,
}
115
116// ── Pipeline ───────────────────────────────────────────────────
117
/// Maximum file size we'll ingest: 10 MiB (10 * 1024 * 1024 bytes).
/// Prevents OOM on giant files.
///
/// `ingest_file` compares this against the on-disk size reported by the
/// file's metadata and rejects anything strictly larger.
const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
120
121/// Ingest a single file into the knowledge system.
122///
123/// Steps:
124/// 1. Validate file exists and is within size limits
125/// 2. Detect file type
126/// 3. Extract text
127/// 4. Chunk with standard config (512 tokens, 64-token overlap)
128/// 5. Store each chunk as semantic memory + embedding entry
129/// 6. Register in hippocampus as a knowledge source
130pub fn ingest_file(db: &roboticus_db::Database, path: &Path) -> Result<IngestResult> {
131    // Validate
132    let metadata = std::fs::metadata(path).map_err(|e| {
133        roboticus_core::RoboticusError::Config(format!("cannot access {}: {e}", path.display()))
134    })?;
135
136    if !metadata.is_file() {
137        return Err(roboticus_core::RoboticusError::Config(format!(
138            "{} is not a regular file",
139            path.display()
140        )));
141    }
142
143    if metadata.len() > MAX_FILE_SIZE {
144        return Err(roboticus_core::RoboticusError::Config(format!(
145            "{} exceeds maximum file size ({} bytes > {} bytes)",
146            path.display(),
147            metadata.len(),
148            MAX_FILE_SIZE
149        )));
150    }
151
152    let file_type = FileType::from_path(path).ok_or_else(|| {
153        roboticus_core::RoboticusError::Config(format!("unsupported file type: {}", path.display()))
154    })?;
155
156    // Extract text
157    let text = extract_text(path, file_type)?;
158    let total_chars = text.len();
159
160    if text.trim().is_empty() {
161        return Err(roboticus_core::RoboticusError::Config(format!(
162            "{} contains no extractable text",
163            path.display()
164        )));
165    }
166
167    // Chunk
168    let config = ChunkConfig::default(); // 512 tokens, 64 overlap
169    let chunks = chunk_text(&text, &config);
170
171    // Generate a stable source ID from the file path
172    let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
173    let source_id = format!(
174        "ingest:{}",
175        canonical.to_string_lossy().replace(['/', '\\'], ":")
176    );
177
178    let file_name = path
179        .file_name()
180        .and_then(|n| n.to_str())
181        .unwrap_or("unknown");
182
183    // Store each chunk
184    let mut stored = 0;
185    for chunk in &chunks {
186        let chunk_id = format!("{}:chunk:{}", source_id, chunk.index);
187        let preview = if chunk.text.len() > 200 {
188            format!("{}...", &chunk.text[..chunk.text.floor_char_boundary(200)])
189        } else {
190            chunk.text.clone()
191        };
192
193        // Store in semantic memory for FTS5 retrieval
194        let category = if file_type.is_code() {
195            "ingested_code"
196        } else {
197            "ingested_document"
198        };
199        let key = format!("{}:{}", file_name, chunk.index);
200
201        if let Err(e) = roboticus_db::memory::store_semantic(db, category, &key, &chunk.text, 0.8) {
202            warn!(error = %e, chunk = chunk.index, "failed to store semantic memory for chunk");
203            continue;
204        }
205
206        // Persist a real deterministic embedding immediately so ingested
207        // knowledge participates in vector search without waiting for a
208        // follow-up backfill job.
209        let embedding = roboticus_llm::fallback_embedding(&chunk.text);
210        if let Err(e) = roboticus_db::embeddings::store_embedding(
211            db,
212            &chunk_id,
213            "ingested_knowledge",
214            &source_id,
215            &preview,
216            &embedding,
217        ) {
218            warn!(error = %e, chunk = chunk.index, "failed to store embedding entry for chunk");
219            continue;
220        }
221
222        stored += 1;
223    }
224
225    // Register in hippocampus as a knowledge source
226    let description = format!(
227        "Ingested {} ({}, {} chunks)",
228        file_name,
229        file_type.label(),
230        stored
231    );
232    if let Err(e) = roboticus_db::hippocampus::register_table(
233        db,
234        &format!("knowledge:{}", file_name),
235        &description,
236        &[],      // no column schema — knowledge sources aren't relational tables
237        "system", // created_by
238        false,    // not agent-owned — system knowledge
239        "read",   // access_level
240        stored as i64,
241    ) {
242        warn!(error = %e, "failed to register ingested document in hippocampus");
243    }
244
245    Ok(IngestResult {
246        file_path: path.display().to_string(),
247        file_type,
248        chunks_stored: stored,
249        total_chars,
250        source_id,
251    })
252}
253
254/// Ingest all supported files in a directory (non-recursive).
255pub fn ingest_directory(db: &roboticus_db::Database, dir: &Path) -> Result<Vec<IngestResult>> {
256    if !dir.is_dir() {
257        return Err(roboticus_core::RoboticusError::Config(format!(
258            "{} is not a directory",
259            dir.display()
260        )));
261    }
262
263    let mut results = Vec::new();
264    let entries = std::fs::read_dir(dir).map_err(|e| {
265        roboticus_core::RoboticusError::Config(format!(
266            "cannot read directory {}: {e}",
267            dir.display()
268        ))
269    })?;
270
271    for entry in entries.flatten() {
272        let path = entry.path();
273        if path.is_file() && FileType::from_path(&path).is_some() {
274            match ingest_file(db, &path) {
275                Ok(result) => results.push(result),
276                Err(e) => {
277                    warn!(
278                        error = %e,
279                        file = %path.display(),
280                        "skipping file during directory ingestion"
281                    );
282                }
283            }
284        }
285    }
286
287    Ok(results)
288}
289
290// ── Tests ──────────────────────────────────────────────────────
291
// Unit tests for file-type detection and the ingestion pipeline. Each pipeline
// test runs against a fresh in-memory database and files in a temp directory.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Create an in-memory database and initialize its schema.
    fn test_db() -> roboticus_db::Database {
        let db = roboticus_db::Database::new(":memory:").unwrap();
        roboticus_db::schema::initialize_db(&db).unwrap();
        db
    }

    /// Supported extensions map to their variant; unsupported ones yield `None`.
    #[test]
    fn file_type_detection() {
        assert_eq!(
            FileType::from_path(Path::new("readme.md")),
            Some(FileType::Markdown)
        );
        assert_eq!(
            FileType::from_path(Path::new("main.rs")),
            Some(FileType::RustSource)
        );
        assert_eq!(
            FileType::from_path(Path::new("app.tsx")),
            Some(FileType::TypeScriptSource)
        );
        assert_eq!(
            FileType::from_path(Path::new("doc.pdf")),
            Some(FileType::Pdf)
        );
        assert_eq!(FileType::from_path(Path::new("image.png")), None);
        assert_eq!(FileType::from_path(Path::new("archive.zip")), None);
    }

    /// A small markdown file ingests successfully and reports sane metadata.
    #[test]
    fn ingest_markdown_file() {
        let db = test_db();
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("test.md");
        // Inner scope so the file handle is dropped (closed) before ingestion.
        {
            let mut f = std::fs::File::create(&file_path).unwrap();
            writeln!(
                f,
                "# Test Document\n\nThis is a test document with enough content to be meaningful."
            )
            .unwrap();
            writeln!(
                f,
                "\n## Section Two\n\nMore content here for the chunker to work with."
            )
            .unwrap();
        }

        let result = ingest_file(&db, &file_path).unwrap();
        assert_eq!(result.file_type, FileType::Markdown);
        assert!(result.chunks_stored > 0);
        assert!(result.total_chars > 50);
        assert!(result.source_id.starts_with("ingest:"));
    }

    /// A tiny Rust source file is detected as code and stored as a single chunk.
    #[test]
    fn ingest_code_file() {
        let db = test_db();
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("example.rs");
        // Inner scope so the file handle is dropped (closed) before ingestion.
        {
            let mut f = std::fs::File::create(&file_path).unwrap();
            writeln!(f, "fn main() {{").unwrap();
            writeln!(f, "    println!(\"Hello, world!\");").unwrap();
            writeln!(f, "}}").unwrap();
        }

        let result = ingest_file(&db, &file_path).unwrap();
        assert_eq!(result.file_type, FileType::RustSource);
        assert_eq!(result.chunks_stored, 1); // small file = 1 chunk
    }

    /// An empty file is rejected with the "no extractable text" error.
    #[test]
    fn ingest_empty_file_fails() {
        let db = test_db();
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("empty.txt");
        std::fs::File::create(&file_path).unwrap();

        let err = ingest_file(&db, &file_path).unwrap_err();
        assert!(err.to_string().contains("no extractable text"));
    }

    /// A file with an unrecognized extension is rejected before any text is read.
    #[test]
    fn ingest_unsupported_extension_fails() {
        let db = test_db();
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("photo.png");
        std::fs::write(&file_path, b"fake png data").unwrap();

        let err = ingest_file(&db, &file_path).unwrap_err();
        assert!(err.to_string().contains("unsupported file type"));
    }

    /// Directory ingestion picks up the supported files and ignores the rest.
    #[test]
    fn ingest_directory_collects_supported_files() {
        let db = test_db();
        let dir = tempfile::tempdir().unwrap();

        // Create some supported files
        std::fs::write(
            dir.path().join("a.md"),
            "# Doc A\nSome markdown content here.",
        )
        .unwrap();
        std::fs::write(
            dir.path().join("b.txt"),
            "Plain text content for ingestion.",
        )
        .unwrap();
        // Unsupported file — should be skipped
        std::fs::write(dir.path().join("c.png"), b"fake image").unwrap();

        let results = ingest_directory(&db, dir.path()).unwrap();
        assert_eq!(results.len(), 2);
    }

    /// After ingestion the document appears in hippocampus as `knowledge:<file name>`.
    #[test]
    fn hippocampus_registration_after_ingest() {
        let db = test_db();
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("notes.md");
        std::fs::write(&file_path, "# My Notes\nImportant information.").unwrap();

        ingest_file(&db, &file_path).unwrap();

        // Verify hippocampus has the entry
        let tables = roboticus_db::hippocampus::list_tables(&db).unwrap();
        let found = tables.iter().any(|t| t.table_name == "knowledge:notes.md");
        assert!(
            found,
            "ingested document should be registered in hippocampus"
        );
    }

    /// Every stored chunk gets an embedding row with a non-zero dimension count.
    #[test]
    fn ingest_stores_real_embeddings_for_chunks() {
        let db = test_db();
        let dir = tempfile::tempdir().unwrap();
        let file_path = dir.path().join("embeddings.md");
        std::fs::write(
            &file_path,
            "# Embeddings\nThis document should create a non-empty deterministic embedding.",
        )
        .unwrap();

        let result = ingest_file(&db, &file_path).unwrap();
        assert!(result.chunks_stored > 0);

        // Query the embeddings table directly: one row per stored chunk, and
        // the smallest `dimensions` value among them must be positive.
        let conn = db.conn();
        let (count, min_dimensions): (i64, i64) = conn
            .query_row(
                "SELECT COUNT(*), COALESCE(MIN(dimensions), 0)
                 FROM embeddings
                 WHERE source_table = 'ingested_knowledge' AND source_id = ?1",
                [&result.source_id],
                |row| Ok((row.get(0)?, row.get(1)?)),
            )
            .unwrap();
        assert_eq!(count, result.chunks_stored as i64);
        assert!(
            min_dimensions > 0,
            "ingested embeddings should not be empty"
        );
    }
}