use std::path::Path;
use roboticus_core::Result;
use serde::{Deserialize, Serialize};
use tracing::warn;
use crate::retrieval::{ChunkConfig, chunk_text};
/// Kinds of files the ingestion pipeline recognises.
///
/// A variant is selected from the file extension by `FileType::from_path`;
/// the extensions listed on each variant are the ones that function matches.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileType {
/// Markdown documents (`.md`, `.markdown`).
Markdown,
/// Plain-text documents (`.txt`, `.text`).
PlainText,
/// Rust source code (`.rs`).
RustSource,
/// Python source code (`.py`).
PythonSource,
/// JavaScript source code (`.js`, `.jsx`, `.mjs`).
JavaScriptSource,
/// TypeScript source code (`.ts`, `.tsx`, `.mts`).
TypeScriptSource,
/// PDF documents (`.pdf`); their text is obtained through `pdf_extract`
/// rather than by reading the file as UTF-8.
Pdf,
}
impl FileType {
    /// Determines the file type from the extension of `path`.
    ///
    /// The extension is lower-cased before matching, so `README.MD` and
    /// `readme.md` are treated alike. Returns `None` when the path has no
    /// extension, the extension is not valid UTF-8, or it is not one of the
    /// supported extensions.
    pub fn from_path(path: &Path) -> Option<Self> {
        let raw_ext = path.extension().and_then(|ext| ext.to_str())?;
        let normalized = raw_ext.to_lowercase();
        let detected = match normalized.as_str() {
            "md" | "markdown" => Self::Markdown,
            "txt" | "text" => Self::PlainText,
            "rs" => Self::RustSource,
            "py" => Self::PythonSource,
            "js" | "jsx" | "mjs" => Self::JavaScriptSource,
            "ts" | "tsx" | "mts" => Self::TypeScriptSource,
            "pdf" => Self::Pdf,
            _ => return None,
        };
        Some(detected)
    }

    /// Returns `true` for the programming-language variants and `false` for
    /// the document variants.
    pub fn is_code(&self) -> bool {
        match self {
            Self::RustSource
            | Self::PythonSource
            | Self::JavaScriptSource
            | Self::TypeScriptSource => true,
            Self::Markdown | Self::PlainText | Self::Pdf => false,
        }
    }

    /// Returns a short, stable, lower-case name for the file type.
    pub fn label(&self) -> &'static str {
        match *self {
            Self::Pdf => "pdf",
            Self::Markdown => "markdown",
            Self::PlainText => "plain_text",
            Self::RustSource => "rust",
            Self::PythonSource => "python",
            Self::JavaScriptSource => "javascript",
            Self::TypeScriptSource => "typescript",
        }
    }
}
/// Returns the textual content of the file at `path`.
///
/// PDF files are delegated to `extract_pdf_text`; every other file type is
/// read from disk as a UTF-8 string.
///
/// # Errors
///
/// Returns `RoboticusError::Config` when the file cannot be read as UTF-8
/// text, or whatever error the PDF extraction path reports.
pub fn extract_text(path: &Path, file_type: FileType) -> Result<String> {
    // PDFs need a dedicated extractor; handle them first and bail out.
    if matches!(file_type, FileType::Pdf) {
        return extract_pdf_text(path);
    }

    std::fs::read_to_string(path).map_err(|e| {
        roboticus_core::RoboticusError::Config(format!(
            "failed to read {}: {e}",
            path.display()
        ))
    })
}
/// Loads the PDF at `path` into memory and extracts its text with
/// `pdf_extract`.
///
/// # Errors
///
/// Returns `RoboticusError::Config` when the file cannot be read or when
/// `pdf_extract` fails to extract text from its bytes.
fn extract_pdf_text(path: &Path) -> Result<String> {
    let raw = match std::fs::read(path) {
        Ok(raw) => raw,
        Err(e) => {
            return Err(roboticus_core::RoboticusError::Config(format!(
                "failed to read PDF {}: {e}",
                path.display()
            )));
        }
    };

    match pdf_extract::extract_text_from_mem(&raw) {
        Ok(extracted) => Ok(extracted),
        Err(e) => Err(roboticus_core::RoboticusError::Config(format!(
            "failed to extract text from PDF {}: {e}",
            path.display()
        ))),
    }
}
/// Summary of one successful `ingest_file` call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IngestResult {
/// Display form of the path that was passed to `ingest_file`.
pub file_path: String,
/// File type detected from the path's extension.
pub file_type: FileType,
/// Number of chunks for which both the semantic-memory entry and the
/// embedding entry were stored; chunks that failed either step are not counted.
pub chunks_stored: usize,
/// Length of the text extracted from the file, measured by `ingest_file`
/// on the whole text before it is chunked.
pub total_chars: usize,
/// Identifier shared by all embeddings of this file: `ingest:` followed by
/// the canonicalised path with `/` and `\` replaced by `:`.
pub source_id: String,
}
/// Largest file size, in bytes, that `ingest_file` accepts (10 MiB); larger
/// files are rejected before their contents are read.
const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024;
/// Ingests a single file: extracts its text, splits it into chunks, and
/// stores every chunk as a semantic-memory entry plus an embedding.
///
/// After the chunks are processed the document is registered with the
/// hippocampus under `knowledge:<file name>`.
///
/// Storage is best-effort per chunk: a chunk whose semantic-memory or
/// embedding write fails is logged with `warn!` and skipped, and only fully
/// stored chunks count towards `IngestResult::chunks_stored`. A failed
/// hippocampus registration is likewise only logged.
///
/// # Errors
///
/// Returns `RoboticusError::Config` when
/// - the path's metadata cannot be read,
/// - the path is not a regular file,
/// - the file is larger than `MAX_FILE_SIZE`,
/// - the extension is not a supported `FileType`,
/// - text extraction fails, or
/// - the extracted text is empty or whitespace-only.
pub fn ingest_file(db: &roboticus_db::Database, path: &Path) -> Result<IngestResult> {
    let metadata = std::fs::metadata(path).map_err(|e| {
        roboticus_core::RoboticusError::Config(format!("cannot access {}: {e}", path.display()))
    })?;
    if !metadata.is_file() {
        return Err(roboticus_core::RoboticusError::Config(format!(
            "{} is not a regular file",
            path.display()
        )));
    }
    // Reject oversized files before reading any content into memory.
    if metadata.len() > MAX_FILE_SIZE {
        return Err(roboticus_core::RoboticusError::Config(format!(
            "{} exceeds maximum file size ({} bytes > {} bytes)",
            path.display(),
            metadata.len(),
            MAX_FILE_SIZE
        )));
    }

    let file_type = FileType::from_path(path).ok_or_else(|| {
        roboticus_core::RoboticusError::Config(format!("unsupported file type: {}", path.display()))
    })?;

    let text = extract_text(path, file_type)?;
    if text.trim().is_empty() {
        return Err(roboticus_core::RoboticusError::Config(format!(
            "{} contains no extractable text",
            path.display()
        )));
    }
    // `str::len` is a UTF-8 byte count; the result field is expressed in
    // characters, so count Unicode scalar values instead.
    let total_chars = text.chars().count();

    let config = ChunkConfig::default();
    let chunks = chunk_text(&text, &config);

    // Prefer the canonical path so the same file always maps to the same
    // source id; fall back to the path as given if canonicalisation fails.
    let canonical = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
    let source_id = format!(
        "ingest:{}",
        canonical.to_string_lossy().replace(['/', '\\'], ":")
    );
    let file_name = path
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unknown");

    // The category depends only on the file type, so compute it once.
    let category = if file_type.is_code() {
        "ingested_code"
    } else {
        "ingested_document"
    };

    let mut stored: usize = 0;
    for chunk in &chunks {
        let chunk_id = format!("{}:chunk:{}", source_id, chunk.index);
        // Keep previews to at most 200 bytes, cutting on a char boundary so
        // the slice cannot split a multi-byte character.
        let preview = if chunk.text.len() > 200 {
            format!("{}...", &chunk.text[..chunk.text.floor_char_boundary(200)])
        } else {
            chunk.text.clone()
        };
        let key = format!("{}:{}", file_name, chunk.index);

        if let Err(e) = roboticus_db::memory::store_semantic(db, category, &key, &chunk.text, 0.8) {
            warn!(error = %e, chunk = chunk.index, "failed to store semantic memory for chunk");
            continue;
        }

        let embedding = roboticus_llm::fallback_embedding(&chunk.text);
        if let Err(e) = roboticus_db::embeddings::store_embedding(
            db,
            &chunk_id,
            "ingested_knowledge",
            &source_id,
            &preview,
            &embedding,
        ) {
            warn!(error = %e, chunk = chunk.index, "failed to store embedding entry for chunk");
            continue;
        }

        stored += 1;
    }

    let description = format!(
        "Ingested {} ({}, {} chunks)",
        file_name,
        file_type.label(),
        stored
    );
    // Registration is best-effort: the chunks are already stored, so a
    // failure here is reported but does not fail the ingestion.
    if let Err(e) = roboticus_db::hippocampus::register_table(
        db,
        &format!("knowledge:{}", file_name),
        &description,
        &[],
        "system",
        false,
        "read",
        // The chunk count of a file capped at MAX_FILE_SIZE bytes always fits in i64.
        stored as i64,
    ) {
        warn!(error = %e, "failed to register ingested document in hippocampus");
    }

    Ok(IngestResult {
        file_path: path.display().to_string(),
        file_type,
        chunks_stored: stored,
        total_chars,
        source_id,
    })
}
/// Ingests every supported regular file directly inside `dir`.
///
/// Subdirectories are not descended into. Entries whose extension does not
/// map to a `FileType` are ignored. The operation is best-effort: a directory
/// entry that cannot be read, or a file whose ingestion fails, is logged with
/// `warn!` and skipped, so the returned vector holds only the successful
/// ingestions.
///
/// # Errors
///
/// Returns `RoboticusError::Config` when `dir` is not a directory or cannot
/// be opened for listing.
pub fn ingest_directory(db: &roboticus_db::Database, dir: &Path) -> Result<Vec<IngestResult>> {
    if !dir.is_dir() {
        return Err(roboticus_core::RoboticusError::Config(format!(
            "{} is not a directory",
            dir.display()
        )));
    }

    let entries = std::fs::read_dir(dir).map_err(|e| {
        roboticus_core::RoboticusError::Config(format!(
            "cannot read directory {}: {e}",
            dir.display()
        ))
    })?;

    let mut results = Vec::new();
    for entry in entries {
        // Report entries that fail to enumerate instead of dropping them
        // silently; keep going so one bad entry does not abort the batch.
        let entry = match entry {
            Ok(entry) => entry,
            Err(e) => {
                warn!(
                    error = %e,
                    dir = %dir.display(),
                    "skipping unreadable directory entry during ingestion"
                );
                continue;
            }
        };

        let path = entry.path();
        if !path.is_file() || FileType::from_path(&path).is_none() {
            continue;
        }

        match ingest_file(db, &path) {
            Ok(result) => results.push(result),
            Err(e) => {
                warn!(
                    error = %e,
                    file = %path.display(),
                    "skipping file during directory ingestion"
                );
            }
        }
    }
    Ok(results)
}
// Unit tests for file-type detection and the ingestion entry points.
// Each ingestion test uses its own in-memory database and temporary directory.
#[cfg(test)]
mod tests {
use super::*;
use std::io::Write;
// Creates an in-memory database and initialises its schema.
fn test_db() -> roboticus_db::Database {
let db = roboticus_db::Database::new(":memory:").unwrap();
roboticus_db::schema::initialize_db(&db).unwrap();
db
}
// Supported extensions map to their variant; unsupported ones yield `None`.
#[test]
fn file_type_detection() {
assert_eq!(
FileType::from_path(Path::new("readme.md")),
Some(FileType::Markdown)
);
assert_eq!(
FileType::from_path(Path::new("main.rs")),
Some(FileType::RustSource)
);
assert_eq!(
FileType::from_path(Path::new("app.tsx")),
Some(FileType::TypeScriptSource)
);
assert_eq!(
FileType::from_path(Path::new("doc.pdf")),
Some(FileType::Pdf)
);
assert_eq!(FileType::from_path(Path::new("image.png")), None);
assert_eq!(FileType::from_path(Path::new("archive.zip")), None);
}
// A small two-section markdown file is ingested and yields at least one chunk.
#[test]
fn ingest_markdown_file() {
let db = test_db();
let dir = tempfile::tempdir().unwrap();
let file_path = dir.path().join("test.md");
// Scope the handle so the file is closed before it is ingested.
{
let mut f = std::fs::File::create(&file_path).unwrap();
writeln!(
f,
"# Test Document\n\nThis is a test document with enough content to be meaningful."
)
.unwrap();
writeln!(
f,
"\n## Section Two\n\nMore content here for the chunker to work with."
)
.unwrap();
}
let result = ingest_file(&db, &file_path).unwrap();
assert_eq!(result.file_type, FileType::Markdown);
assert!(result.chunks_stored > 0);
assert!(result.total_chars > 50);
assert!(result.source_id.starts_with("ingest:"));
}
// A three-line Rust file is detected as code and is expected to fit in exactly one chunk.
#[test]
fn ingest_code_file() {
let db = test_db();
let dir = tempfile::tempdir().unwrap();
let file_path = dir.path().join("example.rs");
// Scope the handle so the file is closed before it is ingested.
{
let mut f = std::fs::File::create(&file_path).unwrap();
writeln!(f, "fn main() {{").unwrap();
writeln!(f, " println!(\"Hello, world!\");").unwrap();
writeln!(f, "}}").unwrap();
}
let result = ingest_file(&db, &file_path).unwrap();
assert_eq!(result.file_type, FileType::RustSource);
assert_eq!(result.chunks_stored, 1); }
// A zero-byte file is rejected with the "no extractable text" error.
#[test]
fn ingest_empty_file_fails() {
let db = test_db();
let dir = tempfile::tempdir().unwrap();
let file_path = dir.path().join("empty.txt");
std::fs::File::create(&file_path).unwrap();
let err = ingest_file(&db, &file_path).unwrap_err();
assert!(err.to_string().contains("no extractable text"));
}
// A file with an unrecognised extension is rejected as unsupported.
#[test]
fn ingest_unsupported_extension_fails() {
let db = test_db();
let dir = tempfile::tempdir().unwrap();
let file_path = dir.path().join("photo.png");
std::fs::write(&file_path, b"fake png data").unwrap();
let err = ingest_file(&db, &file_path).unwrap_err();
assert!(err.to_string().contains("unsupported file type"));
}
// Directory ingestion picks up the two supported files and ignores the .png.
#[test]
fn ingest_directory_collects_supported_files() {
let db = test_db();
let dir = tempfile::tempdir().unwrap();
std::fs::write(
dir.path().join("a.md"),
"# Doc A\nSome markdown content here.",
)
.unwrap();
std::fs::write(
dir.path().join("b.txt"),
"Plain text content for ingestion.",
)
.unwrap();
std::fs::write(dir.path().join("c.png"), b"fake image").unwrap();
let results = ingest_directory(&db, dir.path()).unwrap();
assert_eq!(results.len(), 2);
}
// After ingestion the document is listed in the hippocampus as `knowledge:<file name>`.
#[test]
fn hippocampus_registration_after_ingest() {
let db = test_db();
let dir = tempfile::tempdir().unwrap();
let file_path = dir.path().join("notes.md");
std::fs::write(&file_path, "# My Notes\nImportant information.").unwrap();
ingest_file(&db, &file_path).unwrap();
let tables = roboticus_db::hippocampus::list_tables(&db).unwrap();
let found = tables.iter().any(|t| t.table_name == "knowledge:notes.md");
assert!(
found,
"ingested document should be registered in hippocampus"
);
}
// Every stored chunk has an embeddings row for this source id, and each row
// reports a positive dimension count.
#[test]
fn ingest_stores_real_embeddings_for_chunks() {
let db = test_db();
let dir = tempfile::tempdir().unwrap();
let file_path = dir.path().join("embeddings.md");
std::fs::write(
&file_path,
"# Embeddings\nThis document should create a non-empty deterministic embedding.",
)
.unwrap();
let result = ingest_file(&db, &file_path).unwrap();
assert!(result.chunks_stored > 0);
let conn = db.conn();
// COALESCE makes the minimum 0 when no rows match, so the final assertion fails in that case.
let (count, min_dimensions): (i64, i64) = conn
.query_row(
"SELECT COUNT(*), COALESCE(MIN(dimensions), 0)
FROM embeddings
WHERE source_table = 'ingested_knowledge' AND source_id = ?1",
[&result.source_id],
|row| Ok((row.get(0)?, row.get(1)?)),
)
.unwrap();
assert_eq!(count, result.chunks_stored as i64);
assert!(
min_dimensions > 0,
"ingested embeddings should not be empty"
);
}
}