// file_rag/file_rag.rs — File RAG example: ingest a PDF into LLMBrain and query it.

1use std::fs::{self};
2use std::path::{Path, PathBuf};
3use std::time::Instant;
4
5use anyhow::{Context, Result};
6use llm_brain::LLMBrain;
7use pdf_extract::extract_text;
8use serde_json::json;
9
// Target size of each text chunk, measured in whitespace-separated words.
const DEFAULT_CHUNK_SIZE: usize = 500; // Approximate words per chunk
11
12// --- Config Helper (Copied and adapted) ---
13fn ensure_config_exists() -> Result<()> {
14    let config_dir = PathBuf::from("./config");
15    let default_config_path = config_dir.join("default.toml");
16    if default_config_path.exists() {
17        return Ok(());
18    }
19    if !config_dir.exists() {
20        fs::create_dir_all(&config_dir)?;
21    }
22    let default_toml_content = r#"
23[database]
24path = "./llm_brain_file_rag_db" # DB path for this example
25namespace = "file_rag_ns"
26database = "file_rag_db"
27
28[llm]
29# Provide necessary LLM config
30"#;
31    fs::write(&default_config_path, default_toml_content)?;
32    println!("Created default config: {}", default_config_path.display());
33    Ok(())
34}
35// --- End Config Helper ---
36
/// Splits `text` into chunks of roughly `chunk_size` whitespace-separated words.
///
/// Words are re-joined with single spaces, so runs of whitespace (including
/// newlines) are collapsed. An empty or whitespace-only input yields an empty
/// vector. A `chunk_size` of 0 is clamped to 1 instead of panicking
/// (`slice::chunks` panics on a zero chunk size).
fn chunk_text_by_words(text: &str, chunk_size: usize) -> Vec<String> {
    // Clamp to avoid the `chunks(0)` panic for a degenerate chunk size.
    let size = chunk_size.max(1);
    let words: Vec<&str> = text.split_whitespace().collect();
    words.chunks(size).map(|chunk| chunk.join(" ")).collect()
}
46
47async fn ingest_pdf(llm_brain: &LLMBrain, pdf_path: &Path) -> Result<Vec<String>> {
48    let start_time = Instant::now();
49    println!("Processing PDF: {}", pdf_path.display());
50
51    // 1. Extract text
52    let text = extract_text(pdf_path).context("Failed to extract text from PDF")?;
53    println!("Extracted {} characters.", text.len());
54
55    // 2. Chunk text
56    let chunks = chunk_text_by_words(&text, DEFAULT_CHUNK_SIZE);
57    println!("Split into {} chunks.", chunks.len());
58
59    // 3. Add chunks as MemoryFragments
60    let filename = pdf_path
61        .file_name()
62        .unwrap_or_default()
63        .to_string_lossy()
64        .to_string();
65    let file_stem = pdf_path
66        .file_stem()
67        .unwrap_or_default()
68        .to_string_lossy()
69        .to_string();
70
71    let mut chunk_ids = Vec::new();
72
73    for (i, chunk_content) in chunks.into_iter().enumerate() {
74        let chunk_num = i + 1;
75        let metadata = json!({
76            "memory_type": "DocumentChunk",
77            "source_file": filename,
78            "chunk_number": chunk_num,
79            // Optionally add entity_name for direct reference if needed
80            "entity_name": format!("{}_chunk_{}", file_stem, chunk_num)
81        });
82
83        // Add memory (handle potential errors)
84        match llm_brain.add_memory(chunk_content, metadata).await {
85            Ok(id) => {
86                chunk_ids.push(id.to_string());
87                if chunk_num % 10 == 0 {
88                    println!("Added chunk {chunk_num}...");
89                }
90            }
91            Err(e) => {
92                eprintln!("Failed to add chunk {chunk_num}: {e}");
93                // Decide whether to continue or stop on error
94            }
95        }
96    }
97
98    // 4. (Optional) Add a MemoryFragment for the document itself, linking to chunks
99    let doc_content = format!(
100        "Metadata for document: {}. Contains {} chunks.",
101        filename,
102        chunk_ids.len()
103    );
104    let doc_metadata = json!({
105        "memory_type": "DocumentMeta",
106        "entity_name": file_stem,
107        "properties": {
108            "file_path": pdf_path.to_string_lossy(),
109            "file_name": filename,
110            "chunk_count": chunk_ids.len()
111        },
112        "relationships": {
113            "contains_chunks": chunk_ids // Store the actual DB IDs
114        }
115    });
116    llm_brain.add_memory(doc_content, doc_metadata).await?;
117    println!("Added document metadata entry.");
118
119    println!("Ingestion completed in {:?}.", start_time.elapsed());
120    Ok(chunk_ids) // Return chunk IDs if needed elsewhere
121}
122
123async fn run_queries(llm_brain: &LLMBrain, queries: &[&str]) -> Result<()> {
124    println!("\n--- Running Queries ---");
125    for query in queries {
126        println!("\nQuery: {query}");
127        let start_time = Instant::now();
128        match llm_brain.recall(query, 3).await {
129            // Retrieve top 3 relevant chunks
130            Ok(results) => {
131                println!(
132                    "Found {} relevant chunks in {:?}:",
133                    results.len(),
134                    start_time.elapsed()
135                );
136                if results.is_empty() {
137                    println!("  (No relevant chunks found)");
138                } else {
139                    for (fragment, score) in results {
140                        println!(
141                            "  - Score: {:.4}, Source: {:?}, Chunk: {:?}\n    Content: {:.150}...",
142                            score,
143                            fragment.metadata.get("source_file"),
144                            fragment.metadata.get("chunk_number"),
145                            fragment.content.replace('\n', " ") // Basic formatting
146                        );
147                    }
148                }
149            }
150            Err(e) => {
151                eprintln!("  Error during recall: {e}");
152            }
153        }
154    }
155    Ok(())
156}
157
158#[tokio::main]
159async fn main() -> Result<()> {
160    println!("--- Starting File RAG Example ---");
161    ensure_config_exists()?;
162    let llm_brain = LLMBrain::launch().await?;
163
164    // --- Ingestion Step ---
165    // IMPORTANT: Adjust this path to your actual test PDF file
166    // Using the path relative to workspace root as seen in python example context
167    let pdf_path = PathBuf::from("tests/document.pdf");
168
169    if !pdf_path.exists() {
170        eprintln!("Error: PDF file not found at {}", pdf_path.display());
171        eprintln!("Please place a PDF file at that location or update the path in the code.");
172        return Ok(()); // Exit gracefully
173    }
174
175    // Ingest the PDF content
176    if let Err(e) = ingest_pdf(&llm_brain, &pdf_path).await {
177        eprintln!("Error during PDF ingestion: {e}");
178        // Decide if program should terminate or continue
179    }
180
181    // --- Query Step ---
182    let test_queries = [
183        "What is the main topic of the document?",
184        "Summarize the key points from the document.",
185        "What are the main conclusions drawn in the document?",
186        "what is silha center",
187        "who is Charlotte Higgins",
188        "Explain the lawsuits",
189        "Explain OpenAI's Involvement",
190        "who is Mike Masnick",
191        "content moderation liability shield", // Added query
192    ];
193
194    if let Err(e) = run_queries(&llm_brain, &test_queries).await {
195        eprintln!("Error during querying: {e}");
196    }
197
198    println!("\n--- File RAG Example Finished ---");
199    println!("Note: Database is at ./llm_brain_file_rag_db");
200
201    Ok(())
202}