// brainwires_storage/file_context.rs
//! File Context Manager
//!
//! Manages file content for context injection with smart chunking for large files.
//! Prevents re-injection of files already in context and retrieves relevant
//! portions of large files based on query context.

use anyhow::{Context, Result};
use sha2::{Digest, Sha256};
use std::collections::{HashMap, HashSet};

/// Maximum file length before chunking kicks in.
/// NOTE(review): compared against `str::len()`, which counts bytes, not
/// characters — the "chars" wording is an approximation for ASCII-heavy code.
const MAX_DIRECT_FILE_CHARS: usize = 50_000;
/// Target chunk size for large files (same byte-vs-char caveat as above).
const LARGE_FILE_CHUNK_SIZE: usize = 10_000;
/// Maximum number of chunks to return from a single large file.
const MAX_FILE_CHUNKS: usize = 5;
17
18/// Content returned from file context manager
/// Content returned from the file context manager.
///
/// The variant tells the caller how much of the file they received and
/// whether a follow-up request for more content makes sense.
#[derive(Debug, Clone)]
pub enum FileContent {
    /// Small file — the full content is returned verbatim.
    Full(String),
    /// Large file — only the selected chunks are returned.
    Chunked {
        /// File path the chunks were read from.
        path: String,
        /// Total file size (bytes, via `str::len()`).
        total_size: usize,
        /// Retrieved chunks with line ranges and relevance scores.
        chunks: Vec<FileChunk>,
        /// Whether more content beyond these chunks is available.
        has_more: bool,
    },
    /// File already in context — only a reference to the path is returned,
    /// so the same content is never injected twice in one turn.
    AlreadyInContext(String),
}
37
38/// A chunk of file content with context
/// A contiguous, line-aligned slice of a file's content.
#[derive(Debug, Clone)]
pub struct FileChunk {
    /// Chunk text (whole lines, joined with `\n`, no trailing newline).
    pub content: String,
    /// Starting line number (1-indexed, inclusive).
    pub line_start: usize,
    /// Ending line number (1-indexed, inclusive).
    pub line_end: usize,
    /// Relevance score in [0.0, 1.0]; 1.0 when no query ranking was applied.
    pub relevance_score: f32,
}
50
51/// Manages file content for context injection
/// Manages file content for context injection.
///
/// Tracks which files are already present in the current context (to avoid
/// re-injection) and caches the chunk index of large files for follow-ups.
pub struct FileContextManager {
    /// Paths already in the current context (checked before re-injection).
    context_files: HashSet<String>,
    /// Cache of indexed file chunks (path -> all chunks of that file).
    file_chunks: HashMap<String, Vec<FileChunk>>,
}
58
59impl Default for FileContextManager {
60    fn default() -> Self {
61        Self::new()
62    }
63}
64
65impl FileContextManager {
66    /// Create a new file context manager
67    pub fn new() -> Self {
68        Self {
69            context_files: HashSet::new(),
70            file_chunks: HashMap::new(),
71        }
72    }
73
74    /// Compute SHA256 hash of content
75    pub fn compute_hash(content: &str) -> String {
76        let mut hasher = Sha256::new();
77        hasher.update(content.as_bytes());
78        format!("{:x}", hasher.finalize())
79    }
80
81    /// Check if a file is already in the current context
82    pub fn is_in_context(&self, path: &str) -> bool {
83        self.context_files.contains(path)
84    }
85
86    /// Mark a file as being in the current context
87    pub fn mark_in_context(&mut self, path: &str) {
88        self.context_files.insert(path.to_string());
89    }
90
91    /// Clear the context tracking (for new conversation turns)
92    pub fn clear_context(&mut self) {
93        self.context_files.clear();
94    }
95
96    /// Get the number of files currently in context
97    pub fn context_file_count(&self) -> usize {
98        self.context_files.len()
99    }
100
101    /// Get file content with smart routing based on size
102    ///
103    /// # Arguments
104    /// * `path` - Path to the file
105    /// * `query_context` - Optional query to use for finding relevant chunks
106    ///
107    /// # Returns
108    /// * `FileContent::Full` for small files
109    /// * `FileContent::Chunked` for large files with relevant portions
110    /// * `FileContent::AlreadyInContext` if file was previously loaded
111    pub async fn get_file_content(
112        &mut self,
113        path: &str,
114        query_context: Option<&str>,
115    ) -> Result<FileContent> {
116        // Check if already in context
117        if self.is_in_context(path) {
118            return Ok(FileContent::AlreadyInContext(path.to_string()));
119        }
120
121        // Read the file
122        let content = tokio::fs::read_to_string(path)
123            .await
124            .with_context(|| format!("Failed to read file: {}", path))?;
125
126        // Small file: return full content
127        if content.len() <= MAX_DIRECT_FILE_CHARS {
128            self.mark_in_context(path);
129            return Ok(FileContent::Full(content));
130        }
131
132        // Large file: get relevant chunks
133        let chunks = self.get_relevant_chunks(path, &content, query_context)?;
134
135        self.mark_in_context(path);
136
137        Ok(FileContent::Chunked {
138            path: path.to_string(),
139            total_size: content.len(),
140            chunks,
141            has_more: content.len() > MAX_DIRECT_FILE_CHARS,
142        })
143    }
144
145    /// Get relevant chunks from a large file
146    fn get_relevant_chunks(
147        &mut self,
148        path: &str,
149        content: &str,
150        query_context: Option<&str>,
151    ) -> Result<Vec<FileChunk>> {
152        // Build all chunks from file
153        let all_chunks = self.build_file_chunks(content);
154
155        // Cache chunks for future reference
156        self.file_chunks
157            .insert(path.to_string(), all_chunks.clone());
158
159        // If we have a query, try to find relevant chunks
160        if let Some(query) = query_context {
161            let relevant = self.find_relevant_chunks(&all_chunks, query);
162            if !relevant.is_empty() {
163                return Ok(relevant);
164            }
165        }
166
167        // Fall back to first N chunks
168        Ok(all_chunks.into_iter().take(MAX_FILE_CHUNKS).collect())
169    }
170
171    /// Build chunks from file content
172    fn build_file_chunks(&self, content: &str) -> Vec<FileChunk> {
173        let lines: Vec<&str> = content.lines().collect();
174        let mut chunks = Vec::new();
175        let mut current_line = 0;
176
177        while current_line < lines.len() {
178            let mut chunk_content = String::new();
179            let start_line = current_line + 1; // 1-indexed
180
181            // Build chunk up to target size
182            while current_line < lines.len() && chunk_content.len() < LARGE_FILE_CHUNK_SIZE {
183                if !chunk_content.is_empty() {
184                    chunk_content.push('\n');
185                }
186                chunk_content.push_str(lines[current_line]);
187                current_line += 1;
188            }
189
190            if !chunk_content.is_empty() {
191                chunks.push(FileChunk {
192                    content: chunk_content,
193                    line_start: start_line,
194                    line_end: current_line,
195                    relevance_score: 1.0,
196                });
197            }
198        }
199
200        chunks
201    }
202
203    /// Find chunks relevant to a query using simple keyword matching
204    fn find_relevant_chunks(&self, chunks: &[FileChunk], query: &str) -> Vec<FileChunk> {
205        let query_lower = query.to_lowercase();
206        let query_words: Vec<&str> = query_lower.split_whitespace().collect();
207
208        let mut scored_chunks: Vec<(FileChunk, f32)> = chunks
209            .iter()
210            .filter_map(|chunk| {
211                let content_lower = chunk.content.to_lowercase();
212
213                // Count matching words
214                let matching_words = query_words
215                    .iter()
216                    .filter(|word| content_lower.contains(*word))
217                    .count();
218
219                if matching_words > 0 {
220                    let score = matching_words as f32 / query_words.len() as f32;
221                    Some((
222                        FileChunk {
223                            content: chunk.content.clone(),
224                            line_start: chunk.line_start,
225                            line_end: chunk.line_end,
226                            relevance_score: score,
227                        },
228                        score,
229                    ))
230                } else {
231                    None
232                }
233            })
234            .collect();
235
236        // Sort by score descending
237        scored_chunks.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
238
239        // Take top N chunks
240        scored_chunks
241            .into_iter()
242            .take(MAX_FILE_CHUNKS)
243            .map(|(chunk, _)| chunk)
244            .collect()
245    }
246
247    /// Get specific lines from a file
248    pub async fn get_file_lines(
249        &mut self,
250        path: &str,
251        start_line: usize,
252        end_line: usize,
253    ) -> Result<FileContent> {
254        let content = tokio::fs::read_to_string(path)
255            .await
256            .with_context(|| format!("Failed to read file: {}", path))?;
257
258        let lines: Vec<&str> = content.lines().collect();
259        let total_lines = lines.len();
260
261        let start = (start_line.saturating_sub(1)).min(total_lines);
262        let end = end_line.min(total_lines);
263
264        if start >= end {
265            return Ok(FileContent::Full(String::new()));
266        }
267
268        let selected_content: String = lines[start..end].join("\n");
269
270        self.mark_in_context(path);
271
272        if selected_content.len() <= MAX_DIRECT_FILE_CHARS {
273            Ok(FileContent::Full(selected_content))
274        } else {
275            Ok(FileContent::Chunked {
276                path: path.to_string(),
277                total_size: content.len(),
278                chunks: vec![FileChunk {
279                    content: selected_content,
280                    line_start: start + 1,
281                    line_end: end,
282                    relevance_score: 1.0,
283                }],
284                has_more: true,
285            })
286        }
287    }
288
289    /// Format chunked content for display in context
290    pub fn format_content(file_content: &FileContent) -> String {
291        match file_content {
292            FileContent::Full(content) => content.clone(),
293            FileContent::AlreadyInContext(path) => {
294                format!("[File {} is already shown above]", path)
295            }
296            FileContent::Chunked {
297                path,
298                total_size,
299                chunks,
300                has_more,
301            } => {
302                let mut result = format!(
303                    "[File: {} | Size: {} chars | Showing {} relevant sections]\n\n",
304                    path,
305                    total_size,
306                    chunks.len()
307                );
308
309                for chunk in chunks {
310                    result.push_str(&format!(
311                        "--- Lines {}-{} (relevance: {:.2}) ---\n{}\n\n",
312                        chunk.line_start, chunk.line_end, chunk.relevance_score, chunk.content
313                    ));
314                }
315
316                if *has_more {
317                    result.push_str(
318                        "[... more content available, ask for specific sections or line numbers ...]\n",
319                    );
320                }
321
322                result
323            }
324        }
325    }
326}
327
#[cfg(test)]
mod tests {
    use super::*;

    /// A chunk keeps exactly the fields it was built with.
    #[test]
    fn test_file_chunk_creation() {
        let chunk = FileChunk {
            content: String::from("fn main() {}"),
            line_start: 1,
            line_end: 1,
            relevance_score: 0.95,
        };

        assert_eq!((chunk.line_start, chunk.line_end), (1, 1));
        assert!((chunk.relevance_score - 0.95).abs() < 0.01);
    }

    /// Full content is passed through unchanged.
    #[test]
    fn test_format_full_content() {
        let formatted =
            FileContextManager::format_content(&FileContent::Full(String::from("hello world")));
        assert_eq!(formatted, "hello world");
    }

    /// Already-seen files render as a reference that names the path.
    #[test]
    fn test_format_already_in_context() {
        let content = FileContent::AlreadyInContext(String::from("/path/to/file.rs"));
        let formatted = FileContextManager::format_content(&content);
        for needle in ["already shown above", "/path/to/file.rs"] {
            assert!(formatted.contains(needle), "missing: {}", needle);
        }
    }

    /// Chunked output includes the header, every chunk body, and the
    /// "more content" trailer when `has_more` is set.
    #[test]
    fn test_format_chunked_content() {
        let make = |body: &str, line: usize, score: f32| FileChunk {
            content: body.to_string(),
            line_start: line,
            line_end: line,
            relevance_score: score,
        };
        let content = FileContent::Chunked {
            path: String::from("/path/to/file.rs"),
            total_size: 50000,
            chunks: vec![make("fn main() {}", 1, 0.95), make("fn helper() {}", 10, 0.85)],
            has_more: true,
        };

        let formatted = FileContextManager::format_content(&content);

        for needle in [
            "/path/to/file.rs",
            "50000 chars",
            "2 relevant sections",
            "fn main()",
            "fn helper()",
            "more content available",
        ] {
            assert!(formatted.contains(needle), "missing: {}", needle);
        }
    }

    /// Hashing is deterministic, content-sensitive, and 64 hex chars long.
    #[test]
    fn test_compute_hash() {
        let first = FileContextManager::compute_hash("hello world");
        let second = FileContextManager::compute_hash("hello world");
        let other = FileContextManager::compute_hash("different content");

        assert_eq!(first, second);
        assert_ne!(first, other);
        assert_eq!(first.len(), 64); // SHA-256 renders as 64 hex digits
    }

    /// Marking, querying, and clearing context membership round-trips.
    #[test]
    fn test_context_tracking() {
        let mut manager = FileContextManager::new();
        let file = "/some/file.rs";

        assert!(!manager.is_in_context(file));
        assert_eq!(manager.context_file_count(), 0);

        manager.mark_in_context(file);
        assert!(manager.is_in_context(file));
        assert_eq!(manager.context_file_count(), 1);

        manager.clear_context();
        assert!(!manager.is_in_context(file));
        assert_eq!(manager.context_file_count(), 0);
    }

    /// A small multi-line file yields at least one chunk starting at line 1.
    #[test]
    fn test_build_file_chunks() {
        let manager = FileContextManager::new();
        let text = ["line 1", "line 2", "line 3", "line 4", "line 5"].join("\n");

        let chunks = manager.build_file_chunks(&text);

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].line_start, 1);
    }

    /// Keyword search surfaces chunks mentioning the query terms first.
    #[test]
    fn test_find_relevant_chunks() {
        let make = |body: &str, line: usize| FileChunk {
            content: body.to_string(),
            line_start: line,
            line_end: line,
            relevance_score: 1.0,
        };
        let manager = FileContextManager::new();
        let chunks = vec![
            make("This is about authentication and login", 1),
            make("This is about database queries", 2),
            make("This handles user login flow", 3),
        ];

        let relevant = manager.find_relevant_chunks(&chunks, "login authentication");

        assert!(!relevant.is_empty());
        let top = &relevant[0].content;
        assert!(top.contains("login") || top.contains("authentication"));
    }
}