// lc/vector_db.rs

use anyhow::Result;
use dashmap::DashMap;
use hnsw_rs::prelude::*;
use parking_lot::RwLock;
use rayon::prelude::*;
use rusqlite::{params, Connection};
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::PathBuf;
use std::sync::Arc;

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorEntry {
    pub id: i64,
    pub text: String,
    pub vector: Vec<f64>,
    pub model: String,
    pub provider: String,
    pub created_at: chrono::DateTime<chrono::Utc>,
    pub file_path: Option<String>,
    pub chunk_index: Option<i32>,
    pub total_chunks: Option<i32>,
}

// HNSW index for fast approximate nearest neighbor search
type HnswIndex = Hnsw<'static, f64, DistCosine>;

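/// A minimal usage sketch (hedged: the `lc::vector_db` module path, model
/// name, and provider name below are assumptions, and the embedding vector
/// would normally come from an embedding API rather than a literal):
///
/// ```ignore
/// use lc::vector_db::VectorDatabase;
///
/// fn demo() -> anyhow::Result<()> {
///     let db = VectorDatabase::new("notes")?;
///     db.add_vector("hello world", &[0.1, 0.2, 0.3], "some-embedding-model", "some-provider")?;
///     // Returns up to 5 (entry, cosine-similarity) pairs, best first.
///     for (entry, score) in db.find_similar(&[0.1, 0.2, 0.3], 5)? {
///         println!("{:.3}  {}", score, entry.text);
///     }
///     Ok(())
/// }
/// ```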
pub struct VectorDatabase {
    db_path: PathBuf,
    // In-memory HNSW index for fast similarity search
    hnsw_index: Arc<RwLock<Option<HnswIndex>>>,
    // Cache for vector entries to avoid repeated DB queries
    vector_cache: Arc<DashMap<i64, VectorEntry>>,
    // Track if index needs rebuilding
    index_dirty: Arc<RwLock<bool>>,
}

impl std::fmt::Debug for VectorDatabase {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("VectorDatabase")
            .field("db_path", &self.db_path)
            .field("vector_cache_len", &self.vector_cache.len())
            .field("index_dirty", &self.index_dirty)
            .finish()
    }
}

impl VectorDatabase {
    pub fn new(name: &str) -> Result<Self> {
        let embeddings_dir = Self::embeddings_dir()?;
        fs::create_dir_all(&embeddings_dir)?;

        let db_path = embeddings_dir.join(format!("{}.db", name));

        let db = Self {
            db_path,
            hnsw_index: Arc::new(RwLock::new(None)),
            vector_cache: Arc::new(DashMap::new()),
            index_dirty: Arc::new(RwLock::new(true)),
        };

        db.initialize()?;
        Ok(db)
    }

    pub fn embeddings_dir() -> Result<PathBuf> {
        let home_dir =
            dirs::home_dir().ok_or_else(|| anyhow::anyhow!("Could not find home directory"))?;
        Ok(home_dir.join("Library/Application Support/lc/embeddings"))
    }

    pub fn list_databases() -> Result<Vec<String>> {
        let embeddings_dir = Self::embeddings_dir()?;
        Self::list_databases_in_dir(&embeddings_dir)
    }

    pub fn list_databases_in_dir(embeddings_dir: &std::path::Path) -> Result<Vec<String>> {
        if !embeddings_dir.exists() {
            return Ok(Vec::new());
        }

        let mut databases = Vec::new();

        for entry in fs::read_dir(embeddings_dir)? {
            let entry = entry?;
            let path = entry.path();

            if path.is_file() {
                if let Some(extension) = path.extension() {
                    if extension == "db" {
                        if let Some(name) = path.file_stem().and_then(|s| s.to_str()) {
                            databases.push(name.to_string());
                        }
                    }
                }
            }
        }

        databases.sort();
        Ok(databases)
    }

    pub fn delete_database(name: &str) -> Result<()> {
        let embeddings_dir = Self::embeddings_dir()?;
        Self::delete_database_in_dir(name, &embeddings_dir)
    }

    pub fn delete_database_in_dir(name: &str, embeddings_dir: &std::path::Path) -> Result<()> {
        let db_path = embeddings_dir.join(format!("{}.db", name));

        if db_path.exists() {
            fs::remove_file(db_path)?;
        }

        Ok(())
    }

    fn initialize(&self) -> Result<()> {
        let conn = Connection::open(&self.db_path)?;

        // First, create the table with the basic schema if it doesn't exist
        conn.execute(
            "CREATE TABLE IF NOT EXISTS vectors (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                text TEXT NOT NULL,
                vector BLOB NOT NULL,
                model TEXT NOT NULL,
                provider TEXT NOT NULL,
                created_at TEXT NOT NULL
            )",
            [],
        )?;

        // Check if we need to migrate the schema by checking for missing columns
        let mut has_file_path = false;
        let mut has_chunk_index = false;
        let mut has_total_chunks = false;

        // Query the table schema to see what columns exist
        let mut stmt = conn.prepare("PRAGMA table_info(vectors)")?;
        let column_iter = stmt.query_map([], |row| {
            let column_name: String = row.get(1)?;
            Ok(column_name)
        })?;

        for column_result in column_iter {
            let column_name = column_result?;
            match column_name.as_str() {
                "file_path" => has_file_path = true,
                "chunk_index" => has_chunk_index = true,
                "total_chunks" => has_total_chunks = true,
                _ => {}
            }
        }

        // Add missing columns if they don't exist
        if !has_file_path {
            conn.execute("ALTER TABLE vectors ADD COLUMN file_path TEXT", [])?;
        }
        if !has_chunk_index {
            conn.execute("ALTER TABLE vectors ADD COLUMN chunk_index INTEGER", [])?;
        }
        if !has_total_chunks {
            conn.execute("ALTER TABLE vectors ADD COLUMN total_chunks INTEGER", [])?;
        }

        // Create index for faster similarity searches
        conn.execute(
            "CREATE INDEX IF NOT EXISTS idx_model_provider ON vectors(model, provider)",
            [],
        )?;

        // Create index for file-based searches (only after ensuring the column exists)
        conn.execute(
            "CREATE INDEX IF NOT EXISTS idx_file_path ON vectors(file_path)",
            [],
        )?;

        Ok(())
    }

    pub fn add_vector(
        &self,
        text: &str,
        vector: &[f64],
        model: &str,
        provider: &str,
    ) -> Result<i64> {
        self.add_vector_with_metadata(text, vector, model, provider, None, None, None)
    }

    pub fn add_vector_with_metadata(
        &self,
        text: &str,
        vector: &[f64],
        model: &str,
        provider: &str,
        file_path: Option<&str>,
        chunk_index: Option<i32>,
        total_chunks: Option<i32>,
    ) -> Result<i64> {
        let conn = Connection::open(&self.db_path)?;

        // Serialize vector as JSON for storage
        let vector_json = serde_json::to_string(vector)?;
        let created_at = chrono::Utc::now().to_rfc3339();

        conn.execute(
            "INSERT INTO vectors (text, vector, model, provider, created_at, file_path, chunk_index, total_chunks) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
            params![text, vector_json, model, provider, created_at, file_path, chunk_index, total_chunks],
        )?;

        let id = conn.last_insert_rowid();

        // Create vector entry for cache
        let vector_entry = VectorEntry {
            id,
            text: text.to_string(),
            vector: vector.to_vec(),
            model: model.to_string(),
            provider: provider.to_string(),
            created_at: chrono::Utc::now(),
            file_path: file_path.map(|s| s.to_string()),
            chunk_index,
            total_chunks,
        };

        // Add to cache
        self.vector_cache.insert(id, vector_entry);

        // Mark index as dirty for rebuilding
        *self.index_dirty.write() = true;

        Ok(id)
    }

    pub fn get_all_vectors(&self) -> Result<Vec<VectorEntry>> {
        let conn = Connection::open(&self.db_path)?;

        let mut stmt = conn.prepare(
            "SELECT id, text, vector, model, provider, created_at, file_path, chunk_index, total_chunks FROM vectors ORDER BY created_at DESC"
        )?;

        let vector_iter = stmt.query_map([], |row| {
            let vector_json: String = row.get(2)?;
            let vector: Vec<f64> = serde_json::from_str(&vector_json).map_err(|_e| {
                rusqlite::Error::InvalidColumnType(
                    2,
                    "vector".to_string(),
                    rusqlite::types::Type::Text,
                )
            })?;

            let created_at_str: String = row.get(5)?;
            let created_at = chrono::DateTime::parse_from_rfc3339(&created_at_str)
                .map_err(|_| {
                    rusqlite::Error::InvalidColumnType(
                        5,
                        "created_at".to_string(),
                        rusqlite::types::Type::Text,
                    )
                })?
                .with_timezone(&chrono::Utc);

            Ok(VectorEntry {
                id: row.get(0)?,
                text: row.get(1)?,
                vector,
                model: row.get(3)?,
                provider: row.get(4)?,
                created_at,
                // These columns are nullable; reading them as Option<_> maps
                // NULL to None while still surfacing genuine type errors
                // (the previous `.ok()` silently swallowed both).
                file_path: row.get(6)?,
                chunk_index: row.get(7)?,
                total_chunks: row.get(8)?,
            })
        })?;

        let mut vectors = Vec::new();
        for vector in vector_iter {
            vectors.push(vector?);
        }

        Ok(vectors)
    }

    pub fn get_model_info(&self) -> Result<Option<(String, String)>> {
        let conn = Connection::open(&self.db_path)?;

        let mut stmt = conn.prepare("SELECT model, provider FROM vectors LIMIT 1")?;

        let mut rows = stmt.query_map([], |row| {
            Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
        })?;

        if let Some(row) = rows.next() {
            Ok(Some(row?))
        } else {
            Ok(None)
        }
    }

    pub fn find_similar(
        &self,
        query_vector: &[f64],
        limit: usize,
    ) -> Result<Vec<(VectorEntry, f64)>> {
        // Ensure HNSW index is built
        self.ensure_index_built()?;

        // Try to use HNSW index for fast approximate search
        if let Some(index) = self.hnsw_index.read().as_ref() {
            // Check dimension compatibility before using HNSW
            if !self.vector_cache.is_empty() {
                let first_entry = self.vector_cache.iter().next();
                if let Some(entry) = first_entry {
                    let stored_dimension = entry.vector.len();
                    if query_vector.len() != stored_dimension {
                        crate::debug_log!(
                            "Dimension mismatch: query={}, stored={}, falling back to linear search",
                            query_vector.len(),
                            stored_dimension
                        );
                        return self.find_similar_linear_optimized(query_vector, limit);
                    }
                }
            }

            // Request more results from HNSW to account for potential cache misses
            let hnsw_limit = std::cmp::min(limit * 2, self.vector_cache.len());
            let search_results = index.search(query_vector, hnsw_limit, 50); // Higher ef for better recall

            let mut results = Vec::with_capacity(limit);
            for neighbor in search_results {
                if let Some(entry) = self.vector_cache.get(&(neighbor.d_id as i64)) {
                    // Convert distance to similarity (cosine distance -> cosine similarity)
                    let similarity = 1.0 - neighbor.distance as f64;
                    results.push((entry.value().clone(), similarity));

                    // Stop once we have enough results
                    if results.len() >= limit {
                        break;
                    }
                }
            }

            // If HNSW didn't return enough results, fall back to linear search
            if results.len() < limit && results.len() < self.vector_cache.len() {
                crate::debug_log!(
                    "HNSW returned only {} results, falling back to linear search",
                    results.len()
                );
                return self.find_similar_linear_optimized(query_vector, limit);
            }

            return Ok(results);
        }

        // Fallback to optimized linear search with parallel processing
        self.find_similar_linear_optimized(query_vector, limit)
    }

    /// Optimized linear search: similarities are computed in parallel with
    /// rayon, using the manually unrolled cosine routine below.
    fn find_similar_linear_optimized(
        &self,
        query_vector: &[f64],
        limit: usize,
    ) -> Result<Vec<(VectorEntry, f64)>> {
        // Get all vectors from cache or database
        let vectors = if self.vector_cache.is_empty() {
            self.get_all_vectors()?
        } else {
            self.vector_cache
                .iter()
                .map(|entry| entry.value().clone())
                .collect::<Vec<_>>()
        };

        // Use parallel processing for similarity calculations
        let mut similarities: Vec<(VectorEntry, f64)> = vectors
            .into_par_iter()
            .map(|vector_entry| {
                let similarity = cosine_similarity_simd(query_vector, &vector_entry.vector);
                (vector_entry, similarity)
            })
            .collect();

        // Use a partial sort (select_nth) for better performance when limit << total_vectors
        if limit < similarities.len() {
            similarities.select_nth_unstable_by(limit, |a, b| {
                b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)
            });
            similarities[..limit]
                .sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
            similarities.truncate(limit);
        } else {
            similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
        }

        Ok(similarities)
    }

    /// Ensure HNSW index is built and up-to-date
    fn ensure_index_built(&self) -> Result<()> {
        let index_dirty = *self.index_dirty.read();

        if index_dirty || self.hnsw_index.read().is_none() {
            self.rebuild_index()?;
        }

        Ok(())
    }

    /// Rebuild the HNSW index from all vectors
    fn rebuild_index(&self) -> Result<()> {
        crate::debug_log!("Rebuilding HNSW index...");

        // Load all vectors if cache is empty
        if self.vector_cache.is_empty() {
            let vectors = self.get_all_vectors()?;
            for vector in vectors {
                self.vector_cache.insert(vector.id, vector);
            }
        }

        if self.vector_cache.is_empty() {
            return Ok(());
        }

        // Create a new HNSW index. hnsw_rs infers the vector dimension from
        // the inserted data; the constructor parameters are (max connections
        // per node, expected element count, max layers, ef_construction,
        // distance). The element-count slot previously received the vector
        // dimension by mistake.
        let hnsw = Hnsw::new(16, self.vector_cache.len(), 16, 200, DistCosine {});

        // Add all vectors to index
        for entry in self.vector_cache.iter() {
            let vector_entry = entry.value();
            hnsw.insert((&vector_entry.vector, vector_entry.id as usize));
        }

        // Update the index
        *self.hnsw_index.write() = Some(hnsw);
        *self.index_dirty.write() = false;

        crate::debug_log!(
            "HNSW index rebuilt with {} vectors",
            self.vector_cache.len()
        );

        Ok(())
    }

    pub fn count(&self) -> Result<usize> {
        let conn = Connection::open(&self.db_path)?;

        let count: i64 = conn.query_row("SELECT COUNT(*) FROM vectors", [], |row| row.get(0))?;

        Ok(count as usize)
    }
}

// Optimized cosine similarity calculation with manual vectorization
pub fn cosine_similarity_simd(a: &[f64], b: &[f64]) -> f64 {
    if a.len() != b.len() {
        crate::debug_log!(
            "Vector dimension mismatch: query={}, stored={}",
            a.len(),
            b.len()
        );
        return 0.0;
    }

    if a.is_empty() {
        return 0.0;
    }

    // Use chunked processing for better cache performance
    let mut dot_product = 0.0f64;
    let mut norm_a_sq = 0.0f64;
    let mut norm_b_sq = 0.0f64;

    // Process in chunks of 4 for better performance
    let chunk_size = 4;
    let chunks = a.len() / chunk_size;

    for i in 0..chunks {
        let start = i * chunk_size;
        let end = start + chunk_size;

        for j in start..end {
            let av = a[j];
            let bv = b[j];
            dot_product += av * bv;
            norm_a_sq += av * av;
            norm_b_sq += bv * bv;
        }
    }

    // Process remaining elements
    for i in (chunks * chunk_size)..a.len() {
        let av = a[i];
        let bv = b[i];
        dot_product += av * bv;
        norm_a_sq += av * av;
        norm_b_sq += bv * bv;
    }

    let norm_a = norm_a_sq.sqrt();
    let norm_b = norm_b_sq.sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot_product / (norm_a * norm_b)
}

// File processing utilities
pub struct FileProcessor;

impl FileProcessor {
    /// Check if a file is likely to be a text file based on extension and content
    pub fn is_text_file(path: &std::path::Path) -> bool {
        // Check extension first
        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
            let ext = ext.to_lowercase();
            match ext.as_str() {
                // Text files
                "txt" | "md" | "markdown" | "rst" | "org" | "tex" | "rtf" => true,
                // Code files
                "rs" | "py" | "js" | "ts" | "java" | "cpp" | "c" | "h" | "hpp" | "go" | "rb"
                | "php" | "swift" | "kt" | "scala" | "sh" | "bash" | "zsh" | "fish" | "ps1"
                | "bat" | "cmd" | "html" | "css" | "scss" | "sass" | "less" | "xml" | "json"
                | "yaml" | "yml" | "toml" | "ini" | "cfg" | "conf" | "sql" | "r" | "m" | "mm"
                | "pl" | "pm" | "lua" | "vim" | "dockerfile" | "makefile" | "cmake" | "gradle" => {
                    true
                }
                // Log files
                "log" | "out" | "err" => true,
                // Binary files to exclude
                "exe" | "dll" | "so" | "dylib" | "bin" | "obj" | "o" | "a" | "lib" | "zip"
                | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" | "pdf" | "doc" | "docx" | "xls"
                | "xlsx" | "ppt" | "pptx" | "jpg" | "jpeg" | "png" | "gif" | "bmp" | "tiff"
                | "svg" | "ico" | "mp3" | "mp4" | "avi" | "mov" | "wmv" | "flv" | "mkv" | "wav"
                | "flac" | "ogg" => false,
                // Unknown extensions are treated as non-text. (Reaching this
                // arm means the file name necessarily contains a '.', so the
                // previous "has no extension" check here could never succeed.)
                _ => false,
            }
        } else {
            // No extension - might be a text file, check content
            Self::is_text_content(path).unwrap_or(false)
        }
    }

    /// Check if file content appears to be text by sampling first few bytes
    fn is_text_content(path: &std::path::Path) -> Result<bool> {
        use std::fs::File;
        use std::io::Read;

        let mut file = File::open(path)?;
        let mut buffer = [0; 512]; // Sample first 512 bytes
        let bytes_read = file.read(&mut buffer)?;

        if bytes_read == 0 {
            return Ok(true); // Empty file is considered text
        }

        // Check for null bytes (strong indicator of binary content)
        let null_count = buffer[..bytes_read].iter().filter(|&&b| b == 0).count();
        if null_count > 0 {
            return Ok(false);
        }

        // Check for a high ratio of printable ASCII characters
        // (tab, LF, and CR count as printable)
        let printable_count = buffer[..bytes_read]
            .iter()
            .filter(|&&b| (32..=126).contains(&b) || b == b'\t' || b == b'\n' || b == b'\r')
            .count();

        let printable_ratio = printable_count as f64 / bytes_read as f64;
        Ok(printable_ratio > 0.7) // At least 70% printable characters
    }

    /// Expand glob patterns and filter for text files
    pub fn expand_file_patterns(patterns: &[String]) -> Result<Vec<std::path::PathBuf>> {
        use glob::glob;

        let mut files = Vec::new();

        for pattern in patterns {
            crate::debug_log!("Processing file pattern: {}", pattern);

            match glob(pattern) {
                Ok(paths) => {
                    for path_result in paths {
                        match path_result {
                            Ok(path) => {
                                if path.is_file() && Self::is_text_file(&path) {
                                    crate::debug_log!("Adding text file: {}", path.display());
                                    files.push(path);
                                } else if path.is_file() {
                                    crate::debug_log!("Skipping non-text file: {}", path.display());
                                } else {
                                    crate::debug_log!("Skipping non-file: {}", path.display());
                                }
                            }
                            Err(e) => {
                                eprintln!(
                                    "Warning: Error processing path in pattern '{}': {}",
                                    pattern, e
                                );
                            }
                        }
                    }
                }
                Err(e) => {
                    eprintln!("Warning: Invalid glob pattern '{}': {}", pattern, e);
                }
            }
        }

        files.sort();
        files.dedup();
        Ok(files)
    }

    /// Split text into chunks with overlap for better context preservation
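    ///
    /// Boundary preference (matching the implementation below): a chunk that
    /// does not reach the end of the text is trimmed back to the last ". ",
    /// else the last paragraph break ("\n\n"), else the last newline inside
    /// the window.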
    pub fn chunk_text(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
        crate::debug_log!(
            "Chunking text: {} chars, chunk_size: {}, overlap: {}",
            text.len(),
            chunk_size,
            overlap
        );

        if text.len() <= chunk_size {
            crate::debug_log!("Text is smaller than chunk size, returning single chunk");
            return vec![text.to_string()];
        }

        let mut chunks = Vec::new();
        let mut start = 0;
        let mut iteration = 0;

        while start < text.len() {
            iteration += 1;
            crate::debug_log!(
                "Chunk iteration {}: start={}, text.len()={}",
                iteration,
                start,
                text.len()
            );

            // Clamp the window end back to a char boundary so the slicing
            // below cannot panic on multi-byte UTF-8 sequences.
            let mut end = std::cmp::min(start + chunk_size, text.len());
            while !text.is_char_boundary(end) {
                end -= 1;
            }
            let mut chunk_end = end;

            // Try to break at sentence boundary
            if end < text.len() {
                if let Some(sentence_end) = text[start..end].rfind(". ") {
                    chunk_end = start + sentence_end + 1;
                } else if let Some(para_end) = text[start..end].rfind("\n\n") {
                    chunk_end = start + para_end + 1;
                } else if let Some(line_end) = text[start..end].rfind('\n') {
                    chunk_end = start + line_end + 1;
                }
            }

            let chunk = text[start..chunk_end].trim().to_string();
            if !chunk.is_empty() {
                let chunk_len = chunk.len();
                chunks.push(chunk);
                crate::debug_log!("Added chunk {}: {} chars", chunks.len(), chunk_len);
            }

            // Move start position with overlap
            if chunk_end >= text.len() {
                crate::debug_log!("Reached end of text, breaking");
                break;
            }

            // Step the overlap offset back to a char boundary as well.
            let mut new_start = if chunk_end > overlap {
                chunk_end - overlap
            } else {
                chunk_end
            };
            while new_start > 0 && !text.is_char_boundary(new_start) {
                new_start -= 1;
            }

            // Ensure we're making progress - if the new start is not greater
            // than the current start, move forward by at least one character
            // to prevent infinite loops.
            if new_start <= start {
                start += 1;
                while start < text.len() && !text.is_char_boundary(start) {
                    start += 1;
                }
                crate::debug_log!(
                    "Preventing infinite loop: moving start from {} to {}",
                    new_start,
                    start
                );
            } else {
                start = new_start;
            }

            crate::debug_log!("Next start position: {}", start);

            // Safety check to prevent infinite loop
            if iteration > 1000 {
                crate::debug_log!(
                    "WARNING: Too many iterations, breaking to prevent infinite loop"
                );
                break;
            }
        }

        crate::debug_log!("Chunking complete: {} chunks created", chunks.len());
        chunks
    }

    /// Read and chunk a file (synchronous version for compatibility)
    pub fn process_file(path: &std::path::Path) -> Result<Vec<String>> {
        // Use the async version when a Tokio runtime is available. Calling
        // Handle::block_on directly would panic inside an async execution
        // context, so wrap it in block_in_place (this assumes the
        // multi-threaded runtime).
        if let Ok(handle) = tokio::runtime::Handle::try_current() {
            tokio::task::block_in_place(|| handle.block_on(Self::process_file_async(path)))
        } else {
            // Fallback to synchronous implementation for tests and non-async contexts
            crate::debug_log!("Reading file synchronously: {}", path.display());
            let content = std::fs::read_to_string(path)?;
            crate::debug_log!("File content length: {} characters", content.len());

            // Use 1200 character chunks with 200 character overlap
            crate::debug_log!("Starting text chunking with 1200 char chunks, 200 char overlap");
            let chunks = Self::chunk_text(&content, 1200, 200);

            crate::debug_log!(
                "File '{}' split into {} chunks",
                path.display(),
                chunks.len()
            );

            Ok(chunks)
        }
    }

    /// Async version of process_file with memory mapping optimization
    pub async fn process_file_async(path: &std::path::Path) -> Result<Vec<String>> {
        crate::debug_log!("Reading file: {}", path.display());

        let content = Self::read_file_optimized(path).await?;
        crate::debug_log!("File content length: {} characters", content.len());

        // Use 1200 character chunks with 200 character overlap
        crate::debug_log!("Starting text chunking with 1200 char chunks, 200 char overlap");
        let chunks = Self::chunk_text(&content, 1200, 200);

        crate::debug_log!(
            "File '{}' split into {} chunks",
            path.display(),
            chunks.len()
        );

        Ok(chunks)
    }

    /// Optimized file reading with memory mapping for large files
    async fn read_file_optimized(path: &std::path::Path) -> Result<String> {
        let metadata = tokio::fs::metadata(path).await?;
        let file_size = metadata.len();

        // Use memory mapping for large files (>1MB)
        if file_size > 1_048_576 {
            crate::debug_log!("Using memory mapping for large file: {} bytes", file_size);

            let file = std::fs::File::open(path)?;
            let mmap = unsafe { memmap2::Mmap::map(&file)? };

            // Convert bytes to string in a separate task to avoid blocking
            let content = tokio::task::spawn_blocking(move || {
                std::str::from_utf8(&mmap)
                    .map_err(|e| anyhow::anyhow!("Invalid UTF-8 in file: {}", e))
                    .map(|s| s.to_string())
            })
            .await??;

            Ok(content)
        } else {
            // Use async file reading for smaller files
            crate::debug_log!(
                "Using async file reading for small file: {} bytes",
                file_size
            );
            Ok(tokio::fs::read_to_string(path).await?)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cosine_similarity() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0, 3.0];
        assert!((cosine_similarity_simd(&a, &b) - 1.0).abs() < 1e-10);

        let a = vec![1.0, 0.0];
        let b = vec![0.0, 1.0];
        assert!((cosine_similarity_simd(&a, &b) - 0.0).abs() < 1e-10);
    }
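
    // Added checks, grounded in the implementation above: mismatched
    // dimensions and zero vectors both fall back to a similarity of 0.0,
    // and opposite vectors are maximally dissimilar.
    #[test]
    fn test_cosine_similarity_edge_cases() {
        let a = vec![1.0, 2.0, 3.0];
        let short = vec![1.0, 2.0];
        assert_eq!(cosine_similarity_simd(&a, &short), 0.0);

        let zero = vec![0.0, 0.0, 0.0];
        assert_eq!(cosine_similarity_simd(&a, &zero), 0.0);

        let neg: Vec<f64> = a.iter().map(|x| -x).collect();
        assert!((cosine_similarity_simd(&a, &neg) + 1.0).abs() < 1e-10);
    }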

    #[test]
    fn test_chunk_text() {
        let text = "This is sentence one. This is sentence two. This is sentence three.";
        let chunks = FileProcessor::chunk_text(text, 30, 10);

        assert!(chunks.len() > 1);
        assert!(chunks[0].contains("sentence one"));
    }
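
    // Added check: the progress guard in chunk_text must terminate even when
    // overlap >= chunk_size, which would otherwise loop forever.
    #[test]
    fn test_chunk_text_makes_progress() {
        let text = "abcdefghij".repeat(20);
        let chunks = FileProcessor::chunk_text(&text, 10, 10);
        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| !c.is_empty()));
    }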

    #[test]
    fn test_is_text_file() {
        use std::path::Path;

        assert!(FileProcessor::is_text_file(Path::new("test.txt")));
        assert!(FileProcessor::is_text_file(Path::new("test.rs")));
        assert!(FileProcessor::is_text_file(Path::new("test.py")));
        assert!(!FileProcessor::is_text_file(Path::new("test.exe")));
        assert!(!FileProcessor::is_text_file(Path::new("test.jpg")));
    }
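
    // A hedged addition: exercises list_databases_in_dir against a throwaway
    // directory under the system temp dir (the directory name is arbitrary).
    #[test]
    fn test_list_databases_in_dir() {
        let dir = std::env::temp_dir().join("lc_vector_db_list_test");
        std::fs::create_dir_all(&dir).unwrap();
        std::fs::write(dir.join("alpha.db"), b"").unwrap();
        std::fs::write(dir.join("notes.txt"), b"").unwrap();

        // Only the stems of *.db files should be reported.
        let dbs = VectorDatabase::list_databases_in_dir(&dir).unwrap();
        assert!(dbs.contains(&"alpha".to_string()));
        assert!(!dbs.contains(&"notes".to_string()));

        let _ = std::fs::remove_dir_all(&dir);
    }
}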
831}