maproom 0.1.0

Semantic code search powered by embeddings and SQLite
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
//! Git tree SHA-based incremental update algorithm.
//!
//! This module implements the core incremental indexing optimization:
//! compare current git tree SHA against last indexed SHA, and if different,
//! use git diff-tree to find changed files and process only those.
//!
//! This provides a 5-10x performance improvement over full repository scans
//! by avoiding re-indexing unchanged code.
//!
//! # Algorithm Flow
//!
//! 1. Get current git tree SHA via `get_git_tree_sha()`
//! 2. Retrieve last indexed tree SHA from `worktree_index_state` table
//! 3. **Quick check**: If tree SHAs match, skip processing (0 changes)
//! 4. If different, use `git diff-tree` to find Added/Modified/Deleted files
//! 5. Process only changed files (not entire repository)
//! 6. Update `worktree_index_state` with new tree SHA and metrics
//!
//! # Example
//!
//! ```ignore
//! use maproom::incremental::incremental_update;
//! use maproom::db::SqliteStore;
//! use std::path::Path;
//!
//! # async fn example(store: &SqliteStore) -> anyhow::Result<()> {
//! let worktree_id = 1;
//! let repo_path = Path::new("/workspace");
//!
//! let stats = incremental_update(store, worktree_id, repo_path).await?;
//!
//! println!("Processed {} files, {} chunks",
//!     stats.files_processed, stats.chunks_processed);
//!
//! if stats.files_processed == 0 {
//!     println!("No changes detected (tree SHA unchanged)");
//! }
//! # Ok(())
//! # }
//! ```

use crate::db::index_state::{get_last_indexed_tree, UpdateStats};
use crate::db::SqliteStore;
use crate::git::{get_git_tree_sha, git_diff_tree, FileStatus};
use anyhow::{Context, Result};
use std::path::Path;
use tracing::{debug, info};

impl UpdateStats {
    /// Construct an `UpdateStats` with every counter zeroed.
    pub fn new() -> Self {
        Self {
            files_processed: 0,
            chunks_processed: 0,
            embeddings_generated: 0,
        }
    }

    /// Stats for an update that was skipped because no changes were detected.
    ///
    /// Identical to [`UpdateStats::new`]; the separate name documents intent
    /// at call sites.
    pub fn skipped() -> Self {
        Self::new()
    }

    /// Fraction of processed chunks that did NOT require a new embedding.
    ///
    /// Defined as `1 - embeddings_generated / chunks_processed`. When no
    /// chunks were processed at all, the rate is reported as 1.0 (100%).
    pub fn cache_hit_rate(&self) -> f64 {
        match self.chunks_processed {
            0 => 1.0,
            total => 1.0 - self.embeddings_generated as f64 / total as f64,
        }
    }

    /// Estimated embedding API cost in dollars.
    ///
    /// Priced at $0.00002 per generated embedding (typical for
    /// text-embedding-ada-002).
    pub fn cost(&self) -> f64 {
        const COST_PER_EMBEDDING_USD: f64 = 0.00002;
        self.embeddings_generated as f64 * COST_PER_EMBEDDING_USD
    }
}

impl Default for UpdateStats {
    fn default() -> Self {
        Self::new()
    }
}

/// Remove a worktree from chunks when files are deleted.
///
/// This function:
/// 1. Removes `worktree_id` from the `worktree_ids` JSONB array for all chunks in `relpath`
/// 2. Deletes chunks that have empty `worktree_ids` arrays (garbage collection)
///
/// # Arguments
///
/// * `client` - Database client
/// * `worktree_id` - ID of the worktree to remove
/// * `relpath` - Relative path of the deleted file
///
/// # Returns
///
/// * `Ok(affected)` - Number of chunks affected (had worktree removed)
/// * `Err` - Database errors
///
/// # Idempotency
///
/// Calling this function multiple times with the same arguments is safe (no-op if worktree already removed).
///
/// # Example
///
/// ```no_run
/// # use maproom::incremental::remove_worktree_from_chunks;
/// # use maproom::db;
/// # async fn example() -> anyhow::Result<()> {
/// let client = db::connect().await?;
/// let affected = remove_worktree_from_chunks(&client, 1, "src/deleted.rs").await?;
/// println!("Removed worktree from {} chunks", affected);
/// # Ok(())
/// # }
/// ```
pub async fn remove_worktree_from_chunks(
    store: &SqliteStore,
    worktree_id: i64,
    relpath: &str,
) -> Result<i64> {
    let relpath = relpath.to_string();

    store
        .run(move |conn| {
            // 1. Find chunks in this file that have the worktree
            let chunk_ids: Vec<i64> = {
                let mut stmt = conn.prepare(
                    "SELECT c.id FROM chunks c
                 JOIN files f ON c.file_id = f.id
                 JOIN chunk_worktrees cw ON cw.chunk_id = c.id
                 WHERE f.relpath = ?1 AND cw.worktree_id = ?2",
                )?;
                let rows =
                    stmt.query_map(rusqlite::params![relpath, worktree_id], |row| row.get(0))?;
                rows.filter_map(|r| r.ok()).collect()
            };

            if chunk_ids.is_empty() {
                return Ok(0);
            }

            // 2. Remove worktree entries for these chunks
            let placeholders: String = chunk_ids.iter().map(|_| "?").collect::<Vec<_>>().join(",");
            let sql = format!(
                "DELETE FROM chunk_worktrees WHERE chunk_id IN ({}) AND worktree_id = ?",
                placeholders
            );

            let mut params: Vec<Box<dyn rusqlite::ToSql>> = chunk_ids
                .iter()
                .map(|id| Box::new(*id) as Box<dyn rusqlite::ToSql>)
                .collect();
            params.push(Box::new(worktree_id));

            // Use execute with proper parameter handling
            let affected = conn.execute(
                &sql,
                rusqlite::params_from_iter(chunk_ids.iter().chain(std::iter::once(&worktree_id))),
            )?;

            // 3. Clean up orphaned chunks (chunks with no worktrees)
            conn.execute(
            "DELETE FROM chunks WHERE id NOT IN (SELECT DISTINCT chunk_id FROM chunk_worktrees)",
            []
        )?;

            Ok(affected as i64)
        })
        .await
}

/// Perform incremental update based on git tree SHA comparison.
///
/// This is the core function that implements the 5-10x performance improvement
/// from BRANCHX. It compares the current git tree SHA against the last indexed
/// SHA, and if different, processes only the changed files.
///
/// # Algorithm
///
/// 1. Get current git tree SHA (O(1) git operation)
/// 2. Get last indexed tree SHA from database
/// 3. If identical, return immediately with zero stats (common case after indexing)
/// 4. If different, use `git diff-tree` to find changed files
/// 5. Process only Added/Modified files (Deleted handled in BRANCHX-1009)
/// 6. Update index state with new tree SHA
///
/// # Arguments
///
/// * `store` - SQLite store used for state lookups and updates
/// * `worktree_id` - ID of the worktree being updated
/// * `repo_path` - Path to the git repository root
///
/// # Returns
///
/// * `Ok(UpdateStats)` - Metrics about files/chunks/embeddings processed
/// * `Err` - Git errors, database errors, or processing errors
///
/// # Performance
///
/// - No changes: ~5-10ms (just git tree SHA + database query)
/// - Small changes: 100ms - 1s (process only changed files)
/// - Full repository: 10s - 5min (rare, only on first index or git reset)
///
/// # Example
///
/// ```no_run
/// # use maproom::incremental::incremental_update;
/// # use maproom::db::SqliteStore;
/// # use std::path::Path;
/// # async fn example(store: &SqliteStore) -> anyhow::Result<()> {
/// let stats = incremental_update(store, 1, Path::new("/workspace")).await?;
///
/// println!("Files: {}, Chunks: {}, Cost: ${:.4}",
///     stats.files_processed,
///     stats.chunks_processed,
///     stats.cost());
/// # Ok(())
/// # }
/// ```
pub async fn incremental_update(
    store: &SqliteStore,
    worktree_id: i64,
    repo_path: &Path,
) -> Result<UpdateStats> {
    // Step 1: current git tree SHA — a cheap O(1) git operation.
    let current_tree_sha = get_git_tree_sha(repo_path)
        .with_context(|| format!("Failed to get git tree SHA for {:?}", repo_path))?;

    debug!(
        worktree_id = worktree_id,
        tree_sha = %current_tree_sha,
        "Got current git tree SHA"
    );

    // Step 2: last indexed tree SHA. The sentinel "init" means no previous
    // index state exists for this worktree.
    let previous_sha = get_last_indexed_tree(store, worktree_id)
        .await
        .with_context(|| {
            format!(
                "Failed to get last indexed tree for worktree {}",
                worktree_id
            )
        })?;
    let has_history = previous_sha != "init";

    // Step 3: quick path — nothing changed since the last index run.
    if has_history && previous_sha == current_tree_sha {
        info!(
            worktree_id = worktree_id,
            tree_sha = %current_tree_sha,
            "Tree SHA unchanged, skipping incremental update"
        );
        return Ok(UpdateStats::skipped());
    }

    // Step 4: compute the changed-file list via git diff-tree when we have a
    // previous SHA; otherwise the first full index is handled elsewhere and
    // we report an empty diff.
    let changes = if has_history {
        debug!(
            worktree_id = worktree_id,
            last_sha = %previous_sha,
            current_sha = %current_tree_sha,
            "Tree SHA changed, processing diff"
        );
        git_diff_tree(&previous_sha, &current_tree_sha, repo_path).with_context(|| {
            format!(
                "Failed to get diff-tree between {} and {}",
                previous_sha, current_tree_sha
            )
        })?
    } else {
        debug!(
            worktree_id = worktree_id,
            "No previous tree SHA found, this is likely first index"
        );
        debug!("No previous tree SHA, returning empty diff (full index handled separately)");
        Vec::new()
    };

    let mut stats = UpdateStats::new();
    stats.files_processed = changes.len() as i32;

    // Step 5: dispatch on each change's status. Added/Modified files are
    // processed by the upsert command elsewhere; this function only tracks
    // them. Deleted files have their worktree association removed here.
    for change in &changes {
        let relpath = change.path.to_string_lossy();
        match change.status {
            FileStatus::Added | FileStatus::Modified => {
                debug!(
                    file = %relpath,
                    status = ?change.status,
                    "File needs processing"
                );
            }
            FileStatus::Deleted => {
                debug!(
                    file = %relpath,
                    "File deleted, removing chunks"
                );
                let affected = remove_worktree_from_chunks(store, worktree_id, &relpath).await?;
                debug!(
                    file = %relpath,
                    chunks_affected = affected,
                    "Removed worktree from chunks"
                );
            }
        }
    }

    // Step 6: persist the new tree SHA so the next run can take the quick
    // path. Done only after the per-file handling above succeeded.
    let tree_sha = current_tree_sha.clone();
    store
        .run(move |conn| {
            conn.execute(
                "INSERT OR REPLACE INTO index_state (worktree_id, tree_sha, last_indexed)
                 VALUES (?1, ?2, datetime('now'))",
                rusqlite::params![worktree_id, tree_sha],
            )?;
            Ok(())
        })
        .await
        .with_context(|| format!("Failed to update index state for worktree {}", worktree_id))?;

    info!(
        worktree_id = worktree_id,
        files_processed = stats.files_processed,
        tree_sha = %current_tree_sha,
        "Incremental update complete"
    );

    Ok(stats)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every counter in `stats` is zero.
    fn assert_all_zero(stats: &UpdateStats) {
        assert_eq!(stats.files_processed, 0);
        assert_eq!(stats.chunks_processed, 0);
        assert_eq!(stats.embeddings_generated, 0);
    }

    #[test]
    fn test_update_stats_new() {
        assert_all_zero(&UpdateStats::new());
    }

    #[test]
    fn test_update_stats_skipped() {
        assert_all_zero(&UpdateStats::skipped());
    }

    #[test]
    fn test_cache_hit_rate_no_chunks() {
        // No chunks processed is reported as a perfect hit rate.
        assert_eq!(UpdateStats::new().cache_hit_rate(), 1.0);
    }

    #[test]
    fn test_cache_hit_rate_all_cached() {
        let stats = UpdateStats {
            files_processed: 10,
            chunks_processed: 100,
            embeddings_generated: 0,
        };
        assert_eq!(stats.cache_hit_rate(), 1.0);
    }

    #[test]
    fn test_cache_hit_rate_partial() {
        let stats = UpdateStats {
            files_processed: 10,
            chunks_processed: 100,
            embeddings_generated: 50,
        };
        // Half the chunks needed fresh embeddings -> 50% hit rate.
        assert_eq!(stats.cache_hit_rate(), 0.5);
    }

    #[test]
    fn test_cost_calculation() {
        let stats = UpdateStats {
            files_processed: 10,
            chunks_processed: 100,
            embeddings_generated: 1000,
        };
        // 1000 embeddings * $0.00002 each = $0.02.
        assert_eq!(stats.cost(), 0.02);
    }

    #[test]
    fn test_default() {
        assert_all_zero(&UpdateStats::default());
    }

    // Note: remove_worktree_from_chunks requires database integration tests
    // See tests/incremental_deletions.rs for comprehensive testing
}