colgrep 1.5.0

Semantic code search powered by ColBERT
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
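//! Centralized index storage paths, rooted in the platform data directory
//! (the XDG Base Directory Specification location on Linux).
//!
//! Default index storage location:
//! - Linux: ~/.local/share/colgrep/indices/
//! - macOS: ~/Library/Application Support/colgrep/indices/
//! - Windows: C:\Users\{user}\AppData\Roaming\colgrep\indices\
//!
//! Setting the `COLGREP_DATA_DIR` environment variable to a non-empty value
//! overrides this location; the value is used as the indices directory as-is.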

use std::fs::{self, File};
use std::path::{Path, PathBuf};

use anyhow::{Context, Result};
use fs2::FileExt;
use serde::{Deserialize, Serialize};
use xxhash_rust::xxh3::xxh3_64;

/// File name returned by `get_state_path`, resolved inside an index directory.
const STATE_FILE: &str = "state.json";
/// File name of the serialized `ProjectMetadata` inside an index directory.
const PROJECT_FILE: &str = "project.json";
/// Subdirectory of an index directory that holds the vector index
/// (see `get_vector_index_path`); `find_index_for_project` looks for
/// `metadata.json` inside it.
const INDEX_SUBDIR: &str = "index";
/// File name of the lock file used by the index locking helpers.
const LOCK_FILE: &str = ".lock";

/// Metadata about the project stored alongside the index
///
/// Serialized as JSON to `project.json` inside the index directory
/// (see `ProjectMetadata::save` / `ProjectMetadata::load`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProjectMetadata {
    /// Canonical path to the project directory
    /// (stored as given to `ProjectMetadata::new`; no canonicalization happens
    /// in this type — presumably the caller canonicalizes, verify at call sites).
    pub project_path: PathBuf,
    /// Project name (directory name)
    pub project_name: String,
    /// Model id the index was built with (e.g., "lightonai/LateOn-Code-edge").
    /// Optional so pre-1.3 project.json files without this field still deserialize.
    /// Handling of such legacy entries (`None`) differs per lookup:
    /// `find_parent_index` skips them, while `find_index_for_project` accepts them.
    #[serde(default)]
    pub model: Option<String>,
}

impl ProjectMetadata {
    pub fn new(project_path: &Path, model: &str) -> Self {
        let project_name = project_path
            .file_name()
            .map(|n| n.to_string_lossy().to_string())
            .unwrap_or_else(|| "project".to_string());

        Self {
            project_path: project_path.to_path_buf(),
            project_name,
            model: Some(model.to_string()),
        }
    }

    pub fn load(index_dir: &Path) -> Result<Self> {
        let path = index_dir.join(PROJECT_FILE);
        let content = fs::read_to_string(&path)
            .with_context(|| format!("Failed to read {}", path.display()))?;
        Ok(serde_json::from_str(&content)?)
    }

    pub fn save(&self, index_dir: &Path) -> Result<()> {
        fs::create_dir_all(index_dir)?;
        let path = index_dir.join(PROJECT_FILE);
        let content = serde_json::to_string_pretty(self)?;
        fs::write(&path, content)?;
        Ok(())
    }
}

/// Get the directory under which all colgrep indexes are stored.
///
/// By default this is `<platform data dir>/colgrep/indices`, where the
/// platform data dir comes from `dirs::data_dir()`.
///
/// Honours the `COLGREP_DATA_DIR` env var so concurrent benchmark
/// processes can keep separate index caches without fighting over
/// `~/.local/share/colgrep/indices`. A non-empty value is returned as-is
/// (nothing is appended to it); an empty value is ignored.
///
/// # Errors
///
/// Fails only when no override is set and the platform data directory
/// cannot be determined.
pub fn get_colgrep_data_dir() -> Result<PathBuf> {
    // `var_os` rather than `var`: paths need not be valid UTF-8, and `var`
    // would report such a value as an error, silently discarding the override.
    if let Some(env_dir) = std::env::var_os("COLGREP_DATA_DIR") {
        if !env_dir.is_empty() {
            return Ok(PathBuf::from(env_dir));
        }
    }
    let data_dir = dirs::data_dir().context("Could not determine data directory")?;
    Ok(data_dir.join("colgrep").join("indices"))
}

/// Derive the on-disk directory name for a (project_path, model) pair.
///
/// The result has the shape `{sanitized project name}-{hash prefix}`:
/// - the hash is `xxh3_64` over the bytes of `path|model` (lossy UTF-8 path,
///   a literal `|`, then the model id), rendered as lowercase hex padded to
///   at least 8 digits, of which the first 8 characters are kept;
/// - the project name is the last path component (or `"project"` when there
///   is none) with every character other than alphanumerics, `-` and `_`
///   replaced by `_`.
///
/// Because the model participates in the hash, two models indexing the same
/// project get separate directories.
fn compute_index_dir_name(project_path: &Path, model: &str) -> String {
    let path_text = project_path.to_string_lossy();
    let key: Vec<u8> = path_text
        .bytes()
        .chain(std::iter::once(b'|'))
        .chain(model.bytes())
        .collect();

    // The formatted digest is ASCII and at least 8 characters long, so a byte
    // slice of the first 8 is always in bounds and on a char boundary.
    let digest = format!("{:08x}", xxh3_64(&key));
    let short_hash = &digest[..8];

    let mut dir_name = String::new();
    match project_path.file_name() {
        Some(component) => {
            for ch in component.to_string_lossy().chars() {
                // Keep filesystem-friendly characters; everything else becomes `_`.
                let keep = ch.is_alphanumeric() || matches!(ch, '-' | '_');
                dir_name.push(if keep { ch } else { '_' });
            }
        }
        None => dir_name.push_str("project"),
    }

    dir_name.push('-');
    dir_name.push_str(short_hash);
    dir_name
}

/// Resolve the index directory path for a (project_path, model) pair.
///
/// This only computes the path (`<data dir>/<computed dir name>`); it does
/// not touch the filesystem, so the directory may not exist yet.
///
/// # Errors
///
/// Propagates the failure of `get_colgrep_data_dir`.
pub fn get_index_dir_for_project(project_path: &Path, model: &str) -> Result<PathBuf> {
    get_colgrep_data_dir().map(|root| root.join(compute_index_dir_name(project_path, model)))
}

/// Look up an already-built index for a (project_path, model) pair.
///
/// Returns `Ok(None)` when the expected index directory has no
/// `index/metadata.json`, or when its `project.json` records the same project
/// path but a different model. Returns `Ok(Some(dir))` when:
/// - the stored project path matches and the stored model equals `model`;
/// - the stored project path matches and no model was recorded (legacy
///   index — the directory name already encodes the model);
/// - `project.json` is unreadable or records a different project path.
///
/// NOTE(review): the last case hands back a directory whose metadata does not
/// match the request (e.g. a directory-name hash collision). Kept as-is to
/// preserve existing behavior; confirm callers cope with it.
pub fn find_index_for_project(project_path: &Path, model: &str) -> Result<Option<PathBuf>> {
    let index_dir = get_index_dir_for_project(project_path, model)?;

    // No vector-index metadata on disk means there is no index at all.
    let vector_metadata = index_dir.join(INDEX_SUBDIR).join("metadata.json");
    if !vector_metadata.exists() {
        return Ok(None);
    }

    let found = match ProjectMetadata::load(&index_dir) {
        Ok(meta) if meta.project_path == project_path => match meta.model {
            // Legacy index without a recorded model: treated as a match.
            None => Some(index_dir),
            Some(recorded) if recorded == model => Some(index_dir),
            Some(_) => None,
        },
        // Unreadable metadata or a different project path: still report the
        // directory (see the review note in the doc comment).
        _ => Some(index_dir),
    };

    Ok(found)
}

/// Report whether `find_index_for_project` finds an index for the given
/// (project, model) pair. Lookup errors count as "no index".
pub fn index_exists(project_path: &Path, model: &str) -> bool {
    find_index_for_project(project_path, model)
        .ok()
        .flatten()
        .is_some()
}

/// Information about a discovered parent index
///
/// Produced by `find_parent_index` when the searched path lies strictly
/// inside a project that already has an index.
#[derive(Debug, Clone)]
pub struct ParentIndexInfo {
    /// Path to the parent project's index directory
    pub index_dir: PathBuf,
    /// The parent project's root path
    /// (as recorded in that index's `project.json`)
    pub project_path: PathBuf,
    /// Relative path from parent project root to the search directory
    /// (the search path with `project_path` stripped as a prefix)
    pub relative_subdir: PathBuf,
}

/// Search the stored indexes for a project that contains `search_path` as a
/// strict subdirectory and was indexed with `model`.
///
/// Every directory under the colgrep data dir is inspected. Entries are
/// skipped when their `project.json` cannot be loaded, when the recorded
/// model is not exactly `model` (this includes legacy entries with no model),
/// or when the recorded project path equals `search_path` or is not a prefix
/// of it. Among the remaining candidates the one whose project path has the
/// most components wins; the first one seen is kept on ties.
///
/// Returns `Ok(None)` when the data dir does not exist or nothing matches.
///
/// # Errors
///
/// Fails if the data dir cannot be determined or listed.
pub fn find_parent_index(search_path: &Path, model: &str) -> Result<Option<ParentIndexInfo>> {
    let data_dir = get_colgrep_data_dir()?;
    if !data_dir.exists() {
        return Ok(None);
    }

    // Deepest candidate so far, paired with its project-path component count.
    let mut deepest: Option<(usize, ParentIndexInfo)> = None;

    let candidates = fs::read_dir(&data_dir)?
        .filter_map(Result::ok)
        .map(|entry| entry.path());

    for index_dir in candidates {
        if !index_dir.is_dir() {
            continue;
        }

        let meta = match ProjectMetadata::load(&index_dir) {
            Ok(meta) => meta,
            Err(_) => continue,
        };

        // Only indexes built with exactly this model qualify; legacy indexes
        // (no model recorded) are skipped as well.
        if meta.model.as_deref() != Some(model) {
            continue;
        }

        // The project itself is not its own parent.
        if search_path == meta.project_path {
            continue;
        }

        let relative_subdir = match search_path.strip_prefix(&meta.project_path) {
            Ok(rest) => rest.to_path_buf(),
            Err(_) => continue,
        };

        let depth = meta.project_path.components().count();
        let current_best = deepest.as_ref().map_or(0, |(best, _)| *best);
        if depth > current_best {
            deepest = Some((
                depth,
                ParentIndexInfo {
                    index_dir,
                    project_path: meta.project_path,
                    relative_subdir,
                },
            ));
        }
    }

    Ok(deepest.map(|(_, info)| info))
}

/// Location of `state.json` inside the given index directory.
pub fn get_state_path(index_dir: &Path) -> PathBuf {
    let mut state_path = index_dir.to_path_buf();
    state_path.push(STATE_FILE);
    state_path
}

/// Location of the vector index subdirectory inside the given index directory.
pub fn get_vector_index_path(index_dir: &Path) -> PathBuf {
    let mut vector_path = index_dir.to_path_buf();
    vector_path.push(INDEX_SUBDIR);
    vector_path
}

/// Location of the lock file inside the given index directory.
pub fn get_lock_path(index_dir: &Path) -> PathBuf {
    let mut lock_path = index_dir.to_path_buf();
    lock_path.push(LOCK_FILE);
    lock_path
}

/// Try to acquire the index lock without waiting.
/// Returns `Ok(Some(file))` if acquired, `Ok(None)` if another process holds it.
pub fn try_acquire_index_lock(index_dir: &Path) -> Result<Option<File>> {
    fs::create_dir_all(index_dir)?;
    let lock_path = get_lock_path(index_dir);
    let lock_file = File::create(&lock_path)
        .with_context(|| format!("Failed to create lock file at {}", lock_path.display()))?;

    match lock_file.try_lock_exclusive() {
        Ok(()) => Ok(Some(lock_file)),
        Err(_) => Ok(None),
    }
}

/// Acquires an exclusive lock on the index directory.
/// Returns a guard (File handle) that releases the lock when dropped.
///
/// If another process holds the lock, retries for up to 5 seconds before
/// returning an error.
pub fn acquire_index_lock(index_dir: &Path) -> Result<File> {
    use std::time::{Duration, Instant};

    const TIMEOUT: Duration = Duration::from_secs(5);
    const RETRY_INTERVAL: Duration = Duration::from_millis(500);

    fs::create_dir_all(index_dir)?;
    let lock_path = get_lock_path(index_dir);
    let lock_file = File::create(&lock_path)
        .with_context(|| format!("Failed to create lock file at {}", lock_path.display()))?;

    let start = Instant::now();
    loop {
        match lock_file.try_lock_exclusive() {
            Ok(()) => return Ok(lock_file),
            Err(_) if start.elapsed() < TIMEOUT => {
                std::thread::sleep(RETRY_INTERVAL);
            }
            Err(_) => {
                return Err(anyhow::anyhow!(
                    "Timed out waiting for index lock after 5 seconds. \
                     Another colgrep instance may be updating this index."
                ));
            }
        }
    }
}

/// Unit tests for index directory naming and `ProjectMetadata` persistence.
#[cfg(test)]
mod tests {
    use super::*;

    /// The directory name is the project name, a dash, and an 8-character
    /// hash prefix.
    #[test]
    fn test_compute_index_dir_name() {
        let path = PathBuf::from("/Users/foo/myproject");
        let name = compute_index_dir_name(&path, "lightonai/LateOn");
        // Should be format: myproject-{8 hex chars}
        assert!(name.starts_with("myproject-"));
        assert_eq!(name.len(), "myproject-".len() + 8);
    }

    /// Spaces and parentheses in the project name are each replaced by `_`.
    #[test]
    fn test_compute_index_dir_name_with_special_chars() {
        let path = PathBuf::from("/Users/foo/my project (1)");
        let name = compute_index_dir_name(&path, "lightonai/LateOn");
        // Special chars should be replaced with underscores
        assert!(name.starts_with("my_project__1_-"));
    }

    /// Different project paths with the same model yield different names.
    #[test]
    fn test_different_paths_different_hashes() {
        let path1 = PathBuf::from("/Users/foo/project1");
        let path2 = PathBuf::from("/Users/foo/project2");
        let name1 = compute_index_dir_name(&path1, "lightonai/LateOn");
        let name2 = compute_index_dir_name(&path2, "lightonai/LateOn");
        assert_ne!(name1, name2);
    }

    /// The model id participates in the hash, so it changes the directory name.
    #[test]
    fn test_different_models_different_hashes() {
        // Same project path, different models → different index directories.
        let path = PathBuf::from("/Users/foo/project");
        let a = compute_index_dir_name(&path, "lightonai/LateOn");
        let b = compute_index_dir_name(&path, "lightonai/LateOn-Code-edge");
        assert_ne!(a, b);
        // Both keep the readable project-name prefix.
        assert!(a.starts_with("project-"));
        assert!(b.starts_with("project-"));
    }

    /// Identical inputs always produce the identical directory name.
    #[test]
    fn test_same_path_and_model_stable_hash() {
        let path = PathBuf::from("/Users/foo/project");
        let a = compute_index_dir_name(&path, "lightonai/LateOn");
        let b = compute_index_dir_name(&path, "lightonai/LateOn");
        assert_eq!(a, b);
    }

    /// The empty model string and a non-empty one must not collide:
    /// hashing `path|model` with model="" differs from hashing just the path.
    /// Guards against a regression if someone reverts to path-only hashing.
    #[test]
    fn test_empty_model_does_not_collide_with_populated_model() {
        let path = PathBuf::from("/Users/foo/project");
        let empty = compute_index_dir_name(&path, "");
        let populated = compute_index_dir_name(&path, "lightonai/LateOn");
        assert_ne!(empty, populated);
    }

    /// Metadata saved with a model loads back with the same path, name and model.
    #[test]
    fn test_project_metadata_roundtrip_with_model() {
        let dir = tempfile::tempdir().unwrap();
        let index_dir = dir.path();
        let project_path = PathBuf::from("/some/project");
        let meta = ProjectMetadata::new(&project_path, "lightonai/LateOn-Code-edge");
        meta.save(index_dir).unwrap();

        let loaded = ProjectMetadata::load(index_dir).unwrap();
        assert_eq!(loaded.project_path, project_path);
        assert_eq!(loaded.project_name, "project");
        assert_eq!(loaded.model.as_deref(), Some("lightonai/LateOn-Code-edge"));
    }

    /// Pre-1.3 project.json files have no "model" field. They must still
    /// deserialize so legacy indexes don't break the parser.
    #[test]
    fn test_project_metadata_legacy_json_without_model_field() {
        let dir = tempfile::tempdir().unwrap();
        let index_dir = dir.path();
        let legacy = r#"{
            "project_path": "/some/project",
            "project_name": "project"
        }"#;
        std::fs::write(index_dir.join("project.json"), legacy).unwrap();

        let loaded = ProjectMetadata::load(index_dir).unwrap();
        assert_eq!(loaded.project_path, PathBuf::from("/some/project"));
        assert_eq!(loaded.project_name, "project");
        assert!(
            loaded.model.is_none(),
            "legacy project.json must deserialize with model=None"
        );
    }

    /// Two indexes for the same project but different models live in different
    /// directories, so saving metadata to each never clobbers the other.
    #[test]
    fn test_two_models_same_project_do_not_overwrite_metadata() {
        let root = tempfile::tempdir().unwrap();
        let project_path = PathBuf::from("/some/project");

        let dir_a = root
            .path()
            .join(compute_index_dir_name(&project_path, "model-a"));
        let dir_b = root
            .path()
            .join(compute_index_dir_name(&project_path, "model-b"));
        std::fs::create_dir_all(&dir_a).unwrap();
        std::fs::create_dir_all(&dir_b).unwrap();

        ProjectMetadata::new(&project_path, "model-a")
            .save(&dir_a)
            .unwrap();
        ProjectMetadata::new(&project_path, "model-b")
            .save(&dir_b)
            .unwrap();

        let a = ProjectMetadata::load(&dir_a).unwrap();
        let b = ProjectMetadata::load(&dir_b).unwrap();
        assert_eq!(a.model.as_deref(), Some("model-a"));
        assert_eq!(b.model.as_deref(), Some("model-b"));
        assert_ne!(dir_a, dir_b);
    }

    /// Dir name is deterministic per (path, model) so stats/clear/find can
    /// round-trip the same input to the same on-disk location across processes.
    #[test]
    fn test_get_index_dir_for_project_is_deterministic() {
        let path = PathBuf::from("/Users/foo/project");
        let a = get_index_dir_for_project(&path, "lightonai/LateOn").unwrap();
        let b = get_index_dir_for_project(&path, "lightonai/LateOn").unwrap();
        assert_eq!(a, b);
    }
}