sqry_core/git/recency.rs
1//! Recency scoring for hybrid search based on git commit timestamps
2//!
3//! This module provides repository-relative recency scoring for Stage 3 hybrid search.
4//! Scores are normalized to [0.0, 1.0] where 1.0 = newest file, 0.0 = oldest file.
5//!
6//! # Design Principles
7//!
8//! - **Deterministic**: Same repo state → same scores (no wall-clock dependency)
9//! - **Relative scoring**: Normalized against repo's own history
10//! - **Local-only**: Uses local git history (no network operations, always safe in offline mode)
11//! - **Graceful fallback**: Returns neutral 0.5 when git unavailable
12//!
13//! # Example
14//!
15//! ```no_run
16//! use sqry_core::git::recency::RecencyIndex;
17//! use std::path::Path;
18//!
19//! let repo = Path::new("/path/to/repo");
20//! let index = RecencyIndex::from_repo(repo)?;
21//!
22//! let score = index.score_for_file(Path::new("src/main.rs"));
23//! println!("Recency score: {score}"); // 0.0 (oldest) to 1.0 (newest)
24//! # Ok::<(), Box<dyn std::error::Error>>(())
25//! ```
26
27use super::{GitBackend, GitError, Result, SubprocessGit};
28use std::collections::HashMap;
29use std::path::{Path, PathBuf};
30
/// Repository-relative recency index built from last-commit timestamps.
///
/// The index remembers, for every known file, the Unix timestamp of its most
/// recent commit, and caches the smallest and largest of those timestamps so a
/// file can be placed on a [0.0, 1.0] scale relative to the rest of the repository.
///
/// # Scoring Formula
///
/// ```text
/// score = (timestamp - min_ts) / (max_ts - min_ts)
/// ```
///
/// - **1.0**: the most recently committed file
/// - **0.5**: halfway between oldest and newest (also the neutral fallback)
/// - **0.0**: the least recently committed file
///
/// # Thread Safety
///
/// All fields are owned standard-library values (`HashMap`, `PathBuf`, `i64`),
/// so the type is `Send + Sync` and may be shared across threads.
#[derive(Clone, Debug)]
pub struct RecencyIndex {
    /// Last-commit time per file, in seconds since the Unix epoch
    by_file: HashMap<PathBuf, i64>,

    /// Smallest timestamp stored in `by_file`
    min_ts: i64,

    /// Largest timestamp stored in `by_file`
    max_ts: i64,

    /// Root directory of the repository (canonicalized)
    repo_root: PathBuf,
}
63
64impl RecencyIndex {
65 #[inline]
66 #[allow(clippy::cast_precision_loss)] // Timestamp ranges are bounded; lossy f32 cast is acceptable for scoring ratios
67 fn to_f32_lossy(value: i64) -> f32 {
68 value as f32
69 }
70
71 /// Build a recency index from a git repository
72 ///
73 /// Walks all tracked files in the repository and records their last commit timestamps.
74 ///
75 /// # Arguments
76 ///
77 /// * `root` - Path to repository root (or any directory within the repo)
78 ///
79 /// # Returns
80 ///
81 /// * `Ok(RecencyIndex)` - Successfully built index
82 /// * `Err(GitError::NotARepo)` - Path is not a git repository
83 /// * `Err(GitError::NotFound)` - Git binary not in PATH
84 /// * `Err(GitError)` - Other git command failures
85 ///
86 /// # Performance
87 ///
88 /// This operation is relatively expensive (O(n) where n = tracked files).
89 /// Consider caching the index and rebuilding only when the repository changes.
90 ///
91 /// # Examples
92 ///
93 /// ```no_run
94 /// # use sqry_core::git::recency::RecencyIndex;
95 /// # use std::path::Path;
96 /// let index = RecencyIndex::from_repo(Path::new("/path/to/repo"))?;
97 /// println!("Indexed {} files", index.file_count());
98 /// # Ok::<(), Box<dyn std::error::Error>>(())
99 /// ```
100 ///
101 /// # Errors
102 ///
103 /// Returns `GitError` when repository discovery or git commands fail,
104 /// or when the underlying git output is malformed.
105 pub fn from_repo(root: &Path) -> Result<Self> {
106 let backend = SubprocessGit::new();
107
108 // Get canonicalized repo root
109 let repo_root = backend.repo_root(root)?;
110
111 // Get list of all tracked files
112 let tracked_files = Self::get_tracked_files(&repo_root)?;
113
114 if tracked_files.is_empty() {
115 // Empty repository (no commits or no tracked files)
116 return Ok(Self {
117 by_file: HashMap::new(),
118 min_ts: 0,
119 max_ts: 0,
120 repo_root,
121 });
122 }
123
124 // Build timestamp map
125 let mut by_file = HashMap::new();
126 let mut min_ts = i64::MAX;
127 let mut max_ts = i64::MIN;
128
129 for file_path in tracked_files {
130 if let Some(timestamp) = Self::get_file_timestamp(&repo_root, &file_path)? {
131 min_ts = min_ts.min(timestamp);
132 max_ts = max_ts.max(timestamp);
133 by_file.insert(file_path, timestamp);
134 }
135 }
136
137 // Handle edge case: all files have same timestamp
138 if min_ts == max_ts {
139 log::debug!(
140 "RecencyIndex: All files have identical timestamps ({min_ts}), scores will be neutral 0.5"
141 );
142 }
143
144 Ok(Self {
145 by_file,
146 min_ts,
147 max_ts,
148 repo_root,
149 })
150 }
151
152 /// Create a recency index from explicit timestamp data (for testing)
153 ///
154 /// This constructor allows creating an index without accessing git,
155 /// useful for deterministic unit tests.
156 ///
157 /// # Arguments
158 ///
159 /// * `by_file` - Map of file paths to Unix epoch timestamps
160 /// * `repo_root` - Repository root path (used for path resolution)
161 ///
162 /// # Panics
163 ///
164 /// Panics if `by_file` is empty (use an empty `HashMap` to represent
165 /// an empty repository, which will result in neutral 0.5 scores).
166 ///
167 /// # Examples
168 ///
169 /// ```
170 /// # use sqry_core::git::recency::RecencyIndex;
171 /// # use std::collections::HashMap;
172 /// # use std::path::{Path, PathBuf};
173 /// let timestamps = HashMap::from([
174 /// (PathBuf::from("old.rs"), 1000),
175 /// (PathBuf::from("mid.rs"), 2000),
176 /// (PathBuf::from("new.rs"), 3000),
177 /// ]);
178 /// let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));
179 /// assert!(index.score_for_file(Path::new("new.rs")) > index.score_for_file(Path::new("old.rs")));
180 /// ```
181 #[must_use]
182 pub fn from_timestamps(by_file: HashMap<PathBuf, i64>, repo_root: &Path) -> Self {
183 if by_file.is_empty() {
184 return Self {
185 by_file,
186 min_ts: 0,
187 max_ts: 0,
188 repo_root: repo_root.to_path_buf(),
189 };
190 }
191
192 let min_ts = *by_file.values().min().expect("by_file is not empty");
193 let max_ts = *by_file.values().max().expect("by_file is not empty");
194
195 Self {
196 by_file,
197 min_ts,
198 max_ts,
199 repo_root: repo_root.to_path_buf(),
200 }
201 }
202
203 /// Compute recency score for a file
204 ///
205 /// Returns a normalized score in [0.0, 1.0] where:
206 /// - **1.0**: Newest file in repository
207 /// - **0.5**: Neutral (file not in index, or all files have same timestamp)
208 /// - **0.0**: Oldest file in repository
209 ///
210 /// # Arguments
211 ///
212 /// * `path` - File path (absolute or relative to repo root)
213 ///
214 /// # Returns
215 ///
216 /// Normalized recency score (0.0-1.0)
217 ///
218 /// # Fallback Behavior
219 ///
220 /// Returns 0.5 (neutral) when:
221 /// - File not found in index
222 /// - All files have identical timestamps (`min_ts` == `max_ts`)
223 /// - Empty repository (no tracked files)
224 ///
225 /// # Examples
226 ///
227 /// ```no_run
228 /// # use sqry_core::git::recency::RecencyIndex;
229 /// # use std::path::Path;
230 /// let index = RecencyIndex::from_repo(Path::new("/repo"))?;
231 ///
232 /// // Absolute path
233 /// let score = index.score_for_file(Path::new("/repo/src/main.rs"));
234 ///
235 /// // Relative path (resolved against repo root)
236 /// let score = index.score_for_file(Path::new("src/main.rs"));
237 ///
238 /// // File not in index → neutral 0.5
239 /// let score = index.score_for_file(Path::new("not_tracked.txt"));
240 /// assert_eq!(score, 0.5);
241 /// # Ok::<(), Box<dyn std::error::Error>>(())
242 /// ```
243 #[must_use]
244 pub fn score_for_file(&self, path: &Path) -> f32 {
245 // Handle empty repository
246 if self.by_file.is_empty() {
247 return 0.5;
248 }
249
250 // Try both absolute and relative paths
251 let timestamp = self
252 .by_file
253 .get(path)
254 .or_else(|| {
255 // Try making path relative to repo root
256 if path.is_absolute() {
257 path.strip_prefix(&self.repo_root)
258 .ok()
259 .and_then(|rel| self.by_file.get(rel))
260 } else {
261 None
262 }
263 })
264 .or_else(|| {
265 // Try making path absolute
266 if path.is_relative() {
267 let abs = self.repo_root.join(path);
268 self.by_file.get(&abs)
269 } else {
270 None
271 }
272 });
273
274 let Some(&ts) = timestamp else {
275 return 0.5;
276 };
277
278 if self.max_ts == self.min_ts {
279 0.5
280 } else {
281 let score = Self::to_f32_lossy(ts - self.min_ts)
282 / Self::to_f32_lossy(self.max_ts - self.min_ts);
283 score.clamp(0.0, 1.0)
284 }
285 }
286
287 /// Get the number of files tracked in this index
288 #[must_use]
289 pub fn file_count(&self) -> usize {
290 self.by_file.len()
291 }
292
293 /// Get the repository root path
294 #[must_use]
295 pub fn repo_root(&self) -> &Path {
296 &self.repo_root
297 }
298
299 /// Get the timestamp range (min, max) in Unix epoch seconds
300 ///
301 /// Returns `None` for empty repositories.
302 #[must_use]
303 pub fn timestamp_range(&self) -> Option<(i64, i64)> {
304 if self.by_file.is_empty() {
305 None
306 } else {
307 Some((self.min_ts, self.max_ts))
308 }
309 }
310
311 /// Get list of all tracked files in repository
312 ///
313 /// Uses `git ls-files` to enumerate tracked files.
314 ///
315 /// # Security
316 ///
317 /// - Uses `SubprocessGit`'s `execute_git` (enforces output limits and timeouts)
318 /// - Uses null-terminated output (-z) to handle special characters in filenames
319 /// - Respects .gitignore and git configuration
320 /// - No shell invocation (command array arguments)
321 fn get_tracked_files(repo_root: &Path) -> Result<Vec<PathBuf>> {
322 // Use SubprocessGit's execute_git for safety (output limits, timeouts)
323 let stdout = SubprocessGit::execute_git(
324 &["-C", &repo_root.display().to_string(), "ls-files", "-z"],
325 None, // Use default timeout
326 )?;
327
328 // Parse null-terminated output
329 let files: Vec<PathBuf> = stdout
330 .split('\0')
331 .filter(|s| !s.is_empty())
332 .map(PathBuf::from)
333 .collect();
334
335 Ok(files)
336 }
337
338 /// Get last commit timestamp for a file
339 ///
340 /// Uses `git log -1 --format=%ct -- <file>` to get the committer timestamp.
341 ///
342 /// # Security
343 ///
344 /// - Uses `SubprocessGit`'s `execute_git` (enforces output limits and timeouts)
345 /// - No shell invocation (command array arguments)
346 ///
347 /// # Returns
348 ///
349 /// * `Ok(Some(timestamp))` - File has commit history
350 /// * `Ok(None)` - File is tracked but has no commits (newly added)
351 /// * `Err(GitError)` - Git command failed
352 fn get_file_timestamp(repo_root: &Path, file_path: &Path) -> Result<Option<i64>> {
353 // Convert paths to strings (must bind to variables for lifetime)
354 let repo_root_str = repo_root.display().to_string();
355 let file_path_str = file_path.display().to_string();
356
357 // Build args
358 let args = vec![
359 "-C",
360 &repo_root_str,
361 "log",
362 "-1",
363 "--format=%ct",
364 "--",
365 &file_path_str,
366 ];
367
368 // Use SubprocessGit's execute_git for safety (output limits, timeouts)
369 let stdout = SubprocessGit::execute_git(&args, None)?;
370
371 // Empty output means file has no commits yet (newly added)
372 if stdout.trim().is_empty() {
373 return Ok(None);
374 }
375
376 // Parse timestamp
377 let timestamp: i64 = stdout.trim().parse().map_err(|e| {
378 GitError::InvalidOutput(format!(
379 "Failed to parse timestamp '{}' for {}: {e}",
380 stdout.trim(),
381 file_path.display()
382 ))
383 })?;
384
385 Ok(Some(timestamp))
386 }
387}
388
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::process::Command;
    use tempfile::TempDir;

    /// Tolerance used when comparing floating-point scores.
    const SCORE_EPSILON: f32 = 1.0e-6;

    /// Assert that `actual` is within `SCORE_EPSILON` of `expected`.
    fn assert_score_close(actual: f32, expected: f32) {
        assert!(
            (actual - expected).abs() < SCORE_EPSILON,
            "expected {expected}, got {actual}"
        );
    }

    /// Run `git <args>` inside `dir` with extra environment variables,
    /// panicking (with the captured output) if the command cannot be spawned
    /// or exits unsuccessfully.
    fn run_git(dir: &Path, args: &[&str], envs: &[(&str, &str)]) {
        let output = Command::new("git")
            .args(args)
            .envs(envs.iter().copied())
            .current_dir(dir)
            .output()
            .expect("failed to spawn git");
        assert!(output.status.success(), "git {args:?} failed: {output:?}");
    }

    /// Helper to create a git repo with explicit timestamps
    ///
    /// Creates one commit per file with controlled author/committer dates so the
    /// resulting recency scores are deterministic. Returns the temp directory
    /// (which must be kept alive by the caller) and the `(filename, timestamp)` pairs.
    fn create_test_repo_with_timestamps() -> (TempDir, Vec<(&'static str, i64)>) {
        let tmpdir = tempfile::tempdir().unwrap();
        let path = tmpdir.path();

        // Initialize and configure the repository
        run_git(path, &["init"], &[]);
        run_git(path, &["config", "user.name", "Test"], &[]);
        run_git(path, &["config", "user.email", "test@example.com"], &[]);
        run_git(path, &["config", "commit.gpgSign", "false"], &[]);

        // Create files with different timestamps
        let files = vec![
            ("old.rs", 1000i64), // Oldest
            ("mid.rs", 2000i64), // Middle
            ("new.rs", 3000i64), // Newest
        ];

        for &(filename, timestamp) in &files {
            fs::write(path.join(filename), format!("// {filename}")).unwrap();
            run_git(path, &["add", filename], &[]);

            // `@<seconds> <tz>` is git's unambiguous raw-timestamp form; a bare
            // small number such as `1000` is not read as seconds since the epoch.
            let date = format!("@{timestamp} +0000");
            let message = format!("Add {filename}");
            run_git(
                path,
                &["commit", "-m", message.as_str()],
                &[
                    ("GIT_COMMITTER_DATE", date.as_str()),
                    ("GIT_AUTHOR_DATE", date.as_str()),
                ],
            );
        }

        (tmpdir, files)
    }

    #[test]
    fn test_from_timestamps_normalization() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("mid.rs"), 2000),
            (PathBuf::from("new.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Check normalization
        assert_score_close(index.score_for_file(Path::new("old.rs")), 0.0); // Oldest = 0.0
        assert_score_close(index.score_for_file(Path::new("mid.rs")), 0.5); // Middle = 0.5
        assert_score_close(index.score_for_file(Path::new("new.rs")), 1.0); // Newest = 1.0
    }

    #[test]
    fn test_from_timestamps_ordering() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("mid.rs"), 2000),
            (PathBuf::from("new.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Verify ordering
        let old_score = index.score_for_file(Path::new("old.rs"));
        let mid_score = index.score_for_file(Path::new("mid.rs"));
        let new_score = index.score_for_file(Path::new("new.rs"));

        assert!(new_score > mid_score);
        assert!(mid_score > old_score);
    }

    #[test]
    fn test_from_timestamps_missing_file() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("new.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Missing file returns neutral 0.5
        assert_score_close(index.score_for_file(Path::new("missing.rs")), 0.5);
    }

    #[test]
    fn test_from_timestamps_identical_timestamps() {
        let timestamps = HashMap::from([
            (PathBuf::from("a.rs"), 1000),
            (PathBuf::from("b.rs"), 1000),
            (PathBuf::from("c.rs"), 1000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // All files have same timestamp → neutral 0.5
        assert_score_close(index.score_for_file(Path::new("a.rs")), 0.5);
        assert_score_close(index.score_for_file(Path::new("b.rs")), 0.5);
        assert_score_close(index.score_for_file(Path::new("c.rs")), 0.5);
    }

    #[test]
    fn test_from_timestamps_empty() {
        let timestamps = HashMap::new();
        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Empty repository → neutral 0.5
        assert_score_close(index.score_for_file(Path::new("any.rs")), 0.5);
        assert_eq!(index.file_count(), 0);
    }

    #[test]
    #[ignore = "Requires git binary and filesystem access"]
    fn test_from_repo_real_git() {
        let (tmpdir, _files) = create_test_repo_with_timestamps();
        let index = RecencyIndex::from_repo(tmpdir.path()).unwrap();

        assert_eq!(index.file_count(), 3);

        // Verify score ordering (newer files score higher)
        let old_score = index.score_for_file(Path::new("old.rs"));
        let mid_score = index.score_for_file(Path::new("mid.rs"));
        let new_score = index.score_for_file(Path::new("new.rs"));

        assert!(
            new_score > mid_score,
            "new ({new_score}) should be > mid ({mid_score})"
        );
        assert!(
            mid_score > old_score,
            "mid ({mid_score}) should be > old ({old_score})"
        );

        // Newest should be close to 1.0, oldest close to 0.0
        assert!(
            new_score > 0.9,
            "newest file should score > 0.9, got {new_score}"
        );
        assert!(
            old_score < 0.1,
            "oldest file should score < 0.1, got {old_score}"
        );
    }

    #[test]
    #[ignore = "Requires git binary and filesystem access"]
    fn test_from_repo_absolute_and_relative_paths() {
        let (tmpdir, _files) = create_test_repo_with_timestamps();
        let index = RecencyIndex::from_repo(tmpdir.path()).unwrap();

        // Relative path
        let rel_score = index.score_for_file(Path::new("new.rs"));

        // Absolute path, built from the root the index actually stores: the temp
        // directory may be reached through a symlink, in which case
        // `tmpdir.path()` would not share a prefix with the canonical root.
        let abs_path = index.repo_root().join("new.rs");
        let abs_score = index.score_for_file(&abs_path);

        // Both lookups must resolve to the newest file, not merely agree on the
        // neutral fallback.
        assert_score_close(rel_score, 1.0);
        assert_score_close(rel_score, abs_score);
    }

    #[test]
    fn test_repo_root_accessor() {
        let timestamps = HashMap::from([(PathBuf::from("test.rs"), 1000)]);
        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/test/repo"));

        assert_eq!(index.repo_root(), Path::new("/test/repo"));
    }

    #[test]
    fn test_timestamp_range() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("new.rs"), 5000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));
        assert_eq!(index.timestamp_range(), Some((1000, 5000)));

        // Empty index
        let empty = RecencyIndex::from_timestamps(HashMap::new(), Path::new("/repo"));
        assert_eq!(empty.timestamp_range(), None);
    }

    #[test]
    fn test_file_count() {
        let timestamps = HashMap::from([
            (PathBuf::from("a.rs"), 1000),
            (PathBuf::from("b.rs"), 2000),
            (PathBuf::from("c.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));
        assert_eq!(index.file_count(), 3);
    }
}