Skip to main content

sqry_core/git/
recency.rs

1//! Recency scoring for hybrid search based on git commit timestamps
2//!
3//! This module provides repository-relative recency scoring for Stage 3 hybrid search.
4//! Scores are normalized to [0.0, 1.0] where 1.0 = newest file, 0.0 = oldest file.
5//!
6//! # Design Principles
7//!
8//! - **Deterministic**: Same repo state → same scores (no wall-clock dependency)
9//! - **Relative scoring**: Normalized against repo's own history
10//! - **Local-only**: Uses local git history (no network operations, always safe in offline mode)
11//! - **Graceful fallback**: Returns neutral 0.5 when git unavailable
12//!
13//! # Example
14//!
15//! ```no_run
16//! use sqry_core::git::recency::RecencyIndex;
17//! use std::path::Path;
18//!
19//! let repo = Path::new("/path/to/repo");
20//! let index = RecencyIndex::from_repo(repo)?;
21//!
22//! let score = index.score_for_file(Path::new("src/main.rs"));
23//! println!("Recency score: {score}"); // 0.0 (oldest) to 1.0 (newest)
24//! # Ok::<(), Box<dyn std::error::Error>>(())
25//! ```
26
27use super::{GitBackend, GitError, Result, SubprocessGit};
28use std::collections::HashMap;
29use std::path::{Path, PathBuf};
30
/// Repository-relative recency index over last-commit timestamps.
///
/// The index stores, for every known file, the Unix timestamp of its most
/// recent commit, together with the smallest and largest timestamp seen.
/// Scores are obtained by placing a file's timestamp on the `[min, max]`
/// interval of the repository and rescaling that interval to `[0.0, 1.0]`.
///
/// # Scoring Formula
///
/// ```text
/// score = (timestamp - min_ts) / (max_ts - min_ts)
/// ```
///
/// | Score | Meaning                                                        |
/// |-------|----------------------------------------------------------------|
/// | `1.0` | File carries the newest timestamp in the index                 |
/// | `0.5` | Half-way between oldest and newest, or the neutral fallback    |
/// | `0.0` | File carries the oldest timestamp in the index                 |
///
/// # Thread Safety
///
/// All fields are plain owned data (`HashMap`, `i64`, `PathBuf`), so the
/// struct is `Send + Sync` and may be shared freely between threads.
#[derive(Debug, Clone)]
pub struct RecencyIndex {
    /// Last-commit timestamp per file, in seconds since the Unix epoch
    by_file: HashMap<PathBuf, i64>,

    /// Smallest timestamp stored in `by_file`
    min_ts: i64,

    /// Largest timestamp stored in `by_file`
    max_ts: i64,

    /// Root directory of the repository the index was built for
    repo_root: PathBuf,
}
63
64impl RecencyIndex {
65    #[inline]
66    #[allow(clippy::cast_precision_loss)] // Timestamp ranges are bounded; lossy f32 cast is acceptable for scoring ratios
67    fn to_f32_lossy(value: i64) -> f32 {
68        value as f32
69    }
70
71    /// Build a recency index from a git repository
72    ///
73    /// Walks all tracked files in the repository and records their last commit timestamps.
74    ///
75    /// # Arguments
76    ///
77    /// * `root` - Path to repository root (or any directory within the repo)
78    ///
79    /// # Returns
80    ///
81    /// * `Ok(RecencyIndex)` - Successfully built index
82    /// * `Err(GitError::NotARepo)` - Path is not a git repository
83    /// * `Err(GitError::NotFound)` - Git binary not in PATH
84    /// * `Err(GitError)` - Other git command failures
85    ///
86    /// # Performance
87    ///
88    /// This operation is relatively expensive (O(n) where n = tracked files).
89    /// Consider caching the index and rebuilding only when the repository changes.
90    ///
91    /// # Examples
92    ///
93    /// ```no_run
94    /// # use sqry_core::git::recency::RecencyIndex;
95    /// # use std::path::Path;
96    /// let index = RecencyIndex::from_repo(Path::new("/path/to/repo"))?;
97    /// println!("Indexed {} files", index.file_count());
98    /// # Ok::<(), Box<dyn std::error::Error>>(())
99    /// ```
100    ///
101    /// # Errors
102    ///
103    /// Returns `GitError` when repository discovery or git commands fail,
104    /// or when the underlying git output is malformed.
105    pub fn from_repo(root: &Path) -> Result<Self> {
106        let backend = SubprocessGit::new();
107
108        // Get canonicalized repo root
109        let repo_root = backend.repo_root(root)?;
110
111        // Get list of all tracked files
112        let tracked_files = Self::get_tracked_files(&repo_root)?;
113
114        if tracked_files.is_empty() {
115            // Empty repository (no commits or no tracked files)
116            return Ok(Self {
117                by_file: HashMap::new(),
118                min_ts: 0,
119                max_ts: 0,
120                repo_root,
121            });
122        }
123
124        // Build timestamp map
125        let mut by_file = HashMap::new();
126        let mut min_ts = i64::MAX;
127        let mut max_ts = i64::MIN;
128
129        for file_path in tracked_files {
130            if let Some(timestamp) = Self::get_file_timestamp(&repo_root, &file_path)? {
131                min_ts = min_ts.min(timestamp);
132                max_ts = max_ts.max(timestamp);
133                by_file.insert(file_path, timestamp);
134            }
135        }
136
137        // Handle edge case: all files have same timestamp
138        if min_ts == max_ts {
139            log::debug!(
140                "RecencyIndex: All files have identical timestamps ({min_ts}), scores will be neutral 0.5"
141            );
142        }
143
144        Ok(Self {
145            by_file,
146            min_ts,
147            max_ts,
148            repo_root,
149        })
150    }
151
152    /// Create a recency index from explicit timestamp data (for testing)
153    ///
154    /// This constructor allows creating an index without accessing git,
155    /// useful for deterministic unit tests.
156    ///
157    /// # Arguments
158    ///
159    /// * `by_file` - Map of file paths to Unix epoch timestamps
160    /// * `repo_root` - Repository root path (used for path resolution)
161    ///
162    /// # Panics
163    ///
164    /// Panics if `by_file` is empty (use an empty `HashMap` to represent
165    /// an empty repository, which will result in neutral 0.5 scores).
166    ///
167    /// # Examples
168    ///
169    /// ```
170    /// # use sqry_core::git::recency::RecencyIndex;
171    /// # use std::collections::HashMap;
172    /// # use std::path::{Path, PathBuf};
173    /// let timestamps = HashMap::from([
174    ///     (PathBuf::from("old.rs"), 1000),
175    ///     (PathBuf::from("mid.rs"), 2000),
176    ///     (PathBuf::from("new.rs"), 3000),
177    /// ]);
178    /// let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));
179    /// assert!(index.score_for_file(Path::new("new.rs")) > index.score_for_file(Path::new("old.rs")));
180    /// ```
181    #[must_use]
182    pub fn from_timestamps(by_file: HashMap<PathBuf, i64>, repo_root: &Path) -> Self {
183        if by_file.is_empty() {
184            return Self {
185                by_file,
186                min_ts: 0,
187                max_ts: 0,
188                repo_root: repo_root.to_path_buf(),
189            };
190        }
191
192        let min_ts = *by_file.values().min().expect("by_file is not empty");
193        let max_ts = *by_file.values().max().expect("by_file is not empty");
194
195        Self {
196            by_file,
197            min_ts,
198            max_ts,
199            repo_root: repo_root.to_path_buf(),
200        }
201    }
202
203    /// Compute recency score for a file
204    ///
205    /// Returns a normalized score in [0.0, 1.0] where:
206    /// - **1.0**: Newest file in repository
207    /// - **0.5**: Neutral (file not in index, or all files have same timestamp)
208    /// - **0.0**: Oldest file in repository
209    ///
210    /// # Arguments
211    ///
212    /// * `path` - File path (absolute or relative to repo root)
213    ///
214    /// # Returns
215    ///
216    /// Normalized recency score (0.0-1.0)
217    ///
218    /// # Fallback Behavior
219    ///
220    /// Returns 0.5 (neutral) when:
221    /// - File not found in index
222    /// - All files have identical timestamps (`min_ts` == `max_ts`)
223    /// - Empty repository (no tracked files)
224    ///
225    /// # Examples
226    ///
227    /// ```no_run
228    /// # use sqry_core::git::recency::RecencyIndex;
229    /// # use std::path::Path;
230    /// let index = RecencyIndex::from_repo(Path::new("/repo"))?;
231    ///
232    /// // Absolute path
233    /// let score = index.score_for_file(Path::new("/repo/src/main.rs"));
234    ///
235    /// // Relative path (resolved against repo root)
236    /// let score = index.score_for_file(Path::new("src/main.rs"));
237    ///
238    /// // File not in index → neutral 0.5
239    /// let score = index.score_for_file(Path::new("not_tracked.txt"));
240    /// assert_eq!(score, 0.5);
241    /// # Ok::<(), Box<dyn std::error::Error>>(())
242    /// ```
243    #[must_use]
244    pub fn score_for_file(&self, path: &Path) -> f32 {
245        // Handle empty repository
246        if self.by_file.is_empty() {
247            return 0.5;
248        }
249
250        // Try both absolute and relative paths
251        let timestamp = self
252            .by_file
253            .get(path)
254            .or_else(|| {
255                // Try making path relative to repo root
256                if path.is_absolute() {
257                    path.strip_prefix(&self.repo_root)
258                        .ok()
259                        .and_then(|rel| self.by_file.get(rel))
260                } else {
261                    None
262                }
263            })
264            .or_else(|| {
265                // Try making path absolute
266                if path.is_relative() {
267                    let abs = self.repo_root.join(path);
268                    self.by_file.get(&abs)
269                } else {
270                    None
271                }
272            });
273
274        let Some(&ts) = timestamp else {
275            return 0.5;
276        };
277
278        if self.max_ts == self.min_ts {
279            0.5
280        } else {
281            let score = Self::to_f32_lossy(ts - self.min_ts)
282                / Self::to_f32_lossy(self.max_ts - self.min_ts);
283            score.clamp(0.0, 1.0)
284        }
285    }
286
287    /// Get the number of files tracked in this index
288    #[must_use]
289    pub fn file_count(&self) -> usize {
290        self.by_file.len()
291    }
292
293    /// Get the repository root path
294    #[must_use]
295    pub fn repo_root(&self) -> &Path {
296        &self.repo_root
297    }
298
299    /// Get the timestamp range (min, max) in Unix epoch seconds
300    ///
301    /// Returns `None` for empty repositories.
302    #[must_use]
303    pub fn timestamp_range(&self) -> Option<(i64, i64)> {
304        if self.by_file.is_empty() {
305            None
306        } else {
307            Some((self.min_ts, self.max_ts))
308        }
309    }
310
311    /// Get list of all tracked files in repository
312    ///
313    /// Uses `git ls-files` to enumerate tracked files.
314    ///
315    /// # Security
316    ///
317    /// - Uses `SubprocessGit`'s `execute_git` (enforces output limits and timeouts)
318    /// - Uses null-terminated output (-z) to handle special characters in filenames
319    /// - Respects .gitignore and git configuration
320    /// - No shell invocation (command array arguments)
321    fn get_tracked_files(repo_root: &Path) -> Result<Vec<PathBuf>> {
322        // Use SubprocessGit's execute_git for safety (output limits, timeouts)
323        let stdout = SubprocessGit::execute_git(
324            &["-C", &repo_root.display().to_string(), "ls-files", "-z"],
325            None, // Use default timeout
326        )?;
327
328        // Parse null-terminated output
329        let files: Vec<PathBuf> = stdout
330            .split('\0')
331            .filter(|s| !s.is_empty())
332            .map(PathBuf::from)
333            .collect();
334
335        Ok(files)
336    }
337
338    /// Get last commit timestamp for a file
339    ///
340    /// Uses `git log -1 --format=%ct -- <file>` to get the committer timestamp.
341    ///
342    /// # Security
343    ///
344    /// - Uses `SubprocessGit`'s `execute_git` (enforces output limits and timeouts)
345    /// - No shell invocation (command array arguments)
346    ///
347    /// # Returns
348    ///
349    /// * `Ok(Some(timestamp))` - File has commit history
350    /// * `Ok(None)` - File is tracked but has no commits (newly added)
351    /// * `Err(GitError)` - Git command failed
352    fn get_file_timestamp(repo_root: &Path, file_path: &Path) -> Result<Option<i64>> {
353        // Convert paths to strings (must bind to variables for lifetime)
354        let repo_root_str = repo_root.display().to_string();
355        let file_path_str = file_path.display().to_string();
356
357        // Build args
358        let args = vec![
359            "-C",
360            &repo_root_str,
361            "log",
362            "-1",
363            "--format=%ct",
364            "--",
365            &file_path_str,
366        ];
367
368        // Use SubprocessGit's execute_git for safety (output limits, timeouts)
369        let stdout = SubprocessGit::execute_git(&args, None)?;
370
371        // Empty output means file has no commits yet (newly added)
372        if stdout.trim().is_empty() {
373            return Ok(None);
374        }
375
376        // Parse timestamp
377        let timestamp: i64 = stdout.trim().parse().map_err(|e| {
378            GitError::InvalidOutput(format!(
379                "Failed to parse timestamp '{}' for {}: {e}",
380                stdout.trim(),
381                file_path.display()
382            ))
383        })?;
384
385        Ok(Some(timestamp))
386    }
387}
388
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::process::Command;
    use tempfile::TempDir;

    /// Tolerance used when comparing floating-point scores.
    const SCORE_EPSILON: f32 = 1.0e-6;

    /// Assert that two scores are equal within `SCORE_EPSILON`.
    fn assert_score_close(actual: f32, expected: f32) {
        assert!(
            (actual - expected).abs() < SCORE_EPSILON,
            "expected {expected}, got {actual}"
        );
    }

    /// Run `git <args>` inside `dir` with extra environment variables and
    /// fail the test if the command cannot be spawned or exits unsuccessfully.
    fn run_git(dir: &Path, args: &[&str], envs: &[(&str, &str)]) {
        let output = Command::new("git")
            .args(args)
            .envs(envs.iter().copied())
            .current_dir(dir)
            .output()
            .expect("failed to spawn git");
        assert!(
            output.status.success(),
            "git {args:?} failed: {output:?}"
        );
    }

    /// Helper to create a git repo with explicit timestamps
    ///
    /// Creates files and commits them with controlled timestamps for deterministic testing.
    fn create_test_repo_with_timestamps() -> (TempDir, Vec<(&'static str, i64)>) {
        let tmpdir = tempfile::tempdir().unwrap();
        let path = tmpdir.path();

        // Initialize and configure the repository; every step must succeed,
        // otherwise later failures would be hard to diagnose.
        run_git(path, &["init"], &[]);
        run_git(path, &["config", "user.name", "Test"], &[]);
        run_git(path, &["config", "user.email", "test@example.com"], &[]);
        run_git(path, &["config", "commit.gpgSign", "false"], &[]);

        // Create files with different timestamps
        let files = vec![
            ("old.rs", 1000i64), // Oldest
            ("mid.rs", 2000i64), // Middle
            ("new.rs", 3000i64), // Newest
        ];

        for &(filename, timestamp) in &files {
            fs::write(path.join(filename), format!("// {filename}")).unwrap();
            run_git(path, &["add", filename], &[]);

            // Git only treats a bare number as epoch seconds when it has at
            // least nine digits; short values such as "1000" are read as a
            // timezone or year and rejected. The "@<seconds> <offset>" form
            // is always parsed as a raw Unix timestamp.
            let date = format!("@{timestamp} +0000");
            let message = format!("Add {filename}");
            run_git(
                path,
                &["commit", "-m", message.as_str()],
                &[
                    ("GIT_COMMITTER_DATE", date.as_str()),
                    ("GIT_AUTHOR_DATE", date.as_str()),
                ],
            );
        }

        (tmpdir, files)
    }

    #[test]
    fn test_from_timestamps_normalization() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("mid.rs"), 2000),
            (PathBuf::from("new.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Check normalization
        assert_score_close(index.score_for_file(Path::new("old.rs")), 0.0); // Oldest = 0.0
        assert_score_close(index.score_for_file(Path::new("mid.rs")), 0.5); // Middle = 0.5
        assert_score_close(index.score_for_file(Path::new("new.rs")), 1.0); // Newest = 1.0
    }

    #[test]
    fn test_from_timestamps_ordering() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("mid.rs"), 2000),
            (PathBuf::from("new.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Verify ordering
        let old_score = index.score_for_file(Path::new("old.rs"));
        let mid_score = index.score_for_file(Path::new("mid.rs"));
        let new_score = index.score_for_file(Path::new("new.rs"));

        assert!(new_score > mid_score);
        assert!(mid_score > old_score);
    }

    #[test]
    fn test_from_timestamps_missing_file() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("new.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Missing file returns neutral 0.5
        assert_score_close(index.score_for_file(Path::new("missing.rs")), 0.5);
    }

    #[test]
    fn test_from_timestamps_identical_timestamps() {
        let timestamps = HashMap::from([
            (PathBuf::from("a.rs"), 1000),
            (PathBuf::from("b.rs"), 1000),
            (PathBuf::from("c.rs"), 1000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // All files have same timestamp → neutral 0.5
        assert_score_close(index.score_for_file(Path::new("a.rs")), 0.5);
        assert_score_close(index.score_for_file(Path::new("b.rs")), 0.5);
        assert_score_close(index.score_for_file(Path::new("c.rs")), 0.5);
    }

    #[test]
    fn test_from_timestamps_empty() {
        let timestamps = HashMap::new();
        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));

        // Empty repository → neutral 0.5
        assert_score_close(index.score_for_file(Path::new("any.rs")), 0.5);
        assert_eq!(index.file_count(), 0);
    }

    #[test]
    #[ignore = "Requires git binary and filesystem access"]
    fn test_from_repo_real_git() {
        let (tmpdir, _files) = create_test_repo_with_timestamps();
        let index = RecencyIndex::from_repo(tmpdir.path()).unwrap();

        assert_eq!(index.file_count(), 3);

        // Verify score ordering (newer files score higher)
        let old_score = index.score_for_file(Path::new("old.rs"));
        let mid_score = index.score_for_file(Path::new("mid.rs"));
        let new_score = index.score_for_file(Path::new("new.rs"));

        assert!(
            new_score > mid_score,
            "new ({new_score}) should be > mid ({mid_score})"
        );
        assert!(
            mid_score > old_score,
            "mid ({mid_score}) should be > old ({old_score})"
        );

        // Newest should be close to 1.0, oldest close to 0.0
        assert!(
            new_score > 0.9,
            "newest file should score > 0.9, got {new_score}"
        );
        assert!(
            old_score < 0.1,
            "oldest file should score < 0.1, got {old_score}"
        );
    }

    #[test]
    #[ignore = "Requires git binary and filesystem access"]
    fn test_from_repo_absolute_and_relative_paths() {
        let (tmpdir, _files) = create_test_repo_with_timestamps();
        let index = RecencyIndex::from_repo(tmpdir.path()).unwrap();

        // Relative path
        let rel_score = index.score_for_file(Path::new("new.rs"));

        // Absolute path
        let abs_path = tmpdir.path().join("new.rs");
        let abs_score = index.score_for_file(&abs_path);

        // Should be identical
        assert_score_close(rel_score, abs_score);
    }

    #[test]
    fn test_repo_root_accessor() {
        let timestamps = HashMap::from([(PathBuf::from("test.rs"), 1000)]);
        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/test/repo"));

        assert_eq!(index.repo_root(), Path::new("/test/repo"));
    }

    #[test]
    fn test_timestamp_range() {
        let timestamps = HashMap::from([
            (PathBuf::from("old.rs"), 1000),
            (PathBuf::from("new.rs"), 5000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));
        assert_eq!(index.timestamp_range(), Some((1000, 5000)));

        // Empty index
        let empty = RecencyIndex::from_timestamps(HashMap::new(), Path::new("/repo"));
        assert_eq!(empty.timestamp_range(), None);
    }

    #[test]
    fn test_file_count() {
        let timestamps = HashMap::from([
            (PathBuf::from("a.rs"), 1000),
            (PathBuf::from("b.rs"), 2000),
            (PathBuf::from("c.rs"), 3000),
        ]);

        let index = RecencyIndex::from_timestamps(timestamps, Path::new("/repo"));
        assert_eq!(index.file_count(), 3);
    }
}