Skip to main content

rag_rat_core/index/
git_history.rs

1use std::{
2    collections::{BTreeMap, BTreeSet},
3    path::{Path, PathBuf},
4    process::Command,
5};
6
7use rayon::prelude::*;
8use rusqlite::{Connection, OptionalExtension, params};
9use serde::Serialize;
10use sha2::{Digest, Sha256};
11
12use crate::search::lexical::SearchHit;
13
14#[derive(Debug, Clone, Serialize)]
15pub struct GitHistoryIndexStatus {
16    pub available: bool,
17    pub head: Option<String>,
18    pub indexed_head: Option<String>,
19    pub commit_count: u64,
20    pub file_change_count: u64,
21}
22
23#[derive(Debug, Clone, Serialize)]
24pub struct CommitSearchHit {
25    pub hash: String,
26    pub author_name: String,
27    pub author_email: String,
28    pub authored_at_s: i64,
29    pub committed_at_s: i64,
30    pub subject: String,
31    pub body: String,
32    pub changed_file_count: i64,
33    pub score: f64,
34    pub evidence_kind: &'static str,
35}
36
37#[derive(Debug, Clone, Serialize)]
38pub struct PathHistoryItem {
39    pub hash: String,
40    pub path: String,
41    pub additions: Option<i64>,
42    pub deletions: Option<i64>,
43    pub change_kind: String,
44    pub author_name: String,
45    pub authored_at_s: i64,
46    pub subject: String,
47    pub evidence_kind: &'static str,
48}
49
50#[derive(Debug, Clone, Serialize)]
51pub struct SymbolHistoryItem {
52    pub symbol: String,
53    pub qualified_name: String,
54    pub path: String,
55    pub start_byte: i64,
56    pub end_byte: i64,
57    pub commit: PathHistoryItem,
58    pub evidence_kind: &'static str,
59}
60
61#[derive(Debug, Clone, Serialize)]
62pub struct QueryCommitHit {
63    pub hash: String,
64    pub author_name: String,
65    pub authored_at_s: i64,
66    pub subject: String,
67    pub changed_file_count: i64,
68    pub evidence: Vec<String>,
69    pub score: f64,
70    pub evidence_kind: &'static str,
71}
72
73#[derive(Debug, Clone, Serialize)]
74pub struct ChunkBlameSummary {
75    pub chunk_id: i64,
76    pub path: String,
77    pub start_line: i64,
78    pub end_line: i64,
79    pub source_text_hash: String,
80    pub line_count: i64,
81    pub dominant_commit: Option<String>,
82    pub dominant_commit_lines: i64,
83    pub newest_commit: Option<String>,
84    pub newest_commit_time_s: Option<i64>,
85    pub oldest_commit: Option<String>,
86    pub oldest_commit_time_s: Option<i64>,
87    pub commit_counts: BTreeMap<String, i64>,
88    pub evidence_kind: &'static str,
89}
90
/// Identity of the repository that contains the indexed root.
#[derive(Debug)]
struct GitRepo {
    /// Output of `git rev-parse --show-toplevel`.
    worktree_root: PathBuf,
    /// Output of `git rev-parse HEAD`.
    head: String,
}
96
/// One commit parsed from the `git log` output, before it is written to the database.
#[derive(Debug)]
struct CommitRecord {
    /// Full commit hash (`%H`).
    hash: String,
    /// Author name (`%an`).
    author_name: String,
    /// Author e-mail (`%ae`).
    author_email: String,
    /// Author timestamp in seconds (`%at`); `0` when it could not be parsed.
    authored_at_s: i64,
    /// Committer timestamp in seconds (`%ct`); `0` when it could not be parsed.
    committed_at_s: i64,
    /// Subject line (`%s`).
    subject: String,
    /// Trimmed message text from the last `git log` field (`%B`).
    body: String,
}
107
/// One per-file entry parsed from `git log --numstat`.
#[derive(Debug)]
struct FileChange {
    /// Hash of the commit the change belongs to.
    commit_hash: String,
    /// Normalised path of the changed file.
    path: String,
    /// Added line count; `None` when numstat printed `-` or an unparsable value.
    additions: Option<i64>,
    /// Deleted line count; `None` when numstat printed `-` or an unparsable value.
    deletions: Option<i64>,
    /// Kind of change; the parser in this file always writes `"modified"`.
    change_kind: String,
}
116
117#[derive(Debug)]
118pub(crate) struct PreparedGitHistory {
119    repo: Option<GitRepo>,
120    commits: Vec<CommitRecord>,
121    changes: Vec<FileChange>,
122}
123
124pub(crate) fn prepare(root: &Path) -> anyhow::Result<PreparedGitHistory> {
125    let Some(repo) = git_repo(root) else {
126        return Ok(PreparedGitHistory { repo: None, commits: Vec::new(), changes: Vec::new() });
127    };
128    let commits = read_commits(root)?;
129    let changes = read_file_changes(root, &repo.worktree_root)?;
130    Ok(PreparedGitHistory { repo: Some(repo), commits, changes })
131}
132
133pub(crate) fn apply_prepared(
134    conn: &Connection,
135    root: &Path,
136    prepared: PreparedGitHistory,
137) -> anyhow::Result<GitHistoryIndexStatus> {
138    let Some(repo) = prepared.repo else {
139        clear(conn)?;
140        return status(conn, root);
141    };
142
143    conn.execute_batch(
144        "
145        DELETE FROM commit_fts;
146        DELETE FROM git_chunk_blame;
147        DELETE FROM git_file_changes;
148        DELETE FROM git_commits;
149        ",
150    )?;
151
152    for commit in &prepared.commits {
153        conn.execute(
154            "INSERT INTO git_commits(hash, author_name, author_email, authored_at_s, committed_at_s, subject, body, changed_file_count)
155             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, 0)",
156            params![
157                commit.hash,
158                commit.author_name,
159                commit.author_email,
160                commit.authored_at_s,
161                commit.committed_at_s,
162                commit.subject,
163                commit.body,
164            ],
165        )?;
166    }
167
168    let mut changed_counts = BTreeMap::<String, i64>::new();
169    for change in prepared.changes {
170        *changed_counts.entry(change.commit_hash.clone()).or_default() += 1;
171        conn.execute(
172            "INSERT INTO git_file_changes(commit_hash, path, additions, deletions, change_kind)
173             VALUES (?1, ?2, ?3, ?4, ?5)",
174            params![
175                change.commit_hash,
176                change.path,
177                change.additions,
178                change.deletions,
179                change.change_kind,
180            ],
181        )?;
182    }
183    for (hash, count) in changed_counts {
184        conn.execute(
185            "UPDATE git_commits SET changed_file_count = ?2 WHERE hash = ?1",
186            params![hash, count],
187        )?;
188    }
189
190    conn.execute_batch(
191        "
192        INSERT INTO commit_fts(rowid, subject, body)
193        SELECT rowid, subject, body FROM git_commits;
194        ",
195    )?;
196    set_meta(conn, "git_history_indexed_head", &repo.head)?;
197    status(conn, root)
198}
199
200pub fn index(conn: &Connection, root: &Path) -> anyhow::Result<GitHistoryIndexStatus> {
201    let prepared = prepare(root)?;
202    apply_prepared(conn, root, prepared)
203}
204
205pub fn status(conn: &Connection, root: &Path) -> anyhow::Result<GitHistoryIndexStatus> {
206    let repo = git_repo(root);
207    let commit_count = count_table(conn, "git_commits")?;
208    let file_change_count = count_table(conn, "git_file_changes")?;
209    Ok(GitHistoryIndexStatus {
210        available: repo.is_some(),
211        head: repo.map(|repo| repo.head),
212        indexed_head: meta(conn, "git_history_indexed_head")?,
213        commit_count,
214        file_change_count,
215    })
216}
217
218pub fn commit_search(
219    conn: &Connection,
220    query: &str,
221    limit: u32,
222) -> anyhow::Result<Vec<CommitSearchHit>> {
223    let fts_query = fts_query(query);
224    let mut stmt = conn.prepare(
225        "
226        SELECT git_commits.hash, git_commits.author_name, git_commits.author_email,
227               git_commits.authored_at_s, git_commits.committed_at_s,
228               git_commits.subject, git_commits.body, git_commits.changed_file_count,
229               bm25(commit_fts) AS score
230        FROM commit_fts
231        JOIN git_commits ON git_commits.rowid = commit_fts.rowid
232        WHERE commit_fts MATCH ?1
233        ORDER BY score, git_commits.authored_at_s DESC
234        LIMIT ?2
235        ",
236    )?;
237    let rows = stmt.query_map(params![fts_query, i64::from(limit)], |row| {
238        Ok(CommitSearchHit {
239            hash: row.get(0)?,
240            author_name: row.get(1)?,
241            author_email: row.get(2)?,
242            authored_at_s: row.get(3)?,
243            committed_at_s: row.get(4)?,
244            subject: row.get(5)?,
245            body: row.get(6)?,
246            changed_file_count: row.get(7)?,
247            score: row.get(8)?,
248            evidence_kind: "historical",
249        })
250    })?;
251    let mut hits = collect_rows(rows)?;
252    for (rank, hit) in hits.iter_mut().enumerate() {
253        hit.score = positive_rank_score(rank);
254    }
255    Ok(hits)
256}
257
/// Maps a zero-based rank to a score in `(0, 1]`: `1 / sqrt(rank + 1)`.
fn positive_rank_score(rank: usize) -> f64 {
    let position = (rank + 1) as f64;
    position.sqrt().recip()
}
261
262pub fn history_for_path(
263    conn: &Connection,
264    path: &str,
265    limit: u32,
266) -> anyhow::Result<Vec<PathHistoryItem>> {
267    let mut stmt = conn.prepare(
268        "
269        SELECT git_commits.hash, git_file_changes.path, git_file_changes.additions,
270               git_file_changes.deletions, git_file_changes.change_kind,
271               git_commits.author_name, git_commits.authored_at_s, git_commits.subject
272        FROM git_file_changes
273        JOIN git_commits ON git_commits.hash = git_file_changes.commit_hash
274        WHERE git_file_changes.path = ?1
275        ORDER BY git_commits.authored_at_s DESC, git_commits.hash
276        LIMIT ?2
277        ",
278    )?;
279    let rows = stmt.query_map(params![path, i64::from(limit)], path_history_row)?;
280    collect_rows(rows)
281}
282
283pub fn commits_touching_query(
284    conn: &Connection,
285    query: &str,
286    limit: u32,
287    current_hits: &[SearchHit],
288) -> anyhow::Result<Vec<QueryCommitHit>> {
289    let mut combined = BTreeMap::<String, QueryCommitHit>::new();
290    for (rank, hit) in commit_search(conn, query, limit)?.into_iter().enumerate() {
291        combined.insert(
292            hit.hash.clone(),
293            QueryCommitHit {
294                hash: hit.hash,
295                author_name: hit.author_name,
296                authored_at_s: hit.authored_at_s,
297                subject: hit.subject,
298                changed_file_count: hit.changed_file_count,
299                evidence: vec!["commit_message".to_string()],
300                score: rank as f64,
301                evidence_kind: "historical",
302            },
303        );
304    }
305
306    let mut paths = BTreeSet::new();
307    for hit in current_hits {
308        paths.insert(hit.path.as_str());
309    }
310    for path in paths {
311        for item in history_for_path(conn, path, limit)? {
312            let entry = combined.entry(item.hash.clone()).or_insert_with(|| QueryCommitHit {
313                hash: item.hash.clone(),
314                author_name: item.author_name.clone(),
315                authored_at_s: item.authored_at_s,
316                subject: item.subject.clone(),
317                changed_file_count: 0,
318                evidence: Vec::new(),
319                score: f64::from(limit),
320                evidence_kind: "historical",
321            });
322            if !entry.evidence.iter().any(|value| value == "file_change") {
323                entry.evidence.push("file_change".to_string());
324            }
325            entry.score -= 0.25;
326        }
327    }
328
329    let mut hits = combined.into_values().collect::<Vec<_>>();
330    hits.sort_by(|left, right| {
331        left.score
332            .partial_cmp(&right.score)
333            .unwrap_or(std::cmp::Ordering::Equal)
334            .then_with(|| right.authored_at_s.cmp(&left.authored_at_s))
335    });
336    hits.truncate(usize::try_from(limit).unwrap_or(usize::MAX));
337    Ok(hits)
338}
339
340pub fn cached_blame(
341    conn: &Connection,
342    chunk_id: i64,
343    source_text_hash: &str,
344) -> anyhow::Result<Option<ChunkBlameSummary>> {
345    conn.query_row(
346        "
347        SELECT chunk_id, path, start_line, end_line, source_text_hash, line_count,
348               dominant_commit, dominant_commit_lines, newest_commit, newest_commit_time_s,
349               oldest_commit, oldest_commit_time_s, commit_counts_json
350        FROM git_chunk_blame
351        WHERE chunk_id = ?1 AND source_text_hash = ?2
352        ",
353        params![chunk_id, source_text_hash],
354        blame_row,
355    )
356    .optional()
357    .map_err(Into::into)
358}
359
360pub fn store_blame(conn: &Connection, summary: &ChunkBlameSummary) -> anyhow::Result<()> {
361    let counts = serde_json::to_string(&summary.commit_counts)?;
362    conn.execute(
363        "
364        INSERT INTO git_chunk_blame(
365            chunk_id, source_text_hash, path, start_line, end_line, line_count,
366            dominant_commit, dominant_commit_lines, newest_commit, newest_commit_time_s,
367            oldest_commit, oldest_commit_time_s, commit_counts_json, computed_at_ms
368        )
369        VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14)
370        ON CONFLICT(chunk_id) DO UPDATE SET
371            source_text_hash = excluded.source_text_hash,
372            path = excluded.path,
373            start_line = excluded.start_line,
374            end_line = excluded.end_line,
375            line_count = excluded.line_count,
376            dominant_commit = excluded.dominant_commit,
377            dominant_commit_lines = excluded.dominant_commit_lines,
378            newest_commit = excluded.newest_commit,
379            newest_commit_time_s = excluded.newest_commit_time_s,
380            oldest_commit = excluded.oldest_commit,
381            oldest_commit_time_s = excluded.oldest_commit_time_s,
382            commit_counts_json = excluded.commit_counts_json,
383            computed_at_ms = excluded.computed_at_ms
384        ",
385        params![
386            summary.chunk_id,
387            summary.source_text_hash,
388            summary.path,
389            summary.start_line,
390            summary.end_line,
391            summary.line_count,
392            summary.dominant_commit,
393            summary.dominant_commit_lines,
394            summary.newest_commit,
395            summary.newest_commit_time_s,
396            summary.oldest_commit,
397            summary.oldest_commit_time_s,
398            counts,
399            crate::index::now_ms(),
400        ],
401    )?;
402    Ok(())
403}
404
405pub fn blame_lines(root: &Path, path: &str, start_line: i64, end_line: i64) -> Vec<BlameLine> {
406    let range = format!("{start_line},{end_line}");
407    let Some(output) = git_output(root, &["blame", "--line-porcelain", "-L", &range, "--", path])
408    else {
409        return Vec::new();
410    };
411    parse_blame(&output)
412}
413
/// Blame attribution for a single source line.
#[derive(Debug, Clone)]
pub struct BlameLine {
    /// Hash of the commit the line is attributed to.
    pub commit: String,
    /// Value of the `author-time` header for that line, when present and numeric.
    pub author_time_s: Option<i64>,
}
419
420pub fn source_text_hash(text: &str) -> String {
421    hex_sha256(text.as_bytes())
422}
423
424fn clear(conn: &Connection) -> anyhow::Result<()> {
425    conn.execute_batch(
426        "
427        DELETE FROM commit_fts;
428        DELETE FROM git_chunk_blame;
429        DELETE FROM git_file_changes;
430        DELETE FROM git_commits;
431        DELETE FROM index_meta WHERE key = 'git_history_indexed_head';
432        ",
433    )?;
434    Ok(())
435}
436
437fn read_commits(root: &Path) -> anyhow::Result<Vec<CommitRecord>> {
438    let Some(output) = git_output(
439        root,
440        &["log", "--format=format:%H%x1f%an%x1f%ae%x1f%at%x1f%ct%x1f%s%x1f%B%x1e", "--", "."],
441    ) else {
442        return Ok(Vec::new());
443    };
444    Ok(output
445        .split('\x1e')
446        .collect::<Vec<_>>()
447        .into_par_iter()
448        .filter_map(parse_commit_record)
449        .collect())
450}
451
452fn read_file_changes(root: &Path, worktree_root: &Path) -> anyhow::Result<Vec<FileChange>> {
453    let Some(output) = git_output(root, &["log", "--numstat", "--format=format:%x1e%H", "--", "."])
454    else {
455        return Ok(Vec::new());
456    };
457    Ok(output
458        .split('\x1e')
459        .collect::<Vec<_>>()
460        .into_par_iter()
461        .flat_map(|record| parse_file_change_record(root, worktree_root, record))
462        .collect())
463}
464
465fn parse_commit_record(record: &str) -> Option<CommitRecord> {
466    let record = record.trim();
467    if record.is_empty() {
468        return None;
469    }
470    let mut parts = record.splitn(7, '\x1f');
471    let hash = parts.next()?;
472    let author_name = parts.next()?;
473    let author_email = parts.next()?;
474    let authored_at_s = parts.next()?;
475    let committed_at_s = parts.next()?;
476    let subject = parts.next()?;
477    let body = parts.next().unwrap_or_default().trim().to_string();
478    Some(CommitRecord {
479        hash: hash.to_string(),
480        author_name: author_name.to_string(),
481        author_email: author_email.to_string(),
482        authored_at_s: authored_at_s.parse().unwrap_or(0),
483        committed_at_s: committed_at_s.parse().unwrap_or(0),
484        subject: subject.to_string(),
485        body,
486    })
487}
488
489fn parse_file_change_record(root: &Path, worktree_root: &Path, record: &str) -> Vec<FileChange> {
490    let mut lines = record.lines().filter(|line| !line.trim().is_empty());
491    let Some(hash) = lines.next().map(str::trim).filter(|line| !line.is_empty()) else {
492        return Vec::new();
493    };
494    let mut changes = Vec::new();
495    for line in lines {
496        let fields = line.split('\t').collect::<Vec<_>>();
497        if fields.len() < 3 {
498            continue;
499        }
500        let Some(path) = normalize_git_path(root, worktree_root, fields[2]) else {
501            continue;
502        };
503        changes.push(FileChange {
504            commit_hash: hash.to_string(),
505            path,
506            additions: parse_numstat_count(fields[0]),
507            deletions: parse_numstat_count(fields[1]),
508            change_kind: "modified".to_string(),
509        });
510    }
511    changes
512}
513
514fn normalize_git_path(root: &Path, worktree_root: &Path, path: &str) -> Option<String> {
515    let path = normalize_rename_path(path);
516    let path = Path::new(path);
517    if let Ok(relative) = worktree_root.join(path).strip_prefix(root) {
518        return Some(path_string(relative));
519    }
520    if root.join(path).exists() || !path.is_absolute() {
521        return Some(path_string(path));
522    }
523    None
524}
525
/// Returns the text after the last ` => ` in a numstat path, with `{` and then `}` trimmed
/// from both ends; paths without ` => ` are only trimmed.
///
/// This returns a sub-slice of the input, so it cannot stitch a shared prefix or suffix back
/// onto the destination of a compressed rename such as `src/{old => new}/file.rs`.
fn normalize_rename_path(path: &str) -> &str {
    const ARROW: &str = " => ";
    let destination = match path.rfind(ARROW) {
        Some(at) => &path[at + ARROW.len()..],
        None => path,
    };
    let without_open = destination.trim_matches('{');
    without_open.trim_matches('}')
}
529
/// Parses a numstat addition/deletion column.
///
/// Returns `None` for the `-` placeholder and for anything that is not a valid `i64`.
fn parse_numstat_count(value: &str) -> Option<i64> {
    match value {
        "-" => None,
        digits => digits.parse().ok(),
    }
}
533
534fn parse_blame(output: &str) -> Vec<BlameLine> {
535    let mut lines = Vec::new();
536    let mut current_commit = None::<String>;
537    let mut current_time = None::<i64>;
538    for line in output.lines() {
539        if let Some((hash, _rest)) = line.split_once(' ')
540            && hash.len() == 40
541            && hash.chars().all(|c| c.is_ascii_hexdigit())
542        {
543            current_commit = Some(hash.to_string());
544            current_time = None;
545            continue;
546        }
547        if let Some(value) = line.strip_prefix("author-time ") {
548            current_time = value.parse().ok();
549            continue;
550        }
551        if line.starts_with('\t')
552            && let Some(commit) = current_commit.clone()
553        {
554            lines.push(BlameLine { commit, author_time_s: current_time });
555        }
556    }
557    lines
558}
559
560fn path_history_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<PathHistoryItem> {
561    Ok(PathHistoryItem {
562        hash: row.get(0)?,
563        path: row.get(1)?,
564        additions: row.get(2)?,
565        deletions: row.get(3)?,
566        change_kind: row.get(4)?,
567        author_name: row.get(5)?,
568        authored_at_s: row.get(6)?,
569        subject: row.get(7)?,
570        evidence_kind: "historical",
571    })
572}
573
574fn blame_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<ChunkBlameSummary> {
575    let counts_json: String = row.get(12)?;
576    let commit_counts = serde_json::from_str(&counts_json).unwrap_or_default();
577    Ok(ChunkBlameSummary {
578        chunk_id: row.get(0)?,
579        path: row.get(1)?,
580        start_line: row.get(2)?,
581        end_line: row.get(3)?,
582        source_text_hash: row.get(4)?,
583        line_count: row.get(5)?,
584        dominant_commit: row.get(6)?,
585        dominant_commit_lines: row.get(7)?,
586        newest_commit: row.get(8)?,
587        newest_commit_time_s: row.get(9)?,
588        oldest_commit: row.get(10)?,
589        oldest_commit_time_s: row.get(11)?,
590        commit_counts,
591        evidence_kind: "historical",
592    })
593}
594
595fn collect_rows<T>(
596    rows: rusqlite::MappedRows<'_, impl FnMut(&rusqlite::Row<'_>) -> rusqlite::Result<T>>,
597) -> anyhow::Result<Vec<T>> {
598    let mut out = Vec::new();
599    for row in rows {
600        out.push(row?);
601    }
602    Ok(out)
603}
604
605fn count_table(conn: &Connection, table: &str) -> anyhow::Result<u64> {
606    let count =
607        conn.query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |row| row.get::<_, i64>(0))?;
608    Ok(u64::try_from(count).unwrap_or(0))
609}
610
611fn git_repo(root: &Path) -> Option<GitRepo> {
612    let worktree_root = git_output(root, &["rev-parse", "--show-toplevel"])?;
613    let head = git_output(root, &["rev-parse", "HEAD"])?;
614    Some(GitRepo { worktree_root: PathBuf::from(worktree_root), head })
615}
616
/// Runs `git` with `args` in `root` and returns its trimmed stdout.
///
/// Returns `None` when the process cannot be started or exits with a non-zero status.
/// Invalid UTF-8 in the output is replaced lossily.
fn git_output(root: &Path, args: &[&str]) -> Option<String> {
    match Command::new("git").args(args).current_dir(root).output() {
        Ok(output) if output.status.success() => {
            let stdout = String::from_utf8_lossy(&output.stdout);
            Some(stdout.trim().to_string())
        }
        _ => None,
    }
}
624
/// Builds an FTS `MATCH` expression from free-form user text.
///
/// The input is split on every character that is not alphanumeric, `_` or `-`; each remaining
/// term is wrapped in double quotes and the terms are joined with ` OR `. When no term
/// remains, the expression is a single empty quoted string.
fn fts_query(query: &str) -> String {
    let is_term_char = |c: char| c.is_alphanumeric() || c == '_' || c == '-';
    let mut phrases: Vec<String> = Vec::new();
    for term in query.split(|c: char| !is_term_char(c)) {
        if term.is_empty() {
            continue;
        }
        // `"` is itself a separator, so a term can never contain a quote that needs escaping.
        phrases.push(format!("\"{term}\""));
    }
    if phrases.is_empty() {
        return "\"\"".to_string();
    }
    phrases.join(" OR ")
}
633
634fn meta(conn: &Connection, key: &str) -> anyhow::Result<Option<String>> {
635    Ok(conn
636        .query_row("SELECT value FROM index_meta WHERE key = ?1", [key], |row| row.get(0))
637        .optional()?)
638}
639
640fn set_meta(conn: &Connection, key: &str, value: &str) -> anyhow::Result<()> {
641    conn.execute(
642        "INSERT INTO index_meta(key, value) VALUES (?1, ?2)
643         ON CONFLICT(key) DO UPDATE SET value = excluded.value",
644        params![key, value],
645    )?;
646    Ok(())
647}
648
649fn hex_sha256(bytes: &[u8]) -> String {
650    let hash = Sha256::digest(bytes);
651    let mut out = String::with_capacity(hash.len() * 2);
652    for byte in hash {
653        use std::fmt::Write as _;
654        let _ = write!(out, "{byte:02x}");
655    }
656    out
657}
658
/// Renders `path` as a string with every backslash replaced by `/`.
///
/// Non-UTF-8 path components are converted lossily.
fn path_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}