1use std::{
2 collections::{BTreeMap, BTreeSet},
3 path::{Path, PathBuf},
4 process::Command,
5};
6
7use rayon::prelude::*;
8use rusqlite::{Connection, OptionalExtension, params};
9use serde::Serialize;
10use sha2::{Digest, Sha256};
11
12use crate::search::lexical::SearchHit;
13
/// Snapshot of the git-history index state, as produced by `status`.
#[derive(Debug, Clone, Serialize)]
pub struct GitHistoryIndexStatus {
    /// `true` when `git rev-parse` resolved both the work tree and `HEAD` for the queried root.
    pub available: bool,
    /// Current `HEAD` commit hash of the repository, if a repository was found.
    pub head: Option<String>,
    /// Value stored under the `git_history_indexed_head` metadata key, i.e. the `HEAD`
    /// recorded by the last indexing run that completed.
    pub indexed_head: Option<String>,
    /// Number of rows currently in the `git_commits` table.
    pub commit_count: u64,
    /// Number of rows currently in the `git_file_changes` table.
    pub file_change_count: u64,
}
22
/// One commit returned by `commit_search` (full-text match on subject/body).
#[derive(Debug, Clone, Serialize)]
pub struct CommitSearchHit {
    /// Commit hash as stored in `git_commits.hash`.
    pub hash: String,
    /// Author name stored for the commit.
    pub author_name: String,
    /// Author e-mail stored for the commit.
    pub author_email: String,
    /// Author timestamp in seconds (the indexer fills it from git's `%at` placeholder).
    pub authored_at_s: i64,
    /// Committer timestamp in seconds (the indexer fills it from git's `%ct` placeholder).
    pub committed_at_s: i64,
    /// Commit subject line.
    pub subject: String,
    /// Commit message text stored in the `body` column.
    pub body: String,
    /// Value of `git_commits.changed_file_count` for this commit.
    pub changed_file_count: i64,
    /// Rank-derived score assigned by `commit_search`: `1 / sqrt(rank + 1)`, so the best
    /// match gets `1.0` and higher is better.
    pub score: f64,
    /// Evidence label; `commit_search` always sets `"historical"`.
    pub evidence_kind: &'static str,
}
36
/// One recorded change of a file by a commit, as read back by `path_history_row`.
#[derive(Debug, Clone, Serialize)]
pub struct PathHistoryItem {
    /// Hash of the commit that changed the file.
    pub hash: String,
    /// Path of the changed file as stored in `git_file_changes.path`.
    pub path: String,
    /// Added-line count; `None` when the stored value is NULL (the indexer stores NULL when
    /// the numstat field is `-` or does not parse as an integer).
    pub additions: Option<i64>,
    /// Deleted-line count; `None` under the same conditions as `additions`.
    pub deletions: Option<i64>,
    /// Stored change kind. NOTE(review): the indexer in this file writes `"modified"` for
    /// every row, so this does not currently distinguish adds, deletes or renames.
    pub change_kind: String,
    /// Author name of the commit.
    pub author_name: String,
    /// Author timestamp of the commit, in seconds.
    pub authored_at_s: i64,
    /// Commit subject line.
    pub subject: String,
    /// Evidence label; `path_history_row` always sets `"historical"`.
    pub evidence_kind: &'static str,
}
49
/// A path-history entry associated with a symbol.
///
/// NOTE(review): nothing in this module constructs this type, so the field notes below are
/// inferred from the field names only — confirm against the code that produces it.
#[derive(Debug, Clone, Serialize)]
pub struct SymbolHistoryItem {
    /// Symbol name (presumably the short, unqualified name).
    pub symbol: String,
    /// Qualified symbol name.
    pub qualified_name: String,
    /// Path of the file containing the symbol.
    pub path: String,
    /// Start offset of the symbol; presumably a byte offset into the file — TODO confirm.
    pub start_byte: i64,
    /// End offset of the symbol; whether it is inclusive is not visible here — TODO confirm.
    pub end_byte: i64,
    /// The commit/file-change record attached to the symbol.
    pub commit: PathHistoryItem,
    /// Evidence label supplied by the producer.
    pub evidence_kind: &'static str,
}
60
/// A commit related to a search query, as assembled by `commits_touching_query`.
#[derive(Debug, Clone, Serialize)]
pub struct QueryCommitHit {
    /// Commit hash.
    pub hash: String,
    /// Author name of the commit.
    pub author_name: String,
    /// Author timestamp of the commit, in seconds.
    pub authored_at_s: i64,
    /// Commit subject line.
    pub subject: String,
    /// Changed-file count reported for the commit.
    pub changed_file_count: i64,
    /// Why the commit was included: `"commit_message"` when the message matched the query,
    /// `"file_change"` when the commit touched a file among the current search hits.
    pub evidence: Vec<String>,
    /// Ordering key used by `commits_touching_query`; results are sorted ascending, so a
    /// LOWER score ranks first (unlike `CommitSearchHit::score`).
    pub score: f64,
    /// Evidence label; `commits_touching_query` always sets `"historical"`.
    pub evidence_kind: &'static str,
}
72
/// Blame summary for one chunk, cached in the `git_chunk_blame` table (one row per
/// `chunk_id`, see `store_blame` / `cached_blame`).
///
/// NOTE(review): the summary is computed outside this module; the per-field notes that
/// describe meaning (rather than storage) are inferred from names — confirm with the producer.
#[derive(Debug, Clone, Serialize)]
pub struct ChunkBlameSummary {
    /// Identifier of the chunk; the conflict key of the cache table.
    pub chunk_id: i64,
    /// Path of the file the chunk belongs to.
    pub path: String,
    /// First line of the chunk (presumably 1-based, matching `git blame -L` — TODO confirm).
    pub start_line: i64,
    /// Last line of the chunk.
    pub end_line: i64,
    /// Hash of the chunk text; `cached_blame` only returns a row when this matches, so a
    /// changed chunk misses the cache.
    pub source_text_hash: String,
    /// Number of blamed lines.
    pub line_count: i64,
    /// Commit that presumably owns the most lines of the chunk, if any.
    pub dominant_commit: Option<String>,
    /// Number of lines attributed to `dominant_commit`.
    pub dominant_commit_lines: i64,
    /// Most recent commit seen in the chunk, if any.
    pub newest_commit: Option<String>,
    /// Timestamp (seconds) of `newest_commit`.
    pub newest_commit_time_s: Option<i64>,
    /// Oldest commit seen in the chunk, if any.
    pub oldest_commit: Option<String>,
    /// Timestamp (seconds) of `oldest_commit`.
    pub oldest_commit_time_s: Option<i64>,
    /// Per-commit counts; persisted as JSON in the `commit_counts_json` column.
    pub commit_counts: BTreeMap<String, i64>,
    /// Evidence label; rows read from the cache get `"historical"`.
    pub evidence_kind: &'static str,
}
90
/// Repository facts resolved through `git rev-parse` (see `git_repo`).
#[derive(Debug)]
struct GitRepo {
    /// Output of `git rev-parse --show-toplevel`: the top-level directory of the work tree.
    worktree_root: PathBuf,
    /// Output of `git rev-parse HEAD`: the current commit hash.
    head: String,
}
96
/// One commit parsed from the `git log` output requested by `read_commits`.
#[derive(Debug)]
struct CommitRecord {
    /// Commit hash (`%H`).
    hash: String,
    /// Author name (`%an`).
    author_name: String,
    /// Author e-mail (`%ae`).
    author_email: String,
    /// Author date as a UNIX timestamp (`%at`); `0` when the field does not parse.
    authored_at_s: i64,
    /// Committer date as a UNIX timestamp (`%ct`); `0` when the field does not parse.
    committed_at_s: i64,
    /// Subject line (`%s`).
    subject: String,
    /// Trimmed raw message (`%B`), which per git's format documentation includes the subject.
    body: String,
}
107
/// One `--numstat` line of a commit, parsed by `parse_file_change_record`.
#[derive(Debug)]
struct FileChange {
    /// Hash of the commit the change belongs to.
    commit_hash: String,
    /// Changed path after `normalize_git_path`.
    path: String,
    /// Added-line count; `None` when numstat printed `-` or the field is not an integer.
    additions: Option<i64>,
    /// Deleted-line count; `None` under the same conditions as `additions`.
    deletions: Option<i64>,
    /// Change kind; the parser currently always stores `"modified"`.
    change_kind: String,
}
116
/// Git history gathered by `prepare`, ready to be written by `apply_prepared`.
#[derive(Debug)]
pub(crate) struct PreparedGitHistory {
    /// Resolved repository, or `None` when no repository/`HEAD` was found for the root.
    repo: Option<GitRepo>,
    /// Commits parsed from `git log`; empty when `repo` is `None`.
    commits: Vec<CommitRecord>,
    /// Per-file changes parsed from `git log --numstat`; empty when `repo` is `None`.
    changes: Vec<FileChange>,
}
123
124pub(crate) fn prepare(root: &Path) -> anyhow::Result<PreparedGitHistory> {
125 let Some(repo) = git_repo(root) else {
126 return Ok(PreparedGitHistory { repo: None, commits: Vec::new(), changes: Vec::new() });
127 };
128 let commits = read_commits(root)?;
129 let changes = read_file_changes(root, &repo.worktree_root)?;
130 Ok(PreparedGitHistory { repo: Some(repo), commits, changes })
131}
132
133pub(crate) fn apply_prepared(
134 conn: &Connection,
135 root: &Path,
136 prepared: PreparedGitHistory,
137) -> anyhow::Result<GitHistoryIndexStatus> {
138 let Some(repo) = prepared.repo else {
139 clear(conn)?;
140 return status(conn, root);
141 };
142
143 conn.execute_batch(
144 "
145 DELETE FROM commit_fts;
146 DELETE FROM git_chunk_blame;
147 DELETE FROM git_file_changes;
148 DELETE FROM git_commits;
149 ",
150 )?;
151
152 for commit in &prepared.commits {
153 conn.execute(
154 "INSERT INTO git_commits(hash, author_name, author_email, authored_at_s, committed_at_s, subject, body, changed_file_count)
155 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, 0)",
156 params![
157 commit.hash,
158 commit.author_name,
159 commit.author_email,
160 commit.authored_at_s,
161 commit.committed_at_s,
162 commit.subject,
163 commit.body,
164 ],
165 )?;
166 }
167
168 let mut changed_counts = BTreeMap::<String, i64>::new();
169 for change in prepared.changes {
170 *changed_counts.entry(change.commit_hash.clone()).or_default() += 1;
171 conn.execute(
172 "INSERT INTO git_file_changes(commit_hash, path, additions, deletions, change_kind)
173 VALUES (?1, ?2, ?3, ?4, ?5)",
174 params![
175 change.commit_hash,
176 change.path,
177 change.additions,
178 change.deletions,
179 change.change_kind,
180 ],
181 )?;
182 }
183 for (hash, count) in changed_counts {
184 conn.execute(
185 "UPDATE git_commits SET changed_file_count = ?2 WHERE hash = ?1",
186 params![hash, count],
187 )?;
188 }
189
190 conn.execute_batch(
191 "
192 INSERT INTO commit_fts(rowid, subject, body)
193 SELECT rowid, subject, body FROM git_commits;
194 ",
195 )?;
196 set_meta(conn, "git_history_indexed_head", &repo.head)?;
197 status(conn, root)
198}
199
200pub fn index(conn: &Connection, root: &Path) -> anyhow::Result<GitHistoryIndexStatus> {
201 let prepared = prepare(root)?;
202 apply_prepared(conn, root, prepared)
203}
204
205pub fn status(conn: &Connection, root: &Path) -> anyhow::Result<GitHistoryIndexStatus> {
206 let repo = git_repo(root);
207 let commit_count = count_table(conn, "git_commits")?;
208 let file_change_count = count_table(conn, "git_file_changes")?;
209 Ok(GitHistoryIndexStatus {
210 available: repo.is_some(),
211 head: repo.map(|repo| repo.head),
212 indexed_head: meta(conn, "git_history_indexed_head")?,
213 commit_count,
214 file_change_count,
215 })
216}
217
218pub fn commit_search(
219 conn: &Connection,
220 query: &str,
221 limit: u32,
222) -> anyhow::Result<Vec<CommitSearchHit>> {
223 let fts_query = fts_query(query);
224 let mut stmt = conn.prepare(
225 "
226 SELECT git_commits.hash, git_commits.author_name, git_commits.author_email,
227 git_commits.authored_at_s, git_commits.committed_at_s,
228 git_commits.subject, git_commits.body, git_commits.changed_file_count,
229 bm25(commit_fts) AS score
230 FROM commit_fts
231 JOIN git_commits ON git_commits.rowid = commit_fts.rowid
232 WHERE commit_fts MATCH ?1
233 ORDER BY score, git_commits.authored_at_s DESC
234 LIMIT ?2
235 ",
236 )?;
237 let rows = stmt.query_map(params![fts_query, i64::from(limit)], |row| {
238 Ok(CommitSearchHit {
239 hash: row.get(0)?,
240 author_name: row.get(1)?,
241 author_email: row.get(2)?,
242 authored_at_s: row.get(3)?,
243 committed_at_s: row.get(4)?,
244 subject: row.get(5)?,
245 body: row.get(6)?,
246 changed_file_count: row.get(7)?,
247 score: row.get(8)?,
248 evidence_kind: "historical",
249 })
250 })?;
251 let mut hits = collect_rows(rows)?;
252 for (rank, hit) in hits.iter_mut().enumerate() {
253 hit.score = positive_rank_score(rank);
254 }
255 Ok(hits)
256}
257
/// Maps a zero-based rank to a positive score: `1 / sqrt(rank + 1)`.
///
/// Rank 0 yields `1.0`; the score decreases as the rank grows.
fn positive_rank_score(rank: usize) -> f64 {
    let position = (rank + 1) as f64;
    position.sqrt().recip()
}
261
262pub fn history_for_path(
263 conn: &Connection,
264 path: &str,
265 limit: u32,
266) -> anyhow::Result<Vec<PathHistoryItem>> {
267 let mut stmt = conn.prepare(
268 "
269 SELECT git_commits.hash, git_file_changes.path, git_file_changes.additions,
270 git_file_changes.deletions, git_file_changes.change_kind,
271 git_commits.author_name, git_commits.authored_at_s, git_commits.subject
272 FROM git_file_changes
273 JOIN git_commits ON git_commits.hash = git_file_changes.commit_hash
274 WHERE git_file_changes.path = ?1
275 ORDER BY git_commits.authored_at_s DESC, git_commits.hash
276 LIMIT ?2
277 ",
278 )?;
279 let rows = stmt.query_map(params![path, i64::from(limit)], path_history_row)?;
280 collect_rows(rows)
281}
282
283pub fn commits_touching_query(
284 conn: &Connection,
285 query: &str,
286 limit: u32,
287 current_hits: &[SearchHit],
288) -> anyhow::Result<Vec<QueryCommitHit>> {
289 let mut combined = BTreeMap::<String, QueryCommitHit>::new();
290 for (rank, hit) in commit_search(conn, query, limit)?.into_iter().enumerate() {
291 combined.insert(
292 hit.hash.clone(),
293 QueryCommitHit {
294 hash: hit.hash,
295 author_name: hit.author_name,
296 authored_at_s: hit.authored_at_s,
297 subject: hit.subject,
298 changed_file_count: hit.changed_file_count,
299 evidence: vec!["commit_message".to_string()],
300 score: rank as f64,
301 evidence_kind: "historical",
302 },
303 );
304 }
305
306 let mut paths = BTreeSet::new();
307 for hit in current_hits {
308 paths.insert(hit.path.as_str());
309 }
310 for path in paths {
311 for item in history_for_path(conn, path, limit)? {
312 let entry = combined.entry(item.hash.clone()).or_insert_with(|| QueryCommitHit {
313 hash: item.hash.clone(),
314 author_name: item.author_name.clone(),
315 authored_at_s: item.authored_at_s,
316 subject: item.subject.clone(),
317 changed_file_count: 0,
318 evidence: Vec::new(),
319 score: f64::from(limit),
320 evidence_kind: "historical",
321 });
322 if !entry.evidence.iter().any(|value| value == "file_change") {
323 entry.evidence.push("file_change".to_string());
324 }
325 entry.score -= 0.25;
326 }
327 }
328
329 let mut hits = combined.into_values().collect::<Vec<_>>();
330 hits.sort_by(|left, right| {
331 left.score
332 .partial_cmp(&right.score)
333 .unwrap_or(std::cmp::Ordering::Equal)
334 .then_with(|| right.authored_at_s.cmp(&left.authored_at_s))
335 });
336 hits.truncate(usize::try_from(limit).unwrap_or(usize::MAX));
337 Ok(hits)
338}
339
340pub fn cached_blame(
341 conn: &Connection,
342 chunk_id: i64,
343 source_text_hash: &str,
344) -> anyhow::Result<Option<ChunkBlameSummary>> {
345 conn.query_row(
346 "
347 SELECT chunk_id, path, start_line, end_line, source_text_hash, line_count,
348 dominant_commit, dominant_commit_lines, newest_commit, newest_commit_time_s,
349 oldest_commit, oldest_commit_time_s, commit_counts_json
350 FROM git_chunk_blame
351 WHERE chunk_id = ?1 AND source_text_hash = ?2
352 ",
353 params![chunk_id, source_text_hash],
354 blame_row,
355 )
356 .optional()
357 .map_err(Into::into)
358}
359
360pub fn store_blame(conn: &Connection, summary: &ChunkBlameSummary) -> anyhow::Result<()> {
361 let counts = serde_json::to_string(&summary.commit_counts)?;
362 conn.execute(
363 "
364 INSERT INTO git_chunk_blame(
365 chunk_id, source_text_hash, path, start_line, end_line, line_count,
366 dominant_commit, dominant_commit_lines, newest_commit, newest_commit_time_s,
367 oldest_commit, oldest_commit_time_s, commit_counts_json, computed_at_ms
368 )
369 VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14)
370 ON CONFLICT(chunk_id) DO UPDATE SET
371 source_text_hash = excluded.source_text_hash,
372 path = excluded.path,
373 start_line = excluded.start_line,
374 end_line = excluded.end_line,
375 line_count = excluded.line_count,
376 dominant_commit = excluded.dominant_commit,
377 dominant_commit_lines = excluded.dominant_commit_lines,
378 newest_commit = excluded.newest_commit,
379 newest_commit_time_s = excluded.newest_commit_time_s,
380 oldest_commit = excluded.oldest_commit,
381 oldest_commit_time_s = excluded.oldest_commit_time_s,
382 commit_counts_json = excluded.commit_counts_json,
383 computed_at_ms = excluded.computed_at_ms
384 ",
385 params![
386 summary.chunk_id,
387 summary.source_text_hash,
388 summary.path,
389 summary.start_line,
390 summary.end_line,
391 summary.line_count,
392 summary.dominant_commit,
393 summary.dominant_commit_lines,
394 summary.newest_commit,
395 summary.newest_commit_time_s,
396 summary.oldest_commit,
397 summary.oldest_commit_time_s,
398 counts,
399 crate::index::now_ms(),
400 ],
401 )?;
402 Ok(())
403}
404
405pub fn blame_lines(root: &Path, path: &str, start_line: i64, end_line: i64) -> Vec<BlameLine> {
406 let range = format!("{start_line},{end_line}");
407 let Some(output) = git_output(root, &["blame", "--line-porcelain", "-L", &range, "--", path])
408 else {
409 return Vec::new();
410 };
411 parse_blame(&output)
412}
413
/// Attribution of a single blamed line, produced by `parse_blame`.
#[derive(Debug, Clone)]
pub struct BlameLine {
    /// Hash of the commit the line is attributed to (taken from the porcelain header line).
    pub commit: String,
    /// Value of the `author-time` header for the line; `None` when the header is missing
    /// or is not an integer.
    pub author_time_s: Option<i64>,
}
419
420pub fn source_text_hash(text: &str) -> String {
421 hex_sha256(text.as_bytes())
422}
423
424fn clear(conn: &Connection) -> anyhow::Result<()> {
425 conn.execute_batch(
426 "
427 DELETE FROM commit_fts;
428 DELETE FROM git_chunk_blame;
429 DELETE FROM git_file_changes;
430 DELETE FROM git_commits;
431 DELETE FROM index_meta WHERE key = 'git_history_indexed_head';
432 ",
433 )?;
434 Ok(())
435}
436
437fn read_commits(root: &Path) -> anyhow::Result<Vec<CommitRecord>> {
438 let Some(output) = git_output(
439 root,
440 &["log", "--format=format:%H%x1f%an%x1f%ae%x1f%at%x1f%ct%x1f%s%x1f%B%x1e", "--", "."],
441 ) else {
442 return Ok(Vec::new());
443 };
444 Ok(output
445 .split('\x1e')
446 .collect::<Vec<_>>()
447 .into_par_iter()
448 .filter_map(parse_commit_record)
449 .collect())
450}
451
452fn read_file_changes(root: &Path, worktree_root: &Path) -> anyhow::Result<Vec<FileChange>> {
453 let Some(output) = git_output(root, &["log", "--numstat", "--format=format:%x1e%H", "--", "."])
454 else {
455 return Ok(Vec::new());
456 };
457 Ok(output
458 .split('\x1e')
459 .collect::<Vec<_>>()
460 .into_par_iter()
461 .flat_map(|record| parse_file_change_record(root, worktree_root, record))
462 .collect())
463}
464
465fn parse_commit_record(record: &str) -> Option<CommitRecord> {
466 let record = record.trim();
467 if record.is_empty() {
468 return None;
469 }
470 let mut parts = record.splitn(7, '\x1f');
471 let hash = parts.next()?;
472 let author_name = parts.next()?;
473 let author_email = parts.next()?;
474 let authored_at_s = parts.next()?;
475 let committed_at_s = parts.next()?;
476 let subject = parts.next()?;
477 let body = parts.next().unwrap_or_default().trim().to_string();
478 Some(CommitRecord {
479 hash: hash.to_string(),
480 author_name: author_name.to_string(),
481 author_email: author_email.to_string(),
482 authored_at_s: authored_at_s.parse().unwrap_or(0),
483 committed_at_s: committed_at_s.parse().unwrap_or(0),
484 subject: subject.to_string(),
485 body,
486 })
487}
488
489fn parse_file_change_record(root: &Path, worktree_root: &Path, record: &str) -> Vec<FileChange> {
490 let mut lines = record.lines().filter(|line| !line.trim().is_empty());
491 let Some(hash) = lines.next().map(str::trim).filter(|line| !line.is_empty()) else {
492 return Vec::new();
493 };
494 let mut changes = Vec::new();
495 for line in lines {
496 let fields = line.split('\t').collect::<Vec<_>>();
497 if fields.len() < 3 {
498 continue;
499 }
500 let Some(path) = normalize_git_path(root, worktree_root, fields[2]) else {
501 continue;
502 };
503 changes.push(FileChange {
504 commit_hash: hash.to_string(),
505 path,
506 additions: parse_numstat_count(fields[0]),
507 deletions: parse_numstat_count(fields[1]),
508 change_kind: "modified".to_string(),
509 });
510 }
511 changes
512}
513
514fn normalize_git_path(root: &Path, worktree_root: &Path, path: &str) -> Option<String> {
515 let path = normalize_rename_path(path);
516 let path = Path::new(path);
517 if let Ok(relative) = worktree_root.join(path).strip_prefix(root) {
518 return Some(path_string(relative));
519 }
520 if root.join(path).exists() || !path.is_absolute() {
521 return Some(path_string(path));
522 }
523 None
524}
525
526fn normalize_rename_path(path: &str) -> &str {
527 path.rsplit(" => ").next().unwrap_or(path).trim_matches('{').trim_matches('}')
528}
529
/// Parses one numstat count field.
///
/// Returns `None` for the `-` placeholder and for anything that is not an `i64`.
fn parse_numstat_count(value: &str) -> Option<i64> {
    match value {
        "-" => None,
        digits => digits.parse().ok(),
    }
}
533
534fn parse_blame(output: &str) -> Vec<BlameLine> {
535 let mut lines = Vec::new();
536 let mut current_commit = None::<String>;
537 let mut current_time = None::<i64>;
538 for line in output.lines() {
539 if let Some((hash, _rest)) = line.split_once(' ')
540 && hash.len() == 40
541 && hash.chars().all(|c| c.is_ascii_hexdigit())
542 {
543 current_commit = Some(hash.to_string());
544 current_time = None;
545 continue;
546 }
547 if let Some(value) = line.strip_prefix("author-time ") {
548 current_time = value.parse().ok();
549 continue;
550 }
551 if line.starts_with('\t')
552 && let Some(commit) = current_commit.clone()
553 {
554 lines.push(BlameLine { commit, author_time_s: current_time });
555 }
556 }
557 lines
558}
559
560fn path_history_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<PathHistoryItem> {
561 Ok(PathHistoryItem {
562 hash: row.get(0)?,
563 path: row.get(1)?,
564 additions: row.get(2)?,
565 deletions: row.get(3)?,
566 change_kind: row.get(4)?,
567 author_name: row.get(5)?,
568 authored_at_s: row.get(6)?,
569 subject: row.get(7)?,
570 evidence_kind: "historical",
571 })
572}
573
574fn blame_row(row: &rusqlite::Row<'_>) -> rusqlite::Result<ChunkBlameSummary> {
575 let counts_json: String = row.get(12)?;
576 let commit_counts = serde_json::from_str(&counts_json).unwrap_or_default();
577 Ok(ChunkBlameSummary {
578 chunk_id: row.get(0)?,
579 path: row.get(1)?,
580 start_line: row.get(2)?,
581 end_line: row.get(3)?,
582 source_text_hash: row.get(4)?,
583 line_count: row.get(5)?,
584 dominant_commit: row.get(6)?,
585 dominant_commit_lines: row.get(7)?,
586 newest_commit: row.get(8)?,
587 newest_commit_time_s: row.get(9)?,
588 oldest_commit: row.get(10)?,
589 oldest_commit_time_s: row.get(11)?,
590 commit_counts,
591 evidence_kind: "historical",
592 })
593}
594
595fn collect_rows<T>(
596 rows: rusqlite::MappedRows<'_, impl FnMut(&rusqlite::Row<'_>) -> rusqlite::Result<T>>,
597) -> anyhow::Result<Vec<T>> {
598 let mut out = Vec::new();
599 for row in rows {
600 out.push(row?);
601 }
602 Ok(out)
603}
604
605fn count_table(conn: &Connection, table: &str) -> anyhow::Result<u64> {
606 let count =
607 conn.query_row(&format!("SELECT COUNT(*) FROM {table}"), [], |row| row.get::<_, i64>(0))?;
608 Ok(u64::try_from(count).unwrap_or(0))
609}
610
611fn git_repo(root: &Path) -> Option<GitRepo> {
612 let worktree_root = git_output(root, &["rev-parse", "--show-toplevel"])?;
613 let head = git_output(root, &["rev-parse", "HEAD"])?;
614 Some(GitRepo { worktree_root: PathBuf::from(worktree_root), head })
615}
616
/// Runs `git <args>` with `root` as the working directory and returns its stdout.
///
/// Returns `None` when the process cannot be spawned or exits unsuccessfully. Invalid
/// UTF-8 is replaced lossily.
///
/// Only trailing line terminators are removed. Other whitespace is significant to the
/// callers: in `git blame --line-porcelain` output a blank final source line is printed
/// as a lone TAB, and stripping it would drop that line from the blame result.
fn git_output(root: &Path, args: &[&str]) -> Option<String> {
    let output = Command::new("git").args(args).current_dir(root).output().ok()?;
    if !output.status.success() {
        return None;
    }
    let text = String::from_utf8_lossy(&output.stdout);
    Some(text.trim_end_matches(|c: char| c == '\n' || c == '\r').to_string())
}
624
/// Builds a full-text `MATCH` expression from free-form user text.
///
/// The input is split on every character that is not alphanumeric, `_` or `-`; each
/// remaining term is double-quoted and the terms are joined with ` OR `. Input without any
/// term produces the empty phrase `""`.
fn fts_query(query: &str) -> String {
    let mut expr = String::new();
    let is_separator = |c: char| !(c.is_alphanumeric() || c == '_' || c == '-');
    for term in query.split(is_separator) {
        if term.is_empty() {
            continue;
        }
        if !expr.is_empty() {
            expr.push_str(" OR ");
        }
        expr.push('"');
        expr.push_str(&term.replace('"', "\"\""));
        expr.push('"');
    }
    if expr.is_empty() {
        expr.push_str("\"\"");
    }
    expr
}
633
634fn meta(conn: &Connection, key: &str) -> anyhow::Result<Option<String>> {
635 Ok(conn
636 .query_row("SELECT value FROM index_meta WHERE key = ?1", [key], |row| row.get(0))
637 .optional()?)
638}
639
640fn set_meta(conn: &Connection, key: &str, value: &str) -> anyhow::Result<()> {
641 conn.execute(
642 "INSERT INTO index_meta(key, value) VALUES (?1, ?2)
643 ON CONFLICT(key) DO UPDATE SET value = excluded.value",
644 params![key, value],
645 )?;
646 Ok(())
647}
648
649fn hex_sha256(bytes: &[u8]) -> String {
650 let hash = Sha256::digest(bytes);
651 let mut out = String::with_capacity(hash.len() * 2);
652 for byte in hash {
653 use std::fmt::Write as _;
654 let _ = write!(out, "{byte:02x}");
655 }
656 out
657}
658
/// Renders `path` as a string with every backslash replaced by `/`.
///
/// Non-UTF-8 parts are converted lossily.
fn path_string(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|c| if c == '\\' { '/' } else { c })
        .collect()
}