Skip to main content

codemem_engine/enrichment/
temporal.rs

1//! Temporal graph layer: commit and PR nodes with symbol-level ModifiedBy edges.
2//!
3//! Replays git history (default 90 days) to build a layered graph where each
4//! commit is a node connected to the symbols/files it modified. PRs are detected
5//! from merge/squash commit patterns and connected to their commits via PartOf edges.
6
7use crate::review::parse_diff;
8use crate::CodememEngine;
9use codemem_core::{CodememError, Edge, GraphNode, NodeKind, RelationshipType};
10use serde_json::json;
11use std::collections::{HashMap, HashSet};
12
/// Git's well-known empty-tree object SHA — used as the diff parent for root
/// commits, which have no real parent to diff against.
const EMPTY_TREE_SHA: &str = "4b825dc642cb6eb9a060e54bf899d69f82d7a419";
15
/// Result of temporal graph ingestion.
#[derive(Debug, Default)]
pub struct TemporalIngestResult {
    /// Non-bot commits turned into individual Commit nodes.
    pub commits_processed: usize,
    /// Bot/lockfile commits collapsed into a group representative
    /// (each group counts len - 1 skipped).
    pub commits_skipped: usize,
    /// PullRequest nodes created from detected merge/squash subjects.
    pub pr_nodes_created: usize,
    /// File-level ModifiedBy edges created. NOTE(review): symbol-level
    /// edges are emitted but not added to this counter — confirm intended.
    pub modified_by_edges: usize,
    /// Commit → PR PartOf edges created.
    pub part_of_edges: usize,
    /// Nodes (files and symbols) expired because their file was deleted.
    pub symbols_expired: usize,
}
26
/// Parsed commit from git log.
pub(crate) struct ParsedCommit {
    /// Full commit SHA (`%H`).
    hash: String,
    /// First 7 characters of the SHA (fewer if the hash itself is shorter).
    short_hash: String,
    /// Parent SHAs (`%P`): empty for root commits, 2+ for merge commits.
    parents: Vec<String>,
    /// Author name (`%an`).
    author: String,
    /// Author date (`%aI`), normalized to UTC.
    date: chrono::DateTime<chrono::Utc>,
    /// Subject line (`%s`).
    subject: String,
    /// Paths touched by the commit (from `--name-only`).
    files: Vec<String>,
}
37
/// Detected PR from commit patterns.
///
/// Produced by `detect_prs`, which infers PRs purely from commit subjects
/// and parent counts — no forge API is consulted.
struct DetectedPR {
    /// PR number (from commit subject).
    number: String,
    /// Commit hashes belonging to this PR.
    commits: Vec<String>,
    /// Whether this was a squash merge (single commit).
    squash: bool,
    /// Timestamp from the merge/squash commit.
    merged_at: chrono::DateTime<chrono::Utc>,
    /// Subject of the merge commit (used as PR title).
    title: String,
    /// Author of the merge commit.
    author: String,
}
53
/// Check if a commit looks like a bot/CI commit that should be compacted.
///
/// Returns true when the author matches a known bot naming pattern, or when
/// every touched file is a lockfile / generated artifact. An empty file list
/// alone never marks a commit as bot.
fn is_bot_commit(author: &str, files: &[String]) -> bool {
    // Author-based detection: GitHub "[bot]" suffix, common bot name shapes,
    // and two well-known bot account names.
    if author.contains("[bot]")
        || author.ends_with("-bot")
        || author.ends_with("bot)")
        || author == "renovate"
        || author == "github-actions"
    {
        return true;
    }

    // File-based detection: all files are lock/generated files.
    // `.lock` subsumes Cargo.lock, bun.lock, yarn.lock, Gemfile.lock and
    // poetry.lock; `lock.json` subsumes package-lock.json; `lock.yaml`
    // subsumes pnpm-lock.yaml — so the generic suffix checks suffice.
    !files.is_empty()
        && files.iter().all(|f| {
            f.ends_with(".lock")
                || f.ends_with("lock.json")
                || f.ends_with("lock.yaml")
                || f == "CHANGELOG.md"
                || f == "go.sum"
        })
}
88
/// Extract PR number from a commit subject.
/// Matches: `feat: add foo (#123)`, `Merge pull request #123 from ...`
///
/// Returns `None` when no all-digit, non-empty PR number is found.
fn extract_pr_number(subject: &str) -> Option<String> {
    // Squash merge: "subject (#123)" — use the LAST "(#" so a trailing PR
    // reference wins over earlier parenthesized text.
    if let Some(start) = subject.rfind("(#") {
        if let Some(end) = subject[start..].find(')') {
            let num = &subject[start + 2..start + end];
            // Require at least one digit: `all()` is vacuously true on an
            // empty slice, so "(#)" must not yield an empty PR number.
            if !num.is_empty() && num.chars().all(|c| c.is_ascii_digit()) {
                return Some(num.to_string());
            }
        }
    }
    // GitHub merge commit: "Merge pull request #123 from ..."
    if let Some(rest) = subject.strip_prefix("Merge pull request #") {
        let num: String = rest.chars().take_while(|c| c.is_ascii_digit()).collect();
        if !num.is_empty() {
            return Some(num);
        }
    }
    None
}
110
111impl CodememEngine {
    /// Ingest git history into the temporal graph layer.
    ///
    /// Creates Commit nodes, PullRequest nodes, and ModifiedBy edges.
    /// Detects squash/merge PRs and compacts bot commits.
    ///
    /// * `path` — repository root handed to `git -C`.
    /// * `days` — history window to replay.
    /// * `namespace` — graph namespace; `None` maps to the empty namespace.
    ///
    /// Incremental: a per-namespace `_HEAD` sentinel records the newest
    /// ingested commit, so repeated runs only process commits above it.
    pub fn ingest_git_temporal(
        &self,
        path: &str,
        days: u64,
        namespace: Option<&str>,
    ) -> Result<TemporalIngestResult, CodememError> {
        let mut result = TemporalIngestResult::default();
        let ns = namespace.unwrap_or("");

        // ── Step 1: Parse git log with parent hashes and subject ────────
        let commits = self.parse_git_log(path, days)?;
        if commits.is_empty() {
            return Ok(result);
        }

        // ── Step 2: Check for incremental ingestion ────────────────────
        let last_ingested = self.get_last_ingested_commit(ns);
        let commits: Vec<ParsedCommit> = if let Some(ref last_hash) = last_ingested {
            // Skip commits we've already processed. git log is newest-first,
            // so everything *before* the sentinel hash is new history.
            let skip_idx = commits.iter().position(|c| c.hash == *last_hash);
            match skip_idx {
                Some(idx) => commits.into_iter().take(idx).collect(),
                None => commits, // Last commit not found (rebase/gc?), process all
            }
        } else {
            commits
        };

        if commits.is_empty() {
            return Ok(result);
        }

        // ── Step 3: Compact bot/repetitive commits ──────────────────────
        // Each bot group keeps one representative; the rest count as skipped.
        let (real_commits, bot_groups) = compact_bot_commits(commits);
        result.commits_skipped = bot_groups.values().map(|g| g.len().saturating_sub(1)).sum();

        // ── Step 4: Create commit nodes and ModifiedBy edges ────────────
        let now = chrono::Utc::now();
        let mut commit_nodes = Vec::new();
        let mut edges = Vec::new();

        for commit in &real_commits {
            let commit_id = format!("commit:{}", commit.hash);

            let node = GraphNode {
                id: commit_id.clone(),
                kind: NodeKind::Commit,
                label: format!("{} {}", commit.short_hash, commit.subject),
                payload: {
                    let mut p = HashMap::new();
                    p.insert("hash".into(), json!(commit.hash));
                    p.insert("short_hash".into(), json!(commit.short_hash));
                    p.insert("author".into(), json!(commit.author));
                    p.insert("date".into(), json!(commit.date.to_rfc3339()));
                    p.insert("subject".into(), json!(commit.subject));
                    p.insert("parents".into(), json!(commit.parents));
                    p.insert("files_changed".into(), json!(commit.files.len()));
                    p
                },
                centrality: 0.0,
                memory_id: None,
                namespace: Some(ns.to_string()),
                // valid_from anchors the commit node on the bitemporal axis.
                valid_from: Some(commit.date),
                valid_to: None,
            };
            commit_nodes.push(node);

            // File-level ModifiedBy edges: file → commit that changed it.
            for file in &commit.files {
                let file_id = format!("file:{file}");
                edges.push(Edge {
                    id: format!("modby:{file_id}:{}", commit.hash),
                    src: file_id,
                    dst: commit_id.clone(),
                    relationship: RelationshipType::ModifiedBy,
                    weight: 0.4,
                    properties: {
                        let mut p = HashMap::new();
                        p.insert("commit_date".into(), json!(commit.date.to_rfc3339()));
                        p
                    },
                    created_at: now,
                    valid_from: Some(commit.date),
                    valid_to: None,
                });
                result.modified_by_edges += 1;
            }

            result.commits_processed += 1;
        }

        // ── Step 5: Symbol-level ModifiedBy edges (via diff) ────────────
        // Only for recent commits (last 30 days) to limit cost.
        // NOTE(review): these edges are appended to `edges` but not counted
        // in result.modified_by_edges — confirm whether that is intended.
        let symbol_cutoff = now - chrono::Duration::days(30);
        for commit in &real_commits {
            if commit.date < symbol_cutoff {
                continue;
            }
            let symbol_edges = self.commit_symbol_edges(path, commit, ns);
            edges.extend(symbol_edges);
        }

        // ── Step 6: Create compacted bot commit nodes ───────────────────
        // One node per group, labelled with the compaction count.
        for (key, group) in &bot_groups {
            if group.is_empty() {
                continue;
            }
            let representative = &group[0];
            let commit_id = format!("commit:{}", representative.hash);
            let node = GraphNode {
                id: commit_id,
                kind: NodeKind::Commit,
                label: format!("{} [{}x] {}", representative.short_hash, group.len(), key),
                payload: {
                    let mut p = HashMap::new();
                    p.insert("hash".into(), json!(representative.hash));
                    p.insert("author".into(), json!(representative.author));
                    p.insert("date".into(), json!(representative.date.to_rfc3339()));
                    p.insert("compacted_count".into(), json!(group.len()));
                    p.insert("bot".into(), json!(true));
                    p
                },
                centrality: 0.0,
                memory_id: None,
                namespace: Some(ns.to_string()),
                valid_from: Some(representative.date),
                valid_to: None,
            };
            commit_nodes.push(node);
        }

        // ── Step 7: Detect PRs and create PR nodes + PartOf edges ───────
        let prs = detect_prs(&real_commits);
        for pr in &prs {
            let pr_id = format!("pr:{}", pr.number);
            let node = GraphNode {
                id: pr_id.clone(),
                kind: NodeKind::PullRequest,
                label: format!("#{} {}", pr.number, pr.title),
                payload: {
                    let mut p = HashMap::new();
                    p.insert("number".into(), json!(pr.number));
                    p.insert("title".into(), json!(pr.title));
                    p.insert("author".into(), json!(pr.author));
                    p.insert("squash".into(), json!(pr.squash));
                    p.insert("commit_count".into(), json!(pr.commits.len()));
                    p
                },
                centrality: 0.0,
                memory_id: None,
                namespace: Some(ns.to_string()),
                valid_from: Some(pr.merged_at),
                valid_to: None,
            };
            commit_nodes.push(node);
            result.pr_nodes_created += 1;

            // Link each of the PR's commits to the PR node.
            for commit_hash in &pr.commits {
                let commit_id = format!("commit:{commit_hash}");
                edges.push(Edge {
                    id: format!("partof:{commit_id}:{pr_id}"),
                    src: commit_id,
                    dst: pr_id.clone(),
                    relationship: RelationshipType::PartOf,
                    weight: 0.4,
                    properties: HashMap::new(),
                    created_at: now,
                    valid_from: Some(pr.merged_at),
                    valid_to: None,
                });
                result.part_of_edges += 1;
            }
        }

        // ── Step 8: Detect deleted symbols ──────────────────────────────
        result.symbols_expired = self.expire_deleted_symbols(path, &real_commits, ns)?;

        // ── Step 9: Persist to storage and in-memory graph ──────────────
        self.storage.insert_graph_nodes_batch(&commit_nodes)?;
        self.storage.insert_graph_edges_batch(&edges)?;

        // Single lock scope for both nodes and edges to ensure atomic
        // visibility to concurrent readers.
        {
            let mut graph = self.lock_graph()?;
            for node in commit_nodes {
                let _ = graph.add_node(node);
            }
            self.add_edges_with_placeholders(&mut **graph, &edges)?;
        }

        // Record last ingested commit for incremental runs.
        // real_commits is newest-first, so first() is the latest commit.
        if let Some(latest) = real_commits.first() {
            self.record_last_ingested_commit(ns, &latest.hash);
        }

        Ok(result)
    }
314
    /// Ensure all edge endpoints exist in the in-memory graph, creating placeholder
    /// nodes as needed, then add the edges. Logs warnings for any remaining failures.
    ///
    /// Placeholder nodes are also persisted to storage so they survive restarts.
    /// Callers must hold the graph lock; this avoids a double-lock window where
    /// concurrent readers could see nodes without their edges.
    pub(crate) fn add_edges_with_placeholders(
        &self,
        graph: &mut dyn codemem_core::GraphBackend,
        edges: &[Edge],
    ) -> Result<(), CodememError> {
        // warn_count caps per-edge log spam at 5; total_failures tracks all.
        let mut warn_count = 0u32;
        let mut total_failures = 0u32;

        for edge in edges {
            // Ensure BOTH endpoints (src and dst) exist, creating a typed
            // placeholder for any that are missing.
            for endpoint_id in [&edge.src, &edge.dst] {
                if graph.get_node(endpoint_id)?.is_none() {
                    // Infer the node kind from the id prefix convention used
                    // throughout this module (file:/sym:/commit:/pr:).
                    let kind = if endpoint_id.starts_with("file:") {
                        NodeKind::File
                    } else if endpoint_id.starts_with("sym:") {
                        NodeKind::Function
                    } else if endpoint_id.starts_with("commit:") {
                        NodeKind::Commit
                    } else if endpoint_id.starts_with("pr:") {
                        NodeKind::PullRequest
                    } else {
                        NodeKind::External
                    };

                    // Label = everything after the first ':' (or the whole id).
                    let label = endpoint_id
                        .find(':')
                        .map(|i| &endpoint_id[i + 1..])
                        .unwrap_or(endpoint_id)
                        .to_string();

                    let placeholder = GraphNode {
                        id: endpoint_id.clone(),
                        kind,
                        label,
                        payload: HashMap::new(),
                        centrality: 0.0,
                        memory_id: None,
                        namespace: None,
                        valid_from: None,
                        valid_to: None,
                    };
                    // Persist to storage so placeholder survives restarts;
                    // failures here are best-effort and deliberately ignored.
                    let _ = self.storage.insert_graph_node(&placeholder);
                    let _ = graph.add_node(placeholder);
                }
            }

            if let Err(e) = graph.add_edge(edge.clone()) {
                total_failures += 1;
                if warn_count < 5 {
                    tracing::warn!(
                        "Failed to add edge {} ({} -> {}): {e}",
                        edge.id,
                        edge.src,
                        edge.dst
                    );
                    warn_count += 1;
                }
            }
        }

        // Summarize any failures beyond the first 5 individually-logged ones.
        if total_failures > 0 && total_failures > warn_count {
            tracing::warn!(
                "... and {} more edge insertion failures (total: {})",
                total_failures - warn_count,
                total_failures
            );
        }

        Ok(())
    }
392
    /// Parse git log output into structured commits (newest first).
    ///
    /// Uses `--format=COMMIT:%H|%P|%an|%aI|%s` followed by `--name-only`
    /// file lists; each "COMMIT:" block is one commit. Commits with
    /// unparseable dates are skipped with a warning rather than failing
    /// the whole ingestion.
    ///
    /// NOTE(review): blocks are split on the literal "COMMIT:"; a commit
    /// subject containing that marker would split mid-record — verify this
    /// is acceptable or switch to a control-character separator.
    fn parse_git_log(&self, path: &str, days: u64) -> Result<Vec<ParsedCommit>, CodememError> {
        let output = std::process::Command::new("git")
            .args([
                "-C",
                path,
                "log",
                "--format=COMMIT:%H|%P|%an|%aI|%s",
                "--name-only",
                "--diff-filter=AMDRT",
                &format!("--since={days} days ago"),
            ])
            .output()
            .map_err(|e| CodememError::Internal(format!("Failed to run git: {e}")))?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            return Err(CodememError::Internal(format!("git log failed: {stderr}")));
        }

        let stdout = String::from_utf8_lossy(&output.stdout);
        let mut commits = Vec::new();

        for block in stdout.split("COMMIT:").skip(1) {
            let mut lines = block.lines();
            if let Some(header) = lines.next() {
                // splitn(5) keeps any '|' characters inside the subject intact.
                let parts: Vec<&str> = header.splitn(5, '|').collect();
                if parts.len() >= 5 {
                    let hash = parts[0].to_string();
                    let short_hash = hash[..hash.len().min(7)].to_string();
                    // %P is space-separated; empty for root commits.
                    let parents: Vec<String> =
                        parts[1].split_whitespace().map(|s| s.to_string()).collect();
                    let author = parts[2].to_string();
                    let date = match chrono::DateTime::parse_from_rfc3339(parts[3]) {
                        Ok(dt) => dt.with_timezone(&chrono::Utc),
                        Err(e) => {
                            tracing::warn!(
                                "Skipping commit {}: unparseable date {:?}: {e}",
                                &parts[0][..parts[0].len().min(7)],
                                parts[3]
                            );
                            continue;
                        }
                    };
                    let subject = parts[4].to_string();
                    // Remaining non-empty lines are the --name-only file list.
                    let files: Vec<String> = lines
                        .filter(|l| !l.trim().is_empty())
                        .map(|l| l.trim().to_string())
                        .collect();

                    commits.push(ParsedCommit {
                        hash,
                        short_hash,
                        parents,
                        author,
                        date,
                        subject,
                        files,
                    });
                }
            }
        }

        Ok(commits)
    }
458
    /// Get symbol-level ModifiedBy edges for a single commit by running git diff.
    ///
    /// Diffs the commit against its first parent (or git's empty tree for a
    /// root commit), maps each hunk's changed line numbers onto symbol nodes
    /// known to the in-memory graph, and emits one edge per touched symbol.
    /// Best-effort: any git or lock failure returns the edges collected so
    /// far (possibly empty) instead of erroring.
    fn commit_symbol_edges(&self, path: &str, commit: &ParsedCommit, namespace: &str) -> Vec<Edge> {
        let mut edges = Vec::new();
        // Root commits have no parent; diff against the empty tree instead.
        let parent = commit
            .parents
            .first()
            .map(|s| s.as_str())
            .unwrap_or(EMPTY_TREE_SHA);

        // --unified=0: no context lines, so hunk line ranges map precisely
        // onto symbol line spans.
        let diff_output = std::process::Command::new("git")
            .args(["-C", path, "diff", parent, &commit.hash, "--unified=0"])
            .output();

        let diff_text = match diff_output {
            Ok(o) if o.status.success() => String::from_utf8_lossy(&o.stdout).to_string(),
            _ => return edges, // best-effort: skip symbol edges on git failure
        };

        let hunks = parse_diff(&diff_text);
        if hunks.is_empty() {
            return edges;
        }

        // Build file→symbols map from graph
        let graph = match self.lock_graph() {
            Ok(g) => g,
            Err(e) => {
                tracing::warn!("Failed to lock graph for symbol-level diff: {e}");
                return edges;
            }
        };
        let all_nodes = graph.get_all_nodes();

        // file_path -> [(symbol id, line_start, line_end)] for symbol-like
        // nodes; an empty requested namespace matches every node.
        let mut file_symbols: HashMap<&str, Vec<(&str, u32, u32)>> = HashMap::new();
        for node in &all_nodes {
            if matches!(
                node.kind,
                NodeKind::Function
                    | NodeKind::Method
                    | NodeKind::Class
                    | NodeKind::Trait
                    | NodeKind::Interface
                    | NodeKind::Enum
            ) {
                // Only symbols that carry file/line metadata can be matched
                // against diff hunks.
                if let (Some(fp), Some(ls), Some(le)) = (
                    node.payload.get("file_path").and_then(|v| v.as_str()),
                    node.payload
                        .get("line_start")
                        .and_then(|v| v.as_u64())
                        .map(|v| v as u32),
                    node.payload
                        .get("line_end")
                        .and_then(|v| v.as_u64())
                        .map(|v| v as u32),
                ) {
                    if node.namespace.as_deref() == Some(namespace) || namespace.is_empty() {
                        file_symbols.entry(fp).or_default().push((&node.id, ls, le));
                    }
                }
            }
        }
        // Release the graph lock before edge construction; file_symbols only
        // borrows from all_nodes, which we own.
        drop(graph);

        let commit_id = format!("commit:{}", commit.hash);
        let now = chrono::Utc::now();
        // Each symbol gets at most one edge per commit, even if several hunks hit it.
        let mut seen = HashSet::new();

        for hunk in &hunks {
            if let Some(symbols) = file_symbols.get(hunk.file_path.as_str()) {
                let changed_lines: HashSet<u32> = hunk
                    .added_lines
                    .iter()
                    .chain(hunk.removed_lines.iter())
                    .copied()
                    .collect();

                // A symbol is "modified" if any changed line falls inside its span.
                for &(sym_id, line_start, line_end) in symbols {
                    if changed_lines
                        .iter()
                        .any(|&l| l >= line_start && l <= line_end)
                        && seen.insert(sym_id)
                    {
                        edges.push(Edge {
                            id: format!("modby:{}:{}", sym_id, commit.hash),
                            src: sym_id.to_string(),
                            dst: commit_id.clone(),
                            relationship: RelationshipType::ModifiedBy,
                            weight: 0.4,
                            properties: {
                                let mut p = HashMap::new();
                                p.insert("commit_date".into(), json!(commit.date.to_rfc3339()));
                                p.insert("symbol_level".into(), json!(true));
                                p
                            },
                            created_at: now,
                            valid_from: Some(commit.date),
                            valid_to: None,
                        });
                    }
                }
            }
        }

        edges
    }
564
    /// Set valid_to on symbols/files that were deleted in the given commits.
    ///
    /// Uses `git log --diff-filter=D` to find deleted files, then collects
    /// expired nodes before writing — avoids holding the graph lock during
    /// storage writes (deadlock risk).
    ///
    /// Returns the number of nodes expired. Git failures degrade to `Ok(0)`
    /// (best-effort) rather than aborting ingestion.
    pub(crate) fn expire_deleted_symbols(
        &self,
        path: &str,
        commits: &[ParsedCommit],
        namespace: &str,
    ) -> Result<usize, CodememError> {
        // Find deleted files from the already-parsed commits' time range.
        // commits is newest-first, so last() is the oldest commit.
        let since = commits
            .last()
            .map(|c| c.date.to_rfc3339())
            .unwrap_or_else(|| "90 days ago".to_string());

        let output = std::process::Command::new("git")
            .args([
                "-C",
                path,
                "log",
                "--format=COMMIT:%H|%aI",
                "--diff-filter=D",
                "--name-only",
                &format!("--since={since}"),
            ])
            .output()
            .map_err(|e| CodememError::Internal(format!("Failed to run git: {e}")))?;

        if !output.status.success() {
            // Best-effort: a failed deletion scan just expires nothing.
            return Ok(0);
        }

        let stdout = String::from_utf8_lossy(&output.stdout);

        // Parse deletion events: (date, set of deleted file paths)
        let mut deletions: Vec<(chrono::DateTime<chrono::Utc>, HashSet<String>)> = Vec::new();
        for block in stdout.split("COMMIT:").skip(1) {
            let mut lines = block.lines();
            // Header is "<hash>|<rfc3339 date>"; an unparseable date falls
            // back to "now" rather than dropping the deletion event.
            let date = lines
                .next()
                .and_then(|h| {
                    let parts: Vec<&str> = h.splitn(2, '|').collect();
                    parts.get(1).and_then(|d| {
                        chrono::DateTime::parse_from_rfc3339(d)
                            .ok()
                            .map(|dt| dt.with_timezone(&chrono::Utc))
                    })
                })
                .unwrap_or_else(chrono::Utc::now);

            let files: HashSet<String> = lines
                .filter(|l| !l.trim().is_empty())
                .map(|l| l.trim().to_string())
                .collect();

            if !files.is_empty() {
                deletions.push((date, files));
            }
        }

        if deletions.is_empty() {
            return Ok(0);
        }

        // Filter out files that currently exist in the working tree
        // (they were deleted then re-created, so should not be expired)
        for (_date, deleted_files) in &mut deletions {
            deleted_files.retain(|f| {
                let full_path = std::path::Path::new(path).join(f);
                !full_path.exists()
            });
        }
        deletions.retain(|(_, files)| !files.is_empty());

        if deletions.is_empty() {
            return Ok(0);
        }

        // Phase 1: collect expired nodes under graph lock (read-only)
        let expired_nodes: Vec<GraphNode> = {
            let graph = self.lock_graph()?;
            let all_nodes = graph.get_all_nodes();
            let mut to_expire = Vec::new();

            for (date, deleted_files) in &deletions {
                for node in &all_nodes {
                    // Already expired — leave its original valid_to intact.
                    if node.valid_to.is_some() {
                        continue;
                    }
                    // Empty namespace means "all namespaces".
                    if !namespace.is_empty() && node.namespace.as_deref() != Some(namespace) {
                        continue;
                    }

                    // File nodes match by id; symbol nodes by their
                    // file_path payload field.
                    let should_expire = match node.kind {
                        NodeKind::File => {
                            let fp = node.id.strip_prefix("file:").unwrap_or(&node.id);
                            deleted_files.contains(fp)
                        }
                        _ => node
                            .payload
                            .get("file_path")
                            .and_then(|v| v.as_str())
                            .is_some_and(|fp| deleted_files.contains(fp)),
                    };

                    if should_expire {
                        let mut expired_node = node.clone();
                        expired_node.valid_to = Some(*date);
                        to_expire.push(expired_node);
                    }
                }
            }
            to_expire
        };
        // Graph lock dropped here

        // Phase 2: write to storage and in-memory graph separately
        let count = expired_nodes.len();
        if !expired_nodes.is_empty() {
            self.storage.insert_graph_nodes_batch(&expired_nodes)?;
            let mut graph = self.lock_graph()?;
            // add_node upserts, replacing each node with its expired copy.
            for node in expired_nodes {
                let _ = graph.add_node(node);
            }
        }

        Ok(count)
    }
695
696    /// Get the last ingested commit hash for incremental processing.
697    fn get_last_ingested_commit(&self, namespace: &str) -> Option<String> {
698        let sentinel_id = format!("commit:_HEAD:{namespace}");
699        if let Ok(Some(node)) = self.storage.get_graph_node(&sentinel_id) {
700            node.payload
701                .get("hash")
702                .and_then(|v| v.as_str())
703                .map(|s| s.to_string())
704        } else {
705            None
706        }
707    }
708
709    /// Record the last ingested commit hash for incremental processing.
710    fn record_last_ingested_commit(&self, namespace: &str, hash: &str) {
711        let sentinel_id = format!("commit:_HEAD:{namespace}");
712        let node = GraphNode {
713            id: sentinel_id,
714            kind: NodeKind::Commit,
715            label: format!("_HEAD:{namespace}"),
716            payload: {
717                let mut p = HashMap::new();
718                p.insert("hash".into(), json!(hash));
719                p.insert("sentinel".into(), json!(true));
720                p
721            },
722            centrality: 0.0,
723            memory_id: None,
724            namespace: Some(namespace.to_string()),
725            valid_from: None,
726            valid_to: None,
727        };
728        let _ = self.storage.insert_graph_node(&node);
729    }
730}
731
732/// Separate real commits from bot/repetitive commits.
733/// Bot commits are grouped by (author, file pattern) key.
734fn compact_bot_commits(
735    commits: Vec<ParsedCommit>,
736) -> (Vec<ParsedCommit>, HashMap<String, Vec<ParsedCommit>>) {
737    let mut real = Vec::new();
738    let mut bot_groups: HashMap<String, Vec<ParsedCommit>> = HashMap::new();
739
740    for commit in commits {
741        if is_bot_commit(&commit.author, &commit.files) {
742            let key = format!(
743                "{}:{}",
744                commit.author,
745                commit
746                    .files
747                    .first()
748                    .map(|f| f.as_str())
749                    .unwrap_or("unknown")
750            );
751            bot_groups.entry(key).or_default().push(commit);
752        } else {
753            real.push(commit);
754        }
755    }
756
757    (real, bot_groups)
758}
759
760/// Detect PRs from commit patterns.
761fn detect_prs(commits: &[ParsedCommit]) -> Vec<DetectedPR> {
762    let mut prs = Vec::new();
763    let mut seen_prs: HashSet<String> = HashSet::new();
764
765    for commit in commits {
766        if let Some(pr_number) = extract_pr_number(&commit.subject) {
767            if seen_prs.contains(&pr_number) {
768                continue;
769            }
770            seen_prs.insert(pr_number.clone());
771
772            let is_merge = commit.parents.len() > 1;
773            let is_squash = commit.parents.len() == 1;
774
775            // For squash merges: single commit = single PR
776            // For merge commits: collect commits between this merge and the previous one
777            let commit_hashes = if is_squash {
778                vec![commit.hash.clone()]
779            } else if is_merge && commit.parents.len() == 2 {
780                // The second parent is the branch head; commits between
781                // first parent and this merge are the PR's commits.
782                // For simplicity, just reference the merge commit itself.
783                vec![commit.hash.clone()]
784            } else {
785                vec![commit.hash.clone()]
786            };
787
788            prs.push(DetectedPR {
789                number: pr_number,
790                commits: commit_hashes,
791                squash: is_squash,
792                merged_at: commit.date,
793                title: commit.subject.clone(),
794                author: commit.author.clone(),
795            });
796        }
797    }
798
799    prs
800}
801
#[cfg(test)]
mod tests {
    use super::*;

    // Squash-merge suffixes "(#N)" are extracted from the subject tail.
    #[test]
    fn extract_pr_number_squash() {
        assert_eq!(
            extract_pr_number("feat: add foo (#123)"),
            Some("123".to_string())
        );
        assert_eq!(
            extract_pr_number("fix: something (#42)"),
            Some("42".to_string())
        );
    }

    // GitHub merge-commit subjects ("Merge pull request #N from ...") work too.
    #[test]
    fn extract_pr_number_merge() {
        assert_eq!(
            extract_pr_number("Merge pull request #456 from org/branch"),
            Some("456".to_string())
        );
    }

    // Plain subjects and non-numeric '#' references yield no PR number.
    #[test]
    fn extract_pr_number_none() {
        assert_eq!(extract_pr_number("chore: update deps"), None);
        assert_eq!(extract_pr_number("fix bug in #parser"), None);
    }

    // Bot detection triggers on author patterns OR all-lockfile changes.
    #[test]
    fn bot_detection() {
        assert!(is_bot_commit("dependabot[bot]", &[]));
        assert!(is_bot_commit("renovate", &[]));
        assert!(is_bot_commit("some-user", &["Cargo.lock".to_string()]));
        assert!(is_bot_commit(
            "some-user",
            &["package-lock.json".to_string()]
        ));
        assert!(!is_bot_commit("some-user", &["src/main.rs".to_string()]));
    }

    // Human commits pass through; bot commits are grouped separately.
    #[test]
    fn compact_separates_bots() {
        let commits = vec![
            ParsedCommit {
                hash: "aaa".into(),
                short_hash: "aaa".into(),
                parents: vec![],
                author: "dev".into(),
                date: chrono::Utc::now(),
                subject: "feat: real work".into(),
                files: vec!["src/main.rs".into()],
            },
            ParsedCommit {
                hash: "bbb".into(),
                short_hash: "bbb".into(),
                parents: vec![],
                author: "dependabot[bot]".into(),
                date: chrono::Utc::now(),
                subject: "chore: bump deps".into(),
                files: vec!["Cargo.lock".into()],
            },
        ];
        let (real, bots) = compact_bot_commits(commits);
        assert_eq!(real.len(), 1);
        assert_eq!(real[0].hash, "aaa");
        assert_eq!(bots.len(), 1);
    }

    // A single-parent commit with a "(#N)" subject is detected as a squash PR;
    // commits without PR references are ignored.
    #[test]
    fn detect_prs_from_squash() {
        let commits = vec![
            ParsedCommit {
                hash: "abc123".into(),
                short_hash: "abc123".into(),
                parents: vec!["def456".into()],
                author: "dev".into(),
                date: chrono::Utc::now(),
                subject: "feat: add feature (#10)".into(),
                files: vec!["src/lib.rs".into()],
            },
            ParsedCommit {
                hash: "xyz789".into(),
                short_hash: "xyz789".into(),
                parents: vec!["abc123".into()],
                author: "dev".into(),
                date: chrono::Utc::now(),
                subject: "fix: plain commit".into(),
                files: vec!["src/main.rs".into()],
            },
        ];
        let prs = detect_prs(&commits);
        assert_eq!(prs.len(), 1);
        assert_eq!(prs[0].number, "10");
        assert!(prs[0].squash);
        assert_eq!(prs[0].commits, vec!["abc123"]);
    }
}