Skip to main content

fallow_core/
churn.rs

1//! Git churn analysis for hotspot detection.
2//!
3//! Shells out to `git log` to collect per-file change history, then computes
4//! recency-weighted churn scores and trend indicators.
5
6use rustc_hash::FxHashMap;
7use std::path::{Path, PathBuf};
8use std::process::{Command, Output};
9use std::sync::OnceLock;
10
11use serde::Deserialize;
12
13pub use fallow_types::churn::ChurnTrend;
14
/// Signature of the function pointer accepted by [`set_spawn_hook`].
///
/// The hook is handed the prepared `git log --numstat` command and returns
/// the captured [`Output`] (or the I/O error from running it). The CLI uses
/// this to route long-running git log calls through its `ScopedChild`
/// registry so SIGINT / SIGTERM reap the subprocess instead of leaving it
/// running after the parent exits. See `crates/cli/src/signal/` and
/// issue #477.
pub type ChurnSpawnHook = fn(&mut Command) -> std::io::Result<Output>;

/// Process-wide slot holding the installed hook; empty until
/// [`set_spawn_hook`] is called, and written at most once.
static SPAWN_HOOK: OnceLock<ChurnSpawnHook> = OnceLock::new();
23
24/// Install a spawn-hook that wraps the `git log` subprocess. Idempotent;
25/// subsequent calls are no-ops. Called once from the CLI's `main()` to
26/// route through the signal registry; defaults to `Command::output`
27/// when not set so the function-pointer indirection stays free for tests
28/// and embedders that don't care.
29pub fn set_spawn_hook(hook: ChurnSpawnHook) {
30    let _ = SPAWN_HOOK.set(hook);
31}
32
33fn spawn_output(command: &mut Command) -> std::io::Result<Output> {
34    if let Some(hook) = SPAWN_HOOK.get() {
35        hook(command)
36    } else {
37        command.output()
38    }
39}
40
/// Seconds in one day; used to express a commit's age in days.
const SECS_PER_DAY: f64 = 86_400.0;

/// Half-life of the recency weight, in days. A commit that is 90 days old
/// counts half as much as one made today; at 180 days it counts 25%.
const HALF_LIFE_DAYS: f64 = 90.0;

/// The `schema` value a `--churn-file` document must declare to be accepted.
const CHURN_FILE_SCHEMA: &str = "fallow-churn/v1";

/// Largest number of events accepted from an imported churn file. Anything
/// bigger points at a pathological export (a whole-history dump of a giant
/// monorepo) rather than a useful hotspot window, so such a document is
/// rejected instead of being folded into churn state. Mirrors the diff
/// parser's `MAX_ADDED_LINES` guard in the CLI.
const MAX_CHURN_EVENTS: usize = 5_000_000;

/// How far into the future (in seconds; one year) an imported `timestamp` may
/// lie before it is rejected. A unix-seconds commit time is never
/// legitimately this far ahead even with clock skew, so a larger value is
/// almost always a millisecond timestamp (~52000 years out) or corruption.
/// It is reported loudly because the recency decay uses `saturating_sub`: a
/// future timestamp would otherwise clamp to age 0, give every commit full
/// weight, and silently collapse the recency signal that distinguishes recent
/// from old churn.
const MAX_FUTURE_TIMESTAMP_SECS: u64 = 365 * 24 * 60 * 60;
66
/// A `--since` argument after parsing, in the two forms the rest of the
/// pipeline needs.
#[derive(Clone, Debug)]
pub struct SinceDuration {
    /// Text handed to `git log --after`, e.g. `"6 months ago"` or
    /// `"2025-06-01"`.
    pub git_after: String,
    /// Text shown to the user, e.g. `"6 months"`.
    pub display: String,
}
75
/// What one author contributed to one file within the analysis window.
///
/// The author itself is not stored here: entries are keyed by an index into
/// [`ChurnResult::author_pool`], which keeps per-file maps small and the
/// bitcode cache compact.
#[derive(Clone, Copy, Debug)]
pub struct AuthorContribution {
    /// Number of commits by this author that touched the file.
    pub commits: u32,
    /// Sum of the recency weights of those commits (exponential decay with a
    /// 90-day half-life).
    pub weighted_commits: f64,
    /// Timestamp of the author's earliest commit to the file (epoch seconds).
    pub first_commit_ts: u64,
    /// Timestamp of the author's latest commit to the file (epoch seconds).
    pub last_commit_ts: u64,
}
91
92/// Per-file churn data collected from git history.
93#[derive(Debug, Clone)]
94pub struct FileChurn {
95    /// Absolute file path.
96    pub path: PathBuf,
97    /// Total number of commits touching this file in the analysis window.
98    pub commits: u32,
99    /// Recency-weighted commit count (exponential decay, half-life 90 days).
100    pub weighted_commits: f64,
101    /// Total lines added across all commits.
102    pub lines_added: u32,
103    /// Total lines deleted across all commits.
104    pub lines_deleted: u32,
105    /// Churn trend: accelerating, stable, or cooling.
106    pub trend: ChurnTrend,
107    /// Per-author contributions keyed by interned author index.
108    /// Indices reference [`ChurnResult::author_pool`].
109    pub authors: FxHashMap<u32, AuthorContribution>,
110}
111
112/// Result of churn analysis.
113#[derive(Debug)]
114pub struct ChurnResult {
115    /// Per-file churn data, keyed by absolute path.
116    pub files: FxHashMap<PathBuf, FileChurn>,
117    /// Whether the repository is a shallow clone.
118    pub shallow_clone: bool,
119    /// Author email pool. Per-file [`AuthorContribution`] entries reference
120    /// authors by their index into this vector.
121    pub author_pool: Vec<String>,
122}
123
124/// Parse a `--since` value into a git-compatible duration.
125///
126/// Accepts:
127/// - Durations: `6m`, `6months`, `90d`, `90days`, `1y`, `1year`, `2w`, `2weeks`
128/// - ISO dates: `2025-06-01`
129///
130/// # Errors
131///
132/// Returns an error if the input is not a recognized duration format or ISO date,
133/// the numeric part is invalid, or the duration is zero.
134pub fn parse_since(input: &str) -> Result<SinceDuration, String> {
135    if is_iso_date(input) {
136        return Ok(SinceDuration {
137            git_after: input.to_string(),
138            display: input.to_string(),
139        });
140    }
141
142    let (num_str, unit) = split_number_unit(input)?;
143    let num: u64 = num_str
144        .parse()
145        .map_err(|_| format!("invalid number in --since: {input}"))?;
146
147    if num == 0 {
148        return Err("--since duration must be greater than 0".to_string());
149    }
150
151    match unit {
152        "d" | "day" | "days" => {
153            let s = if num == 1 { "" } else { "s" };
154            Ok(SinceDuration {
155                git_after: format!("{num} day{s} ago"),
156                display: format!("{num} day{s}"),
157            })
158        }
159        "w" | "week" | "weeks" => {
160            let s = if num == 1 { "" } else { "s" };
161            Ok(SinceDuration {
162                git_after: format!("{num} week{s} ago"),
163                display: format!("{num} week{s}"),
164            })
165        }
166        "m" | "month" | "months" => {
167            let s = if num == 1 { "" } else { "s" };
168            Ok(SinceDuration {
169                git_after: format!("{num} month{s} ago"),
170                display: format!("{num} month{s}"),
171            })
172        }
173        "y" | "year" | "years" => {
174            let s = if num == 1 { "" } else { "s" };
175            Ok(SinceDuration {
176                git_after: format!("{num} year{s} ago"),
177                display: format!("{num} year{s}"),
178            })
179        }
180        _ => Err(format!(
181            "unknown duration unit '{unit}' in --since. Use d/w/m/y (e.g., 6m, 90d, 1y)"
182        )),
183    }
184}
185
186/// Analyze git churn for files in the given root directory.
187///
188/// Returns `None` if git is not available or the directory is not a git repository.
189pub fn analyze_churn(root: &Path, since: &SinceDuration) -> Option<ChurnResult> {
190    let shallow = is_shallow_clone(root);
191    let state = analyze_churn_events(root, since, None)?;
192    Some(build_churn_result(state, shallow))
193}
194
195/// A `fallow-churn/v1` import document: a normalized, VCS-agnostic stand-in for
196/// `git log --numstat` output. Unknown fields are ignored (no
197/// `deny_unknown_fields`) so wrappers may carry extra metadata and so the
198/// reserved `commit` field can be added in a future revision without breaking
199/// v1 consumers.
200#[derive(Debug, Deserialize)]
201struct ChurnFileDoc {
202    schema: String,
203    #[serde(default)]
204    events: Vec<ChurnFileEvent>,
205}
206
207/// One per-(commit, file) change event, the natural shape of a `<vcs> log
208/// --numstat` row. `commit` is intentionally NOT a field: extra keys are
209/// already ignored, so a wrapper emitting `commit` is forward-compatible and a
210/// future revision can promote it to a real field without a breaking change.
211#[derive(Debug, Deserialize)]
212struct ChurnFileEvent {
213    /// Repo-root-relative, forward-slash path. Joined to `root`.
214    path: String,
215    /// Commit time, unix SECONDS UTC (not milliseconds).
216    timestamp: u64,
217    /// Opaque author identity (email recommended); absent contributes no
218    /// ownership signal. fallow does NOT apply mailmap to imported authors.
219    #[serde(default)]
220    author: Option<String>,
221    /// Lines added in this file in this commit.
222    added: u32,
223    /// Lines deleted in this file in this commit.
224    deleted: u32,
225}
226
227/// Build churn data from a normalized `fallow-churn/v1` JSON import instead of
228/// `git log`. Lets projects on a non-git VCS (Yandex Arc, Mercurial, Perforce)
229/// feed change history into hotspot / ownership / bus-factor analysis: a small
230/// wrapper translates the VCS log into the contract and fallow runs all the
231/// usual recency-weighting, trend, and ownership logic on the imported events.
232///
233/// `root` is the project root that relative event paths are joined to (matching
234/// how the git path joins numstat paths), so the churn keys line up with the
235/// analyzed files. Returns a human-readable error (the CLI maps it to exit code
236/// 2) on a missing file, malformed JSON, wrong `schema`, an empty event path, a
237/// far-future timestamp, or an event count past `MAX_CHURN_EVENTS`. An empty
238/// `events` array is valid (no hotspots), not an error. Never runs `git`.
239pub fn analyze_churn_from_file(path: &Path, root: &Path) -> Result<ChurnResult, String> {
240    let raw = std::fs::read_to_string(path)
241        .map_err(|e| format!("failed to read churn file {}: {e}", path.display()))?;
242    let doc: ChurnFileDoc = serde_json::from_str(&raw)
243        .map_err(|e| format!("failed to parse churn file {}: {e}", path.display()))?;
244    if doc.schema != CHURN_FILE_SCHEMA {
245        return Err(format!(
246            "churn file {} declares schema \"{}\", expected \"{CHURN_FILE_SCHEMA}\"",
247            path.display(),
248            doc.schema
249        ));
250    }
251    if doc.events.len() > MAX_CHURN_EVENTS {
252        return Err(format!(
253            "churn file {} has {} events, exceeding the {MAX_CHURN_EVENTS} limit",
254            path.display(),
255            doc.events.len()
256        ));
257    }
258
259    let state = churn_event_state_from_doc(&doc, path, root)?;
260    Ok(build_churn_result(state, false))
261}
262
263/// Validate and fold a parsed `fallow-churn/v1` document into event state.
264///
265/// Rejects empty paths and far-future (likely millisecond) timestamps; interns
266/// authors into the pool exactly as the git-log path does.
267fn churn_event_state_from_doc(
268    doc: &ChurnFileDoc,
269    path: &Path,
270    root: &Path,
271) -> Result<ChurnEventState, String> {
272    let mut builder = ChurnFileImportBuilder::new(path, root, churn_file_future_limit());
273
274    for event in &doc.events {
275        builder.push_event(event)?;
276    }
277
278    Ok(builder.finish())
279}
280
281fn churn_file_future_limit() -> u64 {
282    let now_secs = std::time::SystemTime::now()
283        .duration_since(std::time::UNIX_EPOCH)
284        .unwrap_or_default()
285        .as_secs();
286    now_secs.saturating_add(MAX_FUTURE_TIMESTAMP_SECS)
287}
288
289struct ChurnFileImportBuilder<'a> {
290    path: &'a Path,
291    root: &'a Path,
292    future_limit: u64,
293    files: FxHashMap<PathBuf, FileEvents>,
294    author_pool: Vec<String>,
295    author_index: FxHashMap<String, u32>,
296}
297
298impl<'a> ChurnFileImportBuilder<'a> {
299    fn new(path: &'a Path, root: &'a Path, future_limit: u64) -> Self {
300        Self {
301            path,
302            root,
303            future_limit,
304            files: FxHashMap::default(),
305            author_pool: Vec::new(),
306            author_index: FxHashMap::default(),
307        }
308    }
309
310    fn push_event(&mut self, event: &ChurnFileEvent) -> Result<(), String> {
311        let rel = normalize_churn_event_path(self.path, &event.path)?;
312        validate_churn_event_timestamp(self.path, event.timestamp, self.future_limit, &rel)?;
313
314        let abs_path = self.root.join(&rel);
315        let author_idx = self.intern_author(event.author.as_deref());
316        self.files
317            .entry(abs_path)
318            .or_insert_with(|| FileEvents { events: Vec::new() })
319            .events
320            .push(CachedCommitEvent {
321                timestamp: event.timestamp,
322                lines_added: event.added,
323                lines_deleted: event.deleted,
324                author_idx,
325            });
326        Ok(())
327    }
328
329    fn intern_author(&mut self, author: Option<&str>) -> Option<u32> {
330        author
331            .map(str::trim)
332            .filter(|email| !email.is_empty())
333            .map(|email| intern_author(email, &mut self.author_pool, &mut self.author_index))
334    }
335
336    fn finish(self) -> ChurnEventState {
337        ChurnEventState {
338            files: self.files,
339            author_pool: self.author_pool,
340        }
341    }
342}
343
/// Normalize an imported event path: strip surrounding whitespace and turn
/// backslashes into forward slashes.
///
/// `path` is the churn file being imported and only appears in the error
/// message, which is returned when nothing is left after trimming.
fn normalize_churn_event_path(path: &Path, event_path: &str) -> Result<String, String> {
    // Trimming first is equivalent to trimming after the replacement, since
    // neither `\` nor `/` is whitespace.
    let trimmed = event_path.trim();
    if trimmed.is_empty() {
        return Err(format!(
            "churn file {} has an event with an empty path",
            path.display()
        ));
    }
    Ok(trimmed.replace('\\', "/"))
}
355
/// Accept an imported event timestamp only if it does not exceed
/// `future_limit`.
///
/// `path` (the churn file) and `rel` (the event's normalized path) are used
/// solely to build the error message.
fn validate_churn_event_timestamp(
    path: &Path,
    timestamp: u64,
    future_limit: u64,
    rel: &str,
) -> Result<(), String> {
    if timestamp > future_limit {
        return Err(format!(
            "churn file {} has event timestamp {timestamp} for \"{rel}\" more than a year \
             in the future; timestamps must be unix SECONDS (not milliseconds), UTC",
            path.display()
        ));
    }
    Ok(())
}
373
374/// Check if the repository is a shallow clone.
375#[must_use]
376pub fn is_shallow_clone(root: &Path) -> bool {
377    let mut command = crate::spawn::git();
378    command
379        .args(["rev-parse", "--is-shallow-repository"])
380        .current_dir(root);
381    command.output().is_ok_and(|o| {
382        String::from_utf8_lossy(&o.stdout)
383            .trim()
384            .eq_ignore_ascii_case("true")
385    })
386}
387
388/// Check if the directory is inside a git repository.
389#[must_use]
390pub fn is_git_repo(root: &Path) -> bool {
391    let mut command = crate::spawn::git();
392    command
393        .args(["rev-parse", "--git-dir"])
394        .current_dir(root)
395        .stdout(std::process::Stdio::null())
396        .stderr(std::process::Stdio::null());
397    command.status().is_ok_and(|s| s.success())
398}
399
/// Largest churn cache file that will be accepted, in bytes (64 MB). The
/// incremental cache stores per-commit events, so it needs more headroom
/// than the old aggregate rows did.
const MAX_CHURN_CACHE_SIZE: usize = 64 * 1024 * 1024;

/// Version stamp of the on-disk cache format. It must be bumped whenever the
/// serialized shape of [`ChurnCache`] changes, so that payloads written by an
/// older version are rejected on load. It became 3 when the cache switched
/// from aggregate rows to per-commit events for incremental updates.
const CHURN_CACHE_VERSION: u8 = 3;
408
409/// Serializable per-commit event for the disk cache.
410#[derive(Clone, bitcode::Encode, bitcode::Decode)]
411struct CachedCommitEvent {
412    timestamp: u64,
413    lines_added: u32,
414    lines_deleted: u32,
415    author_idx: Option<u32>,
416}
417
418/// Serializable per-file churn entry for the disk cache.
419#[derive(Clone, bitcode::Encode, bitcode::Decode)]
420struct CachedFileChurn {
421    path: String,
422    events: Vec<CachedCommitEvent>,
423}
424
425/// Cached churn data keyed by last indexed SHA and since string.
426#[derive(Clone, bitcode::Encode, bitcode::Decode)]
427struct ChurnCache {
428    /// Schema version; must equal [`CHURN_CACHE_VERSION`] to be accepted.
429    version: u8,
430    last_indexed_sha: String,
431    git_after: String,
432    files: Vec<CachedFileChurn>,
433    shallow_clone: bool,
434    /// Author email pool referenced by [`CachedCommitEvent::author_idx`].
435    author_pool: Vec<String>,
436}
437
438/// Per-file commit events retained in memory while building or updating churn.
439struct FileEvents {
440    events: Vec<CachedCommitEvent>,
441}
442
443/// Event-level churn state. Unlike [`ChurnResult`], this preserves commit
444/// timestamps so a cache can merge new commits and recompute trend/recency.
445struct ChurnEventState {
446    files: FxHashMap<PathBuf, FileEvents>,
447    author_pool: Vec<String>,
448}
449
450/// Get the full HEAD SHA for cache keying.
451fn get_head_sha(root: &Path) -> Option<String> {
452    let mut command = crate::spawn::git();
453    command.args(["rev-parse", "HEAD"]).current_dir(root);
454    command
455        .output()
456        .ok()
457        .filter(|o| o.status.success())
458        .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_string())
459}
460
461/// Check whether `ancestor` is still reachable from `descendant`.
462fn is_ancestor(root: &Path, ancestor: &str, descendant: &str) -> bool {
463    let mut command = crate::spawn::git();
464    command
465        .args(["merge-base", "--is-ancestor", ancestor, descendant])
466        .current_dir(root);
467    command.status().is_ok_and(|s| s.success())
468}
469
470/// Try to load churn data from disk cache. Returns `None` on cache miss
471/// or version mismatch.
472fn load_churn_cache(cache_dir: &Path, git_after: &str) -> Option<ChurnCache> {
473    let cache_file = cache_dir.join("churn.bin");
474    let data = std::fs::read(&cache_file).ok()?;
475    if data.len() > MAX_CHURN_CACHE_SIZE {
476        return None;
477    }
478    let cache: ChurnCache = bitcode::decode(&data).ok()?;
479    if cache.version != CHURN_CACHE_VERSION || cache.git_after != git_after {
480        return None;
481    }
482    Some(cache)
483}
484
485/// Save churn data to disk cache.
486fn save_churn_cache(
487    cache_dir: &Path,
488    last_indexed_sha: &str,
489    git_after: &str,
490    state: &ChurnEventState,
491    shallow_clone: bool,
492) {
493    let files: Vec<CachedFileChurn> = state
494        .files
495        .iter()
496        .map(|f| CachedFileChurn {
497            path: f.0.to_string_lossy().to_string(),
498            events: f.1.events.clone(),
499        })
500        .collect();
501    let cache = ChurnCache {
502        version: CHURN_CACHE_VERSION,
503        last_indexed_sha: last_indexed_sha.to_string(),
504        git_after: git_after.to_string(),
505        files,
506        shallow_clone,
507        author_pool: state.author_pool.clone(),
508    };
509    let _ = std::fs::create_dir_all(cache_dir);
510    let data = bitcode::encode(&cache);
511    let tmp = cache_dir.join("churn.bin.tmp");
512    if std::fs::write(&tmp, data).is_ok() {
513        let _ = std::fs::rename(&tmp, cache_dir.join("churn.bin"));
514    }
515}
516
517/// Analyze churn with disk caching. Uses cached result when HEAD SHA and
518/// since duration match. If HEAD advanced from the cached SHA, runs an
519/// incremental `git log <cached>..HEAD --numstat` scan and merges it.
520///
521/// Returns `(ChurnResult, bool)` where the bool indicates whether reusable
522/// cache state was used.
523/// Returns `None` if git analysis fails.
524pub fn analyze_churn_cached(
525    root: &Path,
526    since: &SinceDuration,
527    cache_dir: &Path,
528    no_cache: bool,
529) -> Option<(ChurnResult, bool)> {
530    let head_sha = get_head_sha(root)?;
531
532    if !no_cache && let Some(result) = try_reuse_churn_cache(root, since, cache_dir, &head_sha) {
533        return Some((result, true));
534    }
535
536    analyze_fresh_churn(root, since, cache_dir, no_cache, &head_sha).map(|result| (result, false))
537}
538
539fn try_reuse_churn_cache(
540    root: &Path,
541    since: &SinceDuration,
542    cache_dir: &Path,
543    head_sha: &str,
544) -> Option<ChurnResult> {
545    let cache = load_churn_cache(cache_dir, &since.git_after)?;
546    if cache.last_indexed_sha == head_sha {
547        let shallow_clone = cache.shallow_clone;
548        return Some(build_churn_result(cache.into_event_state(), shallow_clone));
549    }
550
551    if !is_ancestor(root, &cache.last_indexed_sha, head_sha) {
552        return None;
553    }
554
555    extend_churn_cache(root, since, cache_dir, head_sha, cache)
556}
557
558fn extend_churn_cache(
559    root: &Path,
560    since: &SinceDuration,
561    cache_dir: &Path,
562    head_sha: &str,
563    cache: ChurnCache,
564) -> Option<ChurnResult> {
565    let shallow_clone = is_shallow_clone(root);
566    let range = format!("{}..HEAD", cache.last_indexed_sha);
567    let delta = analyze_churn_events(root, since, Some(&range))?;
568    let mut state = cache.into_event_state();
569    merge_churn_states(&mut state, delta);
570    save_churn_cache(cache_dir, head_sha, &since.git_after, &state, shallow_clone);
571    Some(build_churn_result(state, shallow_clone))
572}
573
574fn analyze_fresh_churn(
575    root: &Path,
576    since: &SinceDuration,
577    cache_dir: &Path,
578    no_cache: bool,
579    head_sha: &str,
580) -> Option<ChurnResult> {
581    let shallow_clone = is_shallow_clone(root);
582    let state = analyze_churn_events(root, since, None)?;
583    if !no_cache {
584        save_churn_cache(cache_dir, head_sha, &since.git_after, &state, shallow_clone);
585    }
586
587    Some(build_churn_result(state, shallow_clone))
588}
589
590impl ChurnCache {
591    fn into_event_state(self) -> ChurnEventState {
592        let files = self
593            .files
594            .into_iter()
595            .map(|entry| {
596                (
597                    PathBuf::from(entry.path),
598                    FileEvents {
599                        events: entry.events,
600                    },
601                )
602            })
603            .collect();
604        ChurnEventState {
605            files,
606            author_pool: self.author_pool,
607        }
608    }
609}
610
611/// Run `git log --numstat` and return event-level churn state.
612fn analyze_churn_events(
613    root: &Path,
614    since: &SinceDuration,
615    revision_range: Option<&str>,
616) -> Option<ChurnEventState> {
617    let mut command = crate::spawn::git();
618    command.arg("log");
619    if let Some(range) = revision_range {
620        command.arg(range);
621    }
622    command
623        .args([
624            "--numstat",
625            "--no-merges",
626            "--no-renames",
627            "--use-mailmap",
628            "--format=format:%at|%ae",
629            &format!("--after={}", since.git_after),
630        ])
631        .current_dir(root);
632
633    let output = match spawn_output(&mut command) {
634        Ok(o) => o,
635        Err(e) => {
636            tracing::warn!("hotspot analysis skipped: failed to run git: {e}");
637            return None;
638        }
639    };
640
641    if !output.status.success() {
642        let stderr = String::from_utf8_lossy(&output.stderr);
643        tracing::warn!("hotspot analysis skipped: git log failed: {stderr}");
644        return None;
645    }
646
647    let stdout = String::from_utf8_lossy(&output.stdout);
648    Some(parse_git_log_events(&stdout, root))
649}
650
651/// Merge new churn events into cached event state.
652fn merge_churn_states(base: &mut ChurnEventState, delta: ChurnEventState) {
653    let mut base_author_index: FxHashMap<String, u32> = base
654        .author_pool
655        .iter()
656        .enumerate()
657        .filter_map(|(idx, email)| u32::try_from(idx).ok().map(|idx| (email.clone(), idx)))
658        .collect();
659
660    let mut author_mapping: FxHashMap<u32, u32> = FxHashMap::default();
661    for (old_idx, email) in delta.author_pool.into_iter().enumerate() {
662        let Ok(old_idx) = u32::try_from(old_idx) else {
663            continue;
664        };
665        let new_idx = intern_author(&email, &mut base.author_pool, &mut base_author_index);
666        author_mapping.insert(old_idx, new_idx);
667    }
668
669    for (path, mut file) in delta.files {
670        for event in &mut file.events {
671            event.author_idx = event
672                .author_idx
673                .and_then(|idx| author_mapping.get(&idx).copied());
674        }
675        base.files
676            .entry(path)
677            .and_modify(|existing| existing.events.append(&mut file.events))
678            .or_insert(file);
679    }
680}
681
682/// Parse `git log --numstat --format=format:%at|%ae` output into events.
683fn parse_git_log_events(stdout: &str, root: &Path) -> ChurnEventState {
684    let now_secs = std::time::SystemTime::now()
685        .duration_since(std::time::UNIX_EPOCH)
686        .unwrap_or_default()
687        .as_secs();
688
689    let mut parser = GitLogEventParser::new(root, now_secs);
690
691    for line in stdout.lines() {
692        parser.consume_line(line);
693    }
694
695    parser.finish()
696}
697
698struct GitLogEventParser<'a> {
699    root: &'a Path,
700    now_secs: u64,
701    files: FxHashMap<PathBuf, FileEvents>,
702    author_pool: Vec<String>,
703    author_index: FxHashMap<String, u32>,
704    current_timestamp: Option<u64>,
705    current_author_idx: Option<u32>,
706}
707
708impl<'a> GitLogEventParser<'a> {
709    fn new(root: &'a Path, now_secs: u64) -> Self {
710        Self {
711            root,
712            now_secs,
713            files: FxHashMap::default(),
714            author_pool: Vec::new(),
715            author_index: FxHashMap::default(),
716            current_timestamp: None,
717            current_author_idx: None,
718        }
719    }
720
721    fn consume_line(&mut self, line: &str) {
722        let line = line.trim();
723        if line.is_empty() {
724            return;
725        }
726
727        if self.record_commit_header(line) {
728            return;
729        }
730        if self.record_legacy_timestamp(line) {
731            return;
732        }
733        self.record_numstat(line);
734    }
735
736    fn record_commit_header(&mut self, line: &str) -> bool {
737        let Some((ts_str, email)) = line.split_once('|') else {
738            return false;
739        };
740        let Ok(ts) = ts_str.parse::<u64>() else {
741            return false;
742        };
743
744        self.current_timestamp = Some(ts);
745        self.current_author_idx = Some(intern_author(
746            email,
747            &mut self.author_pool,
748            &mut self.author_index,
749        ));
750        true
751    }
752
753    fn record_legacy_timestamp(&mut self, line: &str) -> bool {
754        let Ok(ts) = line.parse::<u64>() else {
755            return false;
756        };
757
758        self.current_timestamp = Some(ts);
759        self.current_author_idx = None;
760        true
761    }
762
763    fn record_numstat(&mut self, line: &str) {
764        let Some((added, deleted, path)) = parse_numstat_line(line) else {
765            return;
766        };
767
768        let ts = self.current_timestamp.unwrap_or(self.now_secs);
769        self.files
770            .entry(self.root.join(path))
771            .or_insert_with(|| FileEvents { events: Vec::new() })
772            .events
773            .push(CachedCommitEvent {
774                timestamp: ts,
775                lines_added: added,
776                lines_deleted: deleted,
777                author_idx: self.current_author_idx,
778            });
779    }
780
781    fn finish(self) -> ChurnEventState {
782        ChurnEventState {
783            files: self.files,
784            author_pool: self.author_pool,
785        }
786    }
787}
788
789/// Aggregate one file's raw commit events into a [`FileChurn`], applying
790/// recency weighting, trend detection, and per-author accumulation.
791#[expect(
792    clippy::cast_possible_truncation,
793    reason = "commit count per file is bounded by git history depth"
794)]
795fn aggregate_file_churn(path: PathBuf, file: FileEvents, now_secs: u64) -> FileChurn {
796    let mut timestamps = Vec::with_capacity(file.events.len());
797    let mut weighted_commits = 0.0;
798    let mut lines_added = 0;
799    let mut lines_deleted = 0;
800    let mut authors: FxHashMap<u32, AuthorContribution> = FxHashMap::default();
801
802    for event in file.events {
803        timestamps.push(event.timestamp);
804        let age_days = (now_secs.saturating_sub(event.timestamp)) as f64 / SECS_PER_DAY;
805        let weight = 0.5_f64.powf(age_days / HALF_LIFE_DAYS);
806        weighted_commits += weight;
807        lines_added += event.lines_added;
808        lines_deleted += event.lines_deleted;
809        accumulate_author(&mut authors, event.author_idx, weight, event.timestamp);
810    }
811
812    let commits = timestamps.len() as u32;
813    let trend = compute_trend(&timestamps);
814    for c in authors.values_mut() {
815        c.weighted_commits = (c.weighted_commits * 100.0).round() / 100.0;
816    }
817    FileChurn {
818        path,
819        commits,
820        weighted_commits: (weighted_commits * 100.0).round() / 100.0,
821        lines_added,
822        lines_deleted,
823        trend,
824        authors,
825    }
826}
827
828/// Fold a single commit's author contribution into the per-author map.
829fn accumulate_author(
830    authors: &mut FxHashMap<u32, AuthorContribution>,
831    author_idx: Option<u32>,
832    weight: f64,
833    timestamp: u64,
834) {
835    let Some(idx) = author_idx else {
836        return;
837    };
838    authors
839        .entry(idx)
840        .and_modify(|c| {
841            c.commits += 1;
842            c.weighted_commits += weight;
843            c.first_commit_ts = c.first_commit_ts.min(timestamp);
844            c.last_commit_ts = c.last_commit_ts.max(timestamp);
845        })
846        .or_insert(AuthorContribution {
847            commits: 1,
848            weighted_commits: weight,
849            first_commit_ts: timestamp,
850            last_commit_ts: timestamp,
851        });
852}
853
854/// Convert event-level churn state into the public aggregate result.
855fn build_churn_result(state: ChurnEventState, shallow_clone: bool) -> ChurnResult {
856    let now_secs = std::time::SystemTime::now()
857        .duration_since(std::time::UNIX_EPOCH)
858        .unwrap_or_default()
859        .as_secs();
860
861    let files = state
862        .files
863        .into_iter()
864        .map(|(path, file)| {
865            let churn = aggregate_file_churn(path.clone(), file, now_secs);
866            (path, churn)
867        })
868        .collect();
869
870    ChurnResult {
871        files,
872        shallow_clone,
873        author_pool: state.author_pool,
874    }
875}
876
/// Test helper: parse `git log --numstat --format=format:%at|%ae` output and
/// aggregate it in one step.
///
/// Yields the per-file churn map together with the author email pool that
/// the indices in [`FileChurn::authors`] refer to.
#[cfg(test)]
fn parse_git_log(stdout: &str, root: &Path) -> (FxHashMap<PathBuf, FileChurn>, Vec<String>) {
    let ChurnResult {
        files, author_pool, ..
    } = build_churn_result(parse_git_log_events(stdout, root), false);
    (files, author_pool)
}
886
/// Look up the stable pool index for `email`, registering it on first sight.
///
/// `index` maps every already-interned email to its position in `pool`. An
/// unknown email is appended to `pool` and recorded in `index` under that new
/// position, so indices are handed out sequentially starting at 0.
fn intern_author(email: &str, pool: &mut Vec<String>, index: &mut FxHashMap<String, u32>) -> u32 {
    if let Some(&known) = index.get(email) {
        known
    } else {
        #[expect(
            clippy::cast_possible_truncation,
            reason = "author count is bounded by git history; u32 is far above any realistic ceiling"
        )]
        let assigned = pool.len() as u32;
        // Both containers own their own copy of the email.
        pool.push(email.to_owned());
        index.insert(email.to_owned(), assigned);
        assigned
    }
}
902
/// Split one `git log --numstat` line of the form `"10\t5\tpath/to/file.ts"`
/// into `(lines_added, lines_deleted, path)`.
///
/// Returns `None` when the line has fewer than three tab-separated fields or
/// when either count is not a valid `u32`. The latter covers binary files,
/// which git reports as `"-\t-\tpath"`. Everything after the second tab is
/// treated as the path, including any further tabs.
fn parse_numstat_line(line: &str) -> Option<(u32, u32, &str)> {
    let (added_field, remainder) = line.split_once('\t')?;
    let (deleted_field, path) = remainder.split_once('\t')?;

    let added = added_field.parse::<u32>().ok()?;
    let deleted = deleted_field.parse::<u32>().ok()?;

    Some((added, deleted, path))
}
916
917/// Compute churn trend by splitting commits into two temporal halves.
918///
919/// Finds the midpoint between the oldest and newest commit timestamps,
920/// then compares commit counts in each half:
921/// - Recent > 1.5× older → Accelerating
922/// - Recent < 0.67× older → Cooling
923/// - Otherwise → Stable
924fn compute_trend(timestamps: &[u64]) -> ChurnTrend {
925    if timestamps.len() < 2 {
926        return ChurnTrend::Stable;
927    }
928
929    let min_ts = timestamps.iter().copied().min().unwrap_or(0);
930    let max_ts = timestamps.iter().copied().max().unwrap_or(0);
931
932    if max_ts == min_ts {
933        return ChurnTrend::Stable;
934    }
935
936    let midpoint = min_ts + (max_ts - min_ts) / 2;
937    let recent = timestamps.iter().filter(|&&ts| ts > midpoint).count() as f64;
938    let older = timestamps.iter().filter(|&&ts| ts <= midpoint).count() as f64;
939
940    if older < 1.0 {
941        return ChurnTrend::Stable;
942    }
943
944    let ratio = recent / older;
945    if ratio > 1.5 {
946        ChurnTrend::Accelerating
947    } else if ratio < 0.67 {
948        ChurnTrend::Cooling
949    } else {
950        ChurnTrend::Stable
951    }
952}
953
/// Report whether `input` has the exact `YYYY-MM-DD` shape.
///
/// This is a purely syntactic check: the string must be ten bytes long, with
/// `-` at byte offsets 4 and 7 and ASCII digits everywhere else. Month and
/// day values are not range-checked.
fn is_iso_date(input: &str) -> bool {
    let bytes = input.as_bytes();
    bytes.len() == 10
        && bytes.iter().enumerate().all(|(offset, byte)| match offset {
            4 | 7 => *byte == b'-',
            _ => byte.is_ascii_digit(),
        })
}
962
/// Split a `--since` value such as `"90d"` into its leading run of ASCII
/// digits and the remaining unit suffix.
///
/// # Errors
///
/// Returns a user-facing message when `input` consists only of digits (or is
/// empty), so no unit suffix exists, or when it does not begin with a digit.
fn split_number_unit(input: &str) -> Result<(&str, &str), String> {
    match input.find(|c: char| !c.is_ascii_digit()) {
        None => Err(format!(
            "--since requires a unit suffix (e.g., 6m, 90d, 1y), got: {input}"
        )),
        Some(0) => Err(format!(
            "--since must start with a number (e.g., 6m, 90d, 1y), got: {input}"
        )),
        // Everything before `boundary` is ASCII digits, so this is a valid split point.
        Some(boundary) => Ok(input.split_at(boundary)),
    }
}
974
975#[cfg(test)]
976mod tests {
977    use super::*;
978
979    #[test]
980    fn parse_since_months_short() {
981        let d = parse_since("6m").unwrap();
982        assert_eq!(d.git_after, "6 months ago");
983        assert_eq!(d.display, "6 months");
984    }
985
986    #[test]
987    fn parse_since_months_long() {
988        let d = parse_since("6months").unwrap();
989        assert_eq!(d.git_after, "6 months ago");
990        assert_eq!(d.display, "6 months");
991    }
992
993    #[test]
994    fn parse_since_days() {
995        let d = parse_since("90d").unwrap();
996        assert_eq!(d.git_after, "90 days ago");
997        assert_eq!(d.display, "90 days");
998    }
999
1000    #[test]
1001    fn parse_since_year_singular() {
1002        let d = parse_since("1y").unwrap();
1003        assert_eq!(d.git_after, "1 year ago");
1004        assert_eq!(d.display, "1 year");
1005    }
1006
1007    #[test]
1008    fn parse_since_years_plural() {
1009        let d = parse_since("2years").unwrap();
1010        assert_eq!(d.git_after, "2 years ago");
1011        assert_eq!(d.display, "2 years");
1012    }
1013
1014    #[test]
1015    fn parse_since_weeks() {
1016        let d = parse_since("2w").unwrap();
1017        assert_eq!(d.git_after, "2 weeks ago");
1018        assert_eq!(d.display, "2 weeks");
1019    }
1020
1021    #[test]
1022    fn parse_since_iso_date() {
1023        let d = parse_since("2025-06-01").unwrap();
1024        assert_eq!(d.git_after, "2025-06-01");
1025        assert_eq!(d.display, "2025-06-01");
1026    }
1027
1028    #[test]
1029    fn parse_since_month_singular() {
1030        let d = parse_since("1month").unwrap();
1031        assert_eq!(d.display, "1 month");
1032    }
1033
1034    #[test]
1035    fn parse_since_day_singular() {
1036        let d = parse_since("1day").unwrap();
1037        assert_eq!(d.display, "1 day");
1038    }
1039
1040    #[test]
1041    fn parse_since_zero_rejected() {
1042        assert!(parse_since("0m").is_err());
1043    }
1044
1045    #[test]
1046    fn parse_since_no_unit_rejected() {
1047        assert!(parse_since("90").is_err());
1048    }
1049
1050    #[test]
1051    fn parse_since_unknown_unit_rejected() {
1052        assert!(parse_since("6x").is_err());
1053    }
1054
1055    #[test]
1056    fn parse_since_no_number_rejected() {
1057        assert!(parse_since("months").is_err());
1058    }
1059
1060    #[test]
1061    fn numstat_normal() {
1062        let (a, d, p) = parse_numstat_line("10\t5\tsrc/file.ts").unwrap();
1063        assert_eq!(a, 10);
1064        assert_eq!(d, 5);
1065        assert_eq!(p, "src/file.ts");
1066    }
1067
1068    #[test]
1069    fn numstat_binary_skipped() {
1070        assert!(parse_numstat_line("-\t-\tsrc/image.png").is_none());
1071    }
1072
1073    #[test]
1074    fn numstat_zero_lines() {
1075        let (a, d, p) = parse_numstat_line("0\t0\tsrc/empty.ts").unwrap();
1076        assert_eq!(a, 0);
1077        assert_eq!(d, 0);
1078        assert_eq!(p, "src/empty.ts");
1079    }
1080
1081    #[test]
1082    fn trend_empty_is_stable() {
1083        assert_eq!(compute_trend(&[]), ChurnTrend::Stable);
1084    }
1085
1086    #[test]
1087    fn trend_single_commit_is_stable() {
1088        assert_eq!(compute_trend(&[100]), ChurnTrend::Stable);
1089    }
1090
1091    #[test]
1092    fn trend_accelerating() {
1093        let timestamps = vec![100, 200, 800, 850, 900, 950, 1000];
1094        assert_eq!(compute_trend(&timestamps), ChurnTrend::Accelerating);
1095    }
1096
1097    #[test]
1098    fn trend_cooling() {
1099        let timestamps = vec![100, 150, 200, 250, 300, 900, 1000];
1100        assert_eq!(compute_trend(&timestamps), ChurnTrend::Cooling);
1101    }
1102
1103    #[test]
1104    fn trend_stable_even_distribution() {
1105        let timestamps = vec![100, 200, 300, 700, 800, 900];
1106        assert_eq!(compute_trend(&timestamps), ChurnTrend::Stable);
1107    }
1108
1109    #[test]
1110    fn trend_same_timestamp_is_stable() {
1111        let timestamps = vec![500, 500, 500];
1112        assert_eq!(compute_trend(&timestamps), ChurnTrend::Stable);
1113    }
1114
1115    #[test]
1116    fn iso_date_valid() {
1117        assert!(is_iso_date("2025-06-01"));
1118        assert!(is_iso_date("2025-12-31"));
1119    }
1120
1121    #[test]
1122    fn iso_date_with_time_rejected() {
1123        assert!(!is_iso_date("2025-06-01T00:00:00"));
1124    }
1125
1126    #[test]
1127    fn iso_date_invalid() {
1128        assert!(!is_iso_date("6months"));
1129        assert!(!is_iso_date("2025"));
1130        assert!(!is_iso_date("not-a-date"));
1131        assert!(!is_iso_date("abcd-ef-gh"));
1132    }
1133
1134    #[test]
1135    fn trend_display() {
1136        assert_eq!(ChurnTrend::Accelerating.to_string(), "accelerating");
1137        assert_eq!(ChurnTrend::Stable.to_string(), "stable");
1138        assert_eq!(ChurnTrend::Cooling.to_string(), "cooling");
1139    }
1140
1141    #[test]
1142    fn parse_git_log_single_commit() {
1143        let root = Path::new("/project");
1144        let output = "1700000000\n10\t5\tsrc/index.ts\n";
1145        let (result, _) = parse_git_log(output, root);
1146        assert_eq!(result.len(), 1);
1147        let churn = &result[&PathBuf::from("/project/src/index.ts")];
1148        assert_eq!(churn.commits, 1);
1149        assert_eq!(churn.lines_added, 10);
1150        assert_eq!(churn.lines_deleted, 5);
1151    }
1152
1153    #[test]
1154    fn parse_git_log_multiple_commits_same_file() {
1155        let root = Path::new("/project");
1156        let output = "1700000000\n10\t5\tsrc/index.ts\n\n1700100000\n3\t2\tsrc/index.ts\n";
1157        let (result, _) = parse_git_log(output, root);
1158        assert_eq!(result.len(), 1);
1159        let churn = &result[&PathBuf::from("/project/src/index.ts")];
1160        assert_eq!(churn.commits, 2);
1161        assert_eq!(churn.lines_added, 13);
1162        assert_eq!(churn.lines_deleted, 7);
1163    }
1164
1165    #[test]
1166    fn parse_git_log_multiple_files() {
1167        let root = Path::new("/project");
1168        let output = "1700000000\n10\t5\tsrc/a.ts\n3\t1\tsrc/b.ts\n";
1169        let (result, _) = parse_git_log(output, root);
1170        assert_eq!(result.len(), 2);
1171        assert!(result.contains_key(&PathBuf::from("/project/src/a.ts")));
1172        assert!(result.contains_key(&PathBuf::from("/project/src/b.ts")));
1173    }
1174
1175    #[test]
1176    fn parse_git_log_empty_output() {
1177        let root = Path::new("/project");
1178        let (result, _) = parse_git_log("", root);
1179        assert!(result.is_empty());
1180    }
1181
1182    #[test]
1183    fn parse_git_log_skips_binary_files() {
1184        let root = Path::new("/project");
1185        let output = "1700000000\n-\t-\timage.png\n10\t5\tsrc/a.ts\n";
1186        let (result, _) = parse_git_log(output, root);
1187        assert_eq!(result.len(), 1);
1188        assert!(!result.contains_key(&PathBuf::from("/project/image.png")));
1189    }
1190
1191    #[test]
1192    fn parse_git_log_weighted_commits_are_positive() {
1193        let root = Path::new("/project");
1194        let now_secs = std::time::SystemTime::now()
1195            .duration_since(std::time::UNIX_EPOCH)
1196            .unwrap()
1197            .as_secs();
1198        let output = format!("{now_secs}\n10\t5\tsrc/a.ts\n");
1199        let (result, _) = parse_git_log(&output, root);
1200        let churn = &result[&PathBuf::from("/project/src/a.ts")];
1201        assert!(
1202            churn.weighted_commits > 0.0,
1203            "weighted_commits should be positive for recent commits"
1204        );
1205    }
1206
1207    #[test]
1208    fn trend_boundary_1_5x_ratio() {
1209        let timestamps = vec![100, 200, 600, 800, 1000];
1210        assert_eq!(compute_trend(&timestamps), ChurnTrend::Stable);
1211    }
1212
1213    #[test]
1214    fn trend_just_above_1_5x() {
1215        let timestamps = vec![100, 600, 800, 1000];
1216        assert_eq!(compute_trend(&timestamps), ChurnTrend::Accelerating);
1217    }
1218
1219    #[test]
1220    fn trend_boundary_0_67x_ratio() {
1221        let timestamps = vec![100, 200, 300, 600, 1000];
1222        assert_eq!(compute_trend(&timestamps), ChurnTrend::Cooling);
1223    }
1224
1225    #[test]
1226    fn trend_two_timestamps_different() {
1227        let timestamps = vec![100, 200];
1228        assert_eq!(compute_trend(&timestamps), ChurnTrend::Stable);
1229    }
1230
1231    #[test]
1232    fn parse_since_week_singular() {
1233        let d = parse_since("1week").unwrap();
1234        assert_eq!(d.git_after, "1 week ago");
1235        assert_eq!(d.display, "1 week");
1236    }
1237
1238    #[test]
1239    fn parse_since_weeks_long() {
1240        let d = parse_since("3weeks").unwrap();
1241        assert_eq!(d.git_after, "3 weeks ago");
1242        assert_eq!(d.display, "3 weeks");
1243    }
1244
1245    #[test]
1246    fn parse_since_days_long() {
1247        let d = parse_since("30days").unwrap();
1248        assert_eq!(d.git_after, "30 days ago");
1249        assert_eq!(d.display, "30 days");
1250    }
1251
1252    #[test]
1253    fn parse_since_year_long() {
1254        let d = parse_since("1year").unwrap();
1255        assert_eq!(d.git_after, "1 year ago");
1256        assert_eq!(d.display, "1 year");
1257    }
1258
1259    #[test]
1260    fn parse_since_overflow_number_rejected() {
1261        let result = parse_since("99999999999999999999d");
1262        assert!(result.is_err());
1263        let err = result.unwrap_err();
1264        assert!(err.contains("invalid number"));
1265    }
1266
1267    #[test]
1268    fn parse_since_zero_days_rejected() {
1269        assert!(parse_since("0d").is_err());
1270    }
1271
1272    #[test]
1273    fn parse_since_zero_weeks_rejected() {
1274        assert!(parse_since("0w").is_err());
1275    }
1276
1277    #[test]
1278    fn parse_since_zero_years_rejected() {
1279        assert!(parse_since("0y").is_err());
1280    }
1281
1282    #[test]
1283    fn numstat_missing_path() {
1284        assert!(parse_numstat_line("10\t5").is_none());
1285    }
1286
1287    #[test]
1288    fn numstat_single_field() {
1289        assert!(parse_numstat_line("10").is_none());
1290    }
1291
1292    #[test]
1293    fn numstat_empty_string() {
1294        assert!(parse_numstat_line("").is_none());
1295    }
1296
1297    #[test]
1298    fn numstat_only_added_is_binary() {
1299        assert!(parse_numstat_line("-\t5\tsrc/file.ts").is_none());
1300    }
1301
1302    #[test]
1303    fn numstat_only_deleted_is_binary() {
1304        assert!(parse_numstat_line("10\t-\tsrc/file.ts").is_none());
1305    }
1306
1307    #[test]
1308    fn numstat_path_with_spaces() {
1309        let (a, d, p) = parse_numstat_line("3\t1\tpath with spaces/file.ts").unwrap();
1310        assert_eq!(a, 3);
1311        assert_eq!(d, 1);
1312        assert_eq!(p, "path with spaces/file.ts");
1313    }
1314
1315    #[test]
1316    fn numstat_large_numbers() {
1317        let (a, d, p) = parse_numstat_line("9999\t8888\tsrc/big.ts").unwrap();
1318        assert_eq!(a, 9999);
1319        assert_eq!(d, 8888);
1320        assert_eq!(p, "src/big.ts");
1321    }
1322
1323    #[test]
1324    fn iso_date_wrong_separator_positions() {
1325        assert!(!is_iso_date("20-25-0601"));
1326        assert!(!is_iso_date("202506-01-"));
1327    }
1328
1329    #[test]
1330    fn iso_date_too_short() {
1331        assert!(!is_iso_date("2025-06-0"));
1332    }
1333
1334    #[test]
1335    fn iso_date_letters_in_day() {
1336        assert!(!is_iso_date("2025-06-ab"));
1337    }
1338
1339    #[test]
1340    fn iso_date_letters_in_month() {
1341        assert!(!is_iso_date("2025-ab-01"));
1342    }
1343
1344    #[test]
1345    fn split_number_unit_valid() {
1346        let (num, unit) = split_number_unit("42days").unwrap();
1347        assert_eq!(num, "42");
1348        assert_eq!(unit, "days");
1349    }
1350
1351    #[test]
1352    fn split_number_unit_single_digit() {
1353        let (num, unit) = split_number_unit("1m").unwrap();
1354        assert_eq!(num, "1");
1355        assert_eq!(unit, "m");
1356    }
1357
1358    #[test]
1359    fn split_number_unit_no_digits() {
1360        let err = split_number_unit("abc").unwrap_err();
1361        assert!(err.contains("must start with a number"));
1362    }
1363
1364    #[test]
1365    fn split_number_unit_no_unit() {
1366        let err = split_number_unit("123").unwrap_err();
1367        assert!(err.contains("requires a unit suffix"));
1368    }
1369
1370    #[test]
1371    fn parse_git_log_numstat_before_timestamp_uses_now() {
1372        let root = Path::new("/project");
1373        let output = "10\t5\tsrc/no_ts.ts\n";
1374        let (result, _) = parse_git_log(output, root);
1375        assert_eq!(result.len(), 1);
1376        let churn = &result[&PathBuf::from("/project/src/no_ts.ts")];
1377        assert_eq!(churn.commits, 1);
1378        assert_eq!(churn.lines_added, 10);
1379        assert_eq!(churn.lines_deleted, 5);
1380        assert!(
1381            churn.weighted_commits > 0.9,
1382            "weight should be near 1.0 when timestamp defaults to now"
1383        );
1384    }
1385
1386    #[test]
1387    fn parse_git_log_whitespace_lines_ignored() {
1388        let root = Path::new("/project");
1389        let output = "  \n1700000000\n  \n10\t5\tsrc/a.ts\n  \n";
1390        let (result, _) = parse_git_log(output, root);
1391        assert_eq!(result.len(), 1);
1392    }
1393
1394    #[test]
1395    fn parse_git_log_trend_is_computed_per_file() {
1396        let root = Path::new("/project");
1397        let output = "\
13981000\n5\t1\tsrc/old.ts\n\
13992000\n3\t1\tsrc/old.ts\n\
14001000\n1\t0\tsrc/hot.ts\n\
14011800\n1\t0\tsrc/hot.ts\n\
14021900\n1\t0\tsrc/hot.ts\n\
14031950\n1\t0\tsrc/hot.ts\n\
14042000\n1\t0\tsrc/hot.ts\n";
1405        let (result, _) = parse_git_log(output, root);
1406        let old = &result[&PathBuf::from("/project/src/old.ts")];
1407        let hot = &result[&PathBuf::from("/project/src/hot.ts")];
1408        assert_eq!(old.commits, 2);
1409        assert_eq!(hot.commits, 5);
1410        assert_eq!(hot.trend, ChurnTrend::Accelerating);
1411    }
1412
1413    #[test]
1414    fn parse_git_log_weighted_decay_for_old_commits() {
1415        let root = Path::new("/project");
1416        let now = std::time::SystemTime::now()
1417            .duration_since(std::time::UNIX_EPOCH)
1418            .unwrap()
1419            .as_secs();
1420        let old_ts = now - (180 * 86_400);
1421        let output = format!("{old_ts}\n10\t5\tsrc/old.ts\n");
1422        let (result, _) = parse_git_log(&output, root);
1423        let churn = &result[&PathBuf::from("/project/src/old.ts")];
1424        assert!(
1425            churn.weighted_commits < 0.5,
1426            "180-day-old commit should weigh ~0.25, got {}",
1427            churn.weighted_commits
1428        );
1429        assert!(
1430            churn.weighted_commits > 0.1,
1431            "180-day-old commit should weigh ~0.25, got {}",
1432            churn.weighted_commits
1433        );
1434    }
1435
1436    #[test]
1437    fn parse_git_log_path_stored_as_absolute() {
1438        let root = Path::new("/my/project");
1439        let output = "1700000000\n1\t0\tlib/utils.ts\n";
1440        let (result, _) = parse_git_log(output, root);
1441        let key = PathBuf::from("/my/project/lib/utils.ts");
1442        assert!(result.contains_key(&key));
1443        assert_eq!(result[&key].path, key);
1444    }
1445
1446    #[test]
1447    fn parse_git_log_weighted_commits_rounded() {
1448        let root = Path::new("/project");
1449        let now = std::time::SystemTime::now()
1450            .duration_since(std::time::UNIX_EPOCH)
1451            .unwrap()
1452            .as_secs();
1453        let output = format!("{now}\n1\t0\tsrc/a.ts\n");
1454        let (result, _) = parse_git_log(&output, root);
1455        let churn = &result[&PathBuf::from("/project/src/a.ts")];
1456        let decimals = format!("{:.2}", churn.weighted_commits);
1457        assert_eq!(
1458            churn.weighted_commits.to_string().len(),
1459            decimals.len().min(churn.weighted_commits.to_string().len()),
1460            "weighted_commits should be rounded to at most 2 decimal places"
1461        );
1462    }
1463
1464    #[test]
1465    fn trend_serde_serialization() {
1466        assert_eq!(
1467            serde_json::to_string(&ChurnTrend::Accelerating).unwrap(),
1468            "\"accelerating\""
1469        );
1470        assert_eq!(
1471            serde_json::to_string(&ChurnTrend::Stable).unwrap(),
1472            "\"stable\""
1473        );
1474        assert_eq!(
1475            serde_json::to_string(&ChurnTrend::Cooling).unwrap(),
1476            "\"cooling\""
1477        );
1478    }
1479
1480    #[test]
1481    fn parse_git_log_extracts_author_email() {
1482        let root = Path::new("/project");
1483        let output = "1700000000|alice@example.com\n10\t5\tsrc/index.ts\n";
1484        let (result, pool) = parse_git_log(output, root);
1485        assert_eq!(pool, vec!["alice@example.com".to_string()]);
1486        let churn = &result[&PathBuf::from("/project/src/index.ts")];
1487        assert_eq!(churn.authors.len(), 1);
1488        let alice = &churn.authors[&0];
1489        assert_eq!(alice.commits, 1);
1490        assert_eq!(alice.first_commit_ts, 1_700_000_000);
1491        assert_eq!(alice.last_commit_ts, 1_700_000_000);
1492    }
1493
1494    #[test]
1495    fn parse_git_log_intern_dedupes_authors() {
1496        let root = Path::new("/project");
1497        let output = "\
14981700000000|alice@example.com
14991\t0\ta.ts
15001700100000|bob@example.com
15012\t1\tb.ts
15021700200000|alice@example.com
15033\t2\tc.ts
1504";
1505        let (_result, pool) = parse_git_log(output, root);
1506        assert_eq!(pool.len(), 2);
1507        assert!(pool.contains(&"alice@example.com".to_string()));
1508        assert!(pool.contains(&"bob@example.com".to_string()));
1509    }
1510
1511    #[test]
1512    fn parse_git_log_aggregates_per_author() {
1513        let root = Path::new("/project");
1514        let output = "\
15151700000000|alice@example.com
15161\t0\tsrc/index.ts
15171700100000|bob@example.com
15182\t0\tsrc/index.ts
15191700200000|alice@example.com
15201\t1\tsrc/index.ts
1521";
1522        let (result, pool) = parse_git_log(output, root);
1523        let churn = &result[&PathBuf::from("/project/src/index.ts")];
1524        assert_eq!(churn.commits, 3);
1525        assert_eq!(churn.authors.len(), 2);
1526
1527        let alice_idx =
1528            u32::try_from(pool.iter().position(|a| a == "alice@example.com").unwrap()).unwrap();
1529        let alice = &churn.authors[&alice_idx];
1530        assert_eq!(alice.commits, 2);
1531        assert_eq!(alice.first_commit_ts, 1_700_000_000);
1532        assert_eq!(alice.last_commit_ts, 1_700_200_000);
1533    }
1534
1535    #[test]
1536    fn parse_git_log_legacy_bare_timestamp_still_parses() {
1537        let root = Path::new("/project");
1538        let output = "1700000000\n10\t5\tsrc/index.ts\n";
1539        let (result, pool) = parse_git_log(output, root);
1540        assert!(pool.is_empty());
1541        let churn = &result[&PathBuf::from("/project/src/index.ts")];
1542        assert_eq!(churn.commits, 1);
1543        assert!(churn.authors.is_empty());
1544    }
1545
1546    #[test]
1547    fn intern_author_returns_existing_index() {
1548        let mut pool = Vec::new();
1549        let mut index = FxHashMap::default();
1550        let i1 = intern_author("alice@x", &mut pool, &mut index);
1551        let i2 = intern_author("alice@x", &mut pool, &mut index);
1552        assert_eq!(i1, i2);
1553        assert_eq!(pool.len(), 1);
1554    }
1555
1556    #[test]
1557    fn intern_author_assigns_sequential_indices() {
1558        let mut pool = Vec::new();
1559        let mut index = FxHashMap::default();
1560        assert_eq!(intern_author("alice@x", &mut pool, &mut index), 0);
1561        assert_eq!(intern_author("bob@x", &mut pool, &mut index), 1);
1562        assert_eq!(intern_author("carol@x", &mut pool, &mut index), 2);
1563        assert_eq!(intern_author("alice@x", &mut pool, &mut index), 0);
1564    }
1565
1566    fn git(root: &Path, args: &[&str]) {
1567        let status = std::process::Command::new("git")
1568            .args(args)
1569            .current_dir(root)
1570            .status()
1571            .expect("run git");
1572        assert!(status.success(), "git {args:?} failed");
1573    }
1574
1575    fn write(root: &Path, path: &str, contents: &str) {
1576        let path = root.join(path);
1577        std::fs::create_dir_all(path.parent().expect("test path has parent")).unwrap();
1578        std::fs::write(path, contents).unwrap();
1579    }
1580
1581    #[test]
1582    fn cached_churn_merges_new_commits_after_head_advances() {
1583        let repo = tempfile::tempdir().expect("create repo");
1584        let root = repo.path();
1585        git(root, &["init"]);
1586        git(root, &["config", "user.email", "churn@example.test"]);
1587        git(root, &["config", "user.name", "Churn Test"]);
1588        git(root, &["config", "commit.gpgsign", "false"]);
1589
1590        write(root, "src/a.ts", "export const a = 1;\n");
1591        git(root, &["add", "."]);
1592        git(root, &["commit", "-m", "initial"]);
1593
1594        let since = parse_since("1y").unwrap();
1595        let cache = tempfile::tempdir().expect("create cache dir");
1596        let (cold, cold_hit) = analyze_churn_cached(root, &since, cache.path(), false).unwrap();
1597        assert!(!cold_hit);
1598        let file = root.join("src/a.ts");
1599        assert_eq!(cold.files[&file].commits, 1);
1600
1601        let (_warm, warm_hit) = analyze_churn_cached(root, &since, cache.path(), false).unwrap();
1602        assert!(warm_hit);
1603
1604        write(
1605            root,
1606            "src/a.ts",
1607            "export const a = 1;\nexport const b = 2;\n",
1608        );
1609        git(root, &["add", "."]);
1610        git(root, &["commit", "-m", "update a"]);
1611        let head = get_head_sha(root).unwrap();
1612
1613        let (incremental, incremental_hit) =
1614            analyze_churn_cached(root, &since, cache.path(), false).unwrap();
1615        assert!(incremental_hit);
1616        assert_eq!(incremental.files[&file].commits, 2);
1617
1618        let cache = load_churn_cache(cache.path(), &since.git_after).unwrap();
1619        assert_eq!(cache.last_indexed_sha, head);
1620    }
1621
1622    fn write_churn_file(dir: &std::path::Path, contents: &str) -> PathBuf {
1623        let path = dir.join("churn.json");
1624        std::fs::write(&path, contents).unwrap();
1625        path
1626    }
1627
1628    #[test]
1629    fn churn_file_happy_path() {
1630        let dir = tempfile::tempdir().unwrap();
1631        let root = Path::new("/project");
1632        let path = write_churn_file(
1633            dir.path(),
1634            r#"{
1635              "schema": "fallow-churn/v1",
1636              "events": [
1637                { "path": "src/a.ts", "timestamp": 1700000000, "author": "alice@corp", "added": 10, "deleted": 5 },
1638                { "path": "src/a.ts", "timestamp": 1700100000, "author": "bob@corp", "added": 3, "deleted": 2 }
1639              ]
1640            }"#,
1641        );
1642        let result = analyze_churn_from_file(&path, root).unwrap();
1643        let churn = &result.files[&PathBuf::from("/project/src/a.ts")];
1644        assert_eq!(churn.commits, 2);
1645        assert_eq!(churn.lines_added, 13);
1646        assert_eq!(churn.lines_deleted, 7);
1647        assert_eq!(churn.authors.len(), 2);
1648        assert!(result.author_pool.contains(&"alice@corp".to_string()));
1649        assert!(result.author_pool.contains(&"bob@corp".to_string()));
1650        assert!(!result.shallow_clone);
1651    }
1652
1653    #[test]
1654    fn churn_file_matches_git_parse() {
1655        // The same events fed via git numstat and via the JSON import must
1656        // produce identical aggregate churn: the import reuses
1657        // build_churn_result, so only the SOURCE differs.
1658        let dir = tempfile::tempdir().unwrap();
1659        let root = Path::new("/project");
1660        let git_output = "1700000000|alice@corp\n10\t5\tsrc/a.ts\n3\t1\tsrc/b.ts\n\n1700100000|bob@corp\n3\t2\tsrc/a.ts\n";
1661        let (git_files, git_pool) = parse_git_log(git_output, root);
1662
1663        let path = write_churn_file(
1664            dir.path(),
1665            r#"{
1666              "schema": "fallow-churn/v1",
1667              "events": [
1668                { "path": "src/a.ts", "timestamp": 1700000000, "author": "alice@corp", "added": 10, "deleted": 5 },
1669                { "path": "src/b.ts", "timestamp": 1700000000, "author": "alice@corp", "added": 3, "deleted": 1 },
1670                { "path": "src/a.ts", "timestamp": 1700100000, "author": "bob@corp", "added": 3, "deleted": 2 }
1671              ]
1672            }"#,
1673        );
1674        let imported = analyze_churn_from_file(&path, root).unwrap();
1675
1676        assert_eq!(git_pool, imported.author_pool, "author pools diverge");
1677        assert_eq!(git_files.len(), imported.files.len());
1678        for (file, git_churn) in &git_files {
1679            let imp = &imported.files[file];
1680            assert_eq!(git_churn.commits, imp.commits, "commits for {file:?}");
1681            assert_eq!(git_churn.lines_added, imp.lines_added, "added for {file:?}");
1682            assert_eq!(
1683                git_churn.lines_deleted, imp.lines_deleted,
1684                "deleted for {file:?}"
1685            );
1686            assert_eq!(git_churn.trend, imp.trend, "trend for {file:?}");
1687            assert_eq!(
1688                git_churn.authors.len(),
1689                imp.authors.len(),
1690                "authors for {file:?}"
1691            );
1692            assert!(
1693                (git_churn.weighted_commits - imp.weighted_commits).abs() < 0.02,
1694                "weighted_commits for {file:?}: {} vs {}",
1695                git_churn.weighted_commits,
1696                imp.weighted_commits
1697            );
1698        }
1699    }
1700
1701    #[test]
1702    fn churn_file_empty_events_is_valid() {
1703        let dir = tempfile::tempdir().unwrap();
1704        let path = write_churn_file(
1705            dir.path(),
1706            r#"{ "schema": "fallow-churn/v1", "events": [] }"#,
1707        );
1708        let result = analyze_churn_from_file(&path, Path::new("/project")).unwrap();
1709        assert!(result.files.is_empty());
1710        assert!(result.author_pool.is_empty());
1711    }
1712
1713    #[test]
1714    fn churn_file_missing_events_key_is_valid() {
1715        let dir = tempfile::tempdir().unwrap();
1716        let path = write_churn_file(dir.path(), r#"{ "schema": "fallow-churn/v1" }"#);
1717        let result = analyze_churn_from_file(&path, Path::new("/project")).unwrap();
1718        assert!(result.files.is_empty());
1719    }
1720
1721    #[test]
1722    fn churn_file_bad_schema_rejected() {
1723        let dir = tempfile::tempdir().unwrap();
1724        let path = write_churn_file(
1725            dir.path(),
1726            r#"{ "schema": "fallow-churn/v2", "events": [] }"#,
1727        );
1728        let err = analyze_churn_from_file(&path, Path::new("/project")).unwrap_err();
1729        assert!(err.contains("expected \"fallow-churn/v1\""), "{err}");
1730    }
1731
1732    #[test]
1733    fn churn_file_malformed_json_rejected() {
1734        let dir = tempfile::tempdir().unwrap();
1735        let path = write_churn_file(dir.path(), "{ not json");
1736        assert!(analyze_churn_from_file(&path, Path::new("/project")).is_err());
1737    }
1738
1739    #[test]
1740    fn churn_file_missing_file_rejected() {
1741        let err = analyze_churn_from_file(Path::new("/no/such/churn.json"), Path::new("/project"))
1742            .unwrap_err();
1743        assert!(err.contains("failed to read churn file"), "{err}");
1744    }
1745
1746    #[test]
1747    fn churn_file_empty_path_rejected() {
1748        let dir = tempfile::tempdir().unwrap();
1749        let path = write_churn_file(
1750            dir.path(),
1751            r#"{ "schema": "fallow-churn/v1", "events": [ { "path": "  ", "timestamp": 1700000000, "added": 1, "deleted": 0 } ] }"#,
1752        );
1753        let err = analyze_churn_from_file(&path, Path::new("/project")).unwrap_err();
1754        assert!(err.contains("empty path"), "{err}");
1755    }
1756
1757    #[test]
1758    fn churn_file_millisecond_timestamp_rejected() {
1759        let dir = tempfile::tempdir().unwrap();
1760        // 1700000000000 is milliseconds; ~52000 years in the future as seconds.
1761        let path = write_churn_file(
1762            dir.path(),
1763            r#"{ "schema": "fallow-churn/v1", "events": [ { "path": "src/a.ts", "timestamp": 1700000000000, "added": 1, "deleted": 0 } ] }"#,
1764        );
1765        let err = analyze_churn_from_file(&path, Path::new("/project")).unwrap_err();
1766        assert!(err.contains("milliseconds"), "{err}");
1767    }
1768
1769    #[test]
1770    fn churn_file_missing_author_contributes_no_signal() {
1771        let dir = tempfile::tempdir().unwrap();
1772        let path = write_churn_file(
1773            dir.path(),
1774            r#"{ "schema": "fallow-churn/v1", "events": [ { "path": "src/a.ts", "timestamp": 1700000000, "added": 1, "deleted": 0 } ] }"#,
1775        );
1776        let result = analyze_churn_from_file(&path, Path::new("/project")).unwrap();
1777        let churn = &result.files[&PathBuf::from("/project/src/a.ts")];
1778        assert_eq!(churn.commits, 1);
1779        assert!(churn.authors.is_empty());
1780        assert!(result.author_pool.is_empty());
1781    }
1782
1783    #[test]
1784    fn churn_file_empty_author_string_treated_as_absent() {
1785        let dir = tempfile::tempdir().unwrap();
1786        let path = write_churn_file(
1787            dir.path(),
1788            r#"{ "schema": "fallow-churn/v1", "events": [ { "path": "src/a.ts", "timestamp": 1700000000, "author": "  ", "added": 1, "deleted": 0 } ] }"#,
1789        );
1790        let result = analyze_churn_from_file(&path, Path::new("/project")).unwrap();
1791        assert!(result.author_pool.is_empty());
1792    }
1793
1794    #[test]
1795    fn churn_file_unknown_fields_ignored() {
1796        // Extra keys (including the reserved `commit`) are accepted and ignored,
1797        // so a wrapper carrying extra metadata stays forward-compatible.
1798        let dir = tempfile::tempdir().unwrap();
1799        let path = write_churn_file(
1800            dir.path(),
1801            r#"{ "schema": "fallow-churn/v1", "extra": true, "events": [ { "path": "src/a.ts", "timestamp": 1700000000, "author": "alice@corp", "added": 1, "deleted": 0, "commit": "abc123", "tz": "+0200" } ] }"#,
1802        );
1803        let result = analyze_churn_from_file(&path, Path::new("/project")).unwrap();
1804        assert_eq!(result.files[&PathBuf::from("/project/src/a.ts")].commits, 1);
1805    }
1806
1807    #[test]
1808    fn churn_file_backslash_paths_normalized() {
1809        let dir = tempfile::tempdir().unwrap();
1810        let path = write_churn_file(
1811            dir.path(),
1812            r#"{ "schema": "fallow-churn/v1", "events": [ { "path": "src\\a.ts", "timestamp": 1700000000, "added": 1, "deleted": 0 } ] }"#,
1813        );
1814        let result = analyze_churn_from_file(&path, Path::new("/project")).unwrap();
1815        assert!(
1816            result
1817                .files
1818                .contains_key(&PathBuf::from("/project/src/a.ts"))
1819        );
1820    }
1821}