Skip to main content

rag_rat_core/query/
clusters.rs

1use std::collections::{BTreeMap, BTreeSet, HashMap};
2
3use rusqlite::Connection;
4use serde::Serialize;
5
6use crate::query::repo_brief::{
7    self, FileBriefRow, RepoBriefMemoryCounts, enrich_memory_counts, enrich_symbol_kinds,
8};
9
10const MAX_PATH_BUCKET_FILES: usize = 120;
11const MAX_COTOUCH_COMMIT_FILES: usize = 40;
12const MAX_CLUSTER_PATHS: usize = 8;
13const MIN_EDGE_SCORE: f64 = 0.34;
14
15#[derive(Debug, Clone, Copy)]
16pub struct RepoClustersOptions {
17    pub limit: u32,
18    pub include_generated: bool,
19    pub include_memories: bool,
20    pub min_cluster_size: u32,
21}
22
23impl Default for RepoClustersOptions {
24    fn default() -> Self {
25        Self { limit: 10, include_generated: false, include_memories: true, min_cluster_size: 2 }
26    }
27}
28
29#[derive(Debug, Serialize)]
30pub struct RepoClustersReport {
31    pub summary: RepoClustersSummary,
32    pub clusters: Vec<RepoCluster>,
33    pub warnings: Vec<String>,
34}
35
36#[derive(Debug, Serialize)]
37pub struct RepoClustersSummary {
38    pub total_files: u64,
39    pub clustered_files: u64,
40    pub singleton_files: u64,
41    pub returned_clusters: u64,
42    pub generated_files: u64,
43    pub git_commits: u64,
44    pub git_file_changes: u64,
45    pub graph_edges: u64,
46    pub repo_memories: RepoBriefMemoryCounts,
47    pub scoring_note: String,
48}
49
50#[derive(Debug, Serialize)]
51pub struct RepoCluster {
52    pub cluster_id: String,
53    pub name: String,
54    pub category: String,
55    pub confidence: String,
56    pub score: f64,
57    pub metrics: RepoClusterMetrics,
58    pub representative_paths: Vec<String>,
59    pub why: Vec<String>,
60    pub next_tools: Vec<RepoClusterNextTool>,
61}
62
63#[derive(Debug, Default, Clone, Serialize)]
64pub struct RepoClusterMetrics {
65    pub file_count: u64,
66    pub total_lines: u64,
67    pub total_chunks: u64,
68    pub total_symbols: u64,
69    pub fan_in: u64,
70    pub fan_out: u64,
71    pub commit_touch_count: u64,
72    pub recent_touch_count: u64,
73    pub additions: u64,
74    pub deletions: u64,
75    pub co_touch_edges: u64,
76    pub graph_edges: u64,
77    pub languages: BTreeMap<String, u64>,
78    pub kinds: BTreeMap<String, u64>,
79    pub memories: RepoBriefMemoryCounts,
80}
81
82#[derive(Debug, Serialize)]
83pub struct RepoClusterNextTool {
84    pub tool: String,
85    pub reason: String,
86    pub args: BTreeMap<String, serde_json::Value>,
87}
88
89#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
90struct PathPair(usize, usize);
91
92impl PathPair {
93    fn new(left: usize, right: usize) -> Self {
94        if left <= right { Self(left, right) } else { Self(right, left) }
95    }
96}
97
98#[derive(Debug, Default, Clone)]
99struct PairEvidence {
100    co_touch_count: u64,
101    graph_edge_count: u64,
102    path_candidate: bool,
103}
104
105#[derive(Debug, Clone)]
106struct ScoredPair {
107    pair: PathPair,
108    score: f64,
109}
110
111#[derive(Debug, Clone)]
112struct ClusterBuild {
113    rows: Vec<FileBriefRow>,
114    avg_edge_score: f64,
115    co_touch_edges: u64,
116    graph_edges: u64,
117}
118
119pub fn repo_clusters(
120    conn: &Connection,
121    options: RepoClustersOptions,
122) -> anyhow::Result<RepoClustersReport> {
123    let summary = repo_brief::summary_counts(conn)?;
124    let mut warnings = Vec::new();
125    if summary.git_commits == 0 {
126        warnings.push("git history is not indexed; co-touch clustering is unavailable".to_string());
127    }
128    if summary.graph_edges == 0 {
129        warnings.push(
130            "graph edges are not indexed; graph-neighborhood clustering is unavailable".to_string(),
131        );
132    }
133
134    let mut rows = repo_brief::file_rows(conn, options.include_generated)?;
135    if options.include_memories {
136        enrich_memory_counts(conn, &mut rows)?;
137    }
138    enrich_symbol_kinds(conn, &mut rows)?;
139
140    let co_touch_pairs = co_touch_pairs(conn, options.include_generated, &rows)?;
141    let graph_pairs = graph_pairs(conn, options.include_generated, &rows)?;
142    let clusters = cluster_rows(&rows, &co_touch_pairs, &graph_pairs, options.min_cluster_size);
143    let clustered_files = clusters.iter().map(|cluster| cluster.rows.len() as u64).sum::<u64>();
144    let mut clusters = clusters.into_iter().map(cluster_for).collect::<Vec<_>>();
145    clusters.sort_by(|a, b| {
146        b.score
147            .total_cmp(&a.score)
148            .then_with(|| b.metrics.file_count.cmp(&a.metrics.file_count))
149            .then_with(|| a.name.cmp(&b.name))
150    });
151    for (index, cluster) in clusters.iter_mut().enumerate() {
152        cluster.cluster_id = format!("cluster_{:03}", index + 1);
153    }
154
155    let limit = usize::try_from(options.limit.max(1)).unwrap_or(usize::MAX);
156    clusters.truncate(limit);
157    let singleton_files = rows.len() as u64 - clustered_files;
158
159    Ok(RepoClustersReport {
160        summary: RepoClustersSummary {
161            total_files: summary.total_files,
162            clustered_files,
163            singleton_files,
164            returned_clusters: clusters.len() as u64,
165            generated_files: summary.generated_files,
166            git_commits: summary.git_commits,
167            git_file_changes: summary.git_file_changes,
168            graph_edges: summary.graph_edges,
169            repo_memories: summary.memories,
170            scoring_note:
171                "cheap sparse ownership clustering: path proximity, direct graph edges, bounded git co-touches, churn, and metadata"
172                    .to_string(),
173        },
174        clusters,
175        warnings,
176    })
177}
178
179fn cluster_rows(
180    rows: &[FileBriefRow],
181    co_touch_pairs: &BTreeMap<PathPair, u64>,
182    graph_pairs: &BTreeMap<PathPair, u64>,
183    min_cluster_size: u32,
184) -> Vec<ClusterBuild> {
185    if rows.is_empty() {
186        return Vec::new();
187    }
188
189    let candidates = candidate_pairs(rows, co_touch_pairs, graph_pairs);
190    let mut parent = UnionFind::new(rows.len());
191    let mut accepted_pairs = Vec::new();
192    for pair in candidates {
193        if pair.score >= MIN_EDGE_SCORE {
194            parent.union(pair.pair.0, pair.pair.1);
195            accepted_pairs.push(pair);
196        }
197    }
198
199    let mut groups = BTreeMap::<usize, Vec<usize>>::new();
200    for index in 0..rows.len() {
201        groups.entry(parent.find(index)).or_default().push(index);
202    }
203
204    let min_cluster_size = usize::try_from(min_cluster_size.max(2)).unwrap_or(usize::MAX);
205    let mut clusters = Vec::new();
206    for indexes in groups.values().filter(|indexes| indexes.len() >= min_cluster_size) {
207        let index_set = indexes.iter().copied().collect::<BTreeSet<_>>();
208        let mut edge_score_sum = 0.0;
209        let mut edge_count = 0_u64;
210        let mut co_touch_edges = 0_u64;
211        let mut graph_edges = 0_u64;
212        for pair in &accepted_pairs {
213            if index_set.contains(&pair.pair.0) && index_set.contains(&pair.pair.1) {
214                edge_score_sum += pair.score;
215                edge_count += 1;
216                co_touch_edges += co_touch_pairs.get(&pair.pair).copied().unwrap_or(0);
217                graph_edges += graph_pairs.get(&pair.pair).copied().unwrap_or(0);
218            }
219        }
220        let avg_edge_score = if edge_count == 0 { 0.0 } else { edge_score_sum / edge_count as f64 };
221        clusters.push(ClusterBuild {
222            rows: indexes.iter().map(|index| rows[*index].clone()).collect(),
223            avg_edge_score,
224            co_touch_edges,
225            graph_edges,
226        });
227    }
228    clusters
229}
230
231fn candidate_pairs(
232    rows: &[FileBriefRow],
233    co_touch_pairs: &BTreeMap<PathPair, u64>,
234    graph_pairs: &BTreeMap<PathPair, u64>,
235) -> Vec<ScoredPair> {
236    let mut evidence = BTreeMap::<PathPair, PairEvidence>::new();
237    for (pair, count) in co_touch_pairs {
238        evidence.entry(*pair).or_default().co_touch_count = *count;
239    }
240    for (pair, count) in graph_pairs {
241        evidence.entry(*pair).or_default().graph_edge_count = *count;
242    }
243    for pair in path_candidate_pairs(rows) {
244        evidence.entry(pair).or_default().path_candidate = true;
245    }
246
247    evidence
248        .into_iter()
249        .filter_map(|(pair, evidence)| {
250            let score = pair_score(&rows[pair.0], &rows[pair.1], &evidence);
251            (score > 0.0).then_some(ScoredPair { pair, score })
252        })
253        .collect()
254}
255
256fn pair_score(left: &FileBriefRow, right: &FileBriefRow, evidence: &PairEvidence) -> f64 {
257    let path = path_similarity(&left.path, &right.path);
258    let same_bucket = path_bucket(&left.path) == path_bucket(&right.path);
259    if !same_bucket {
260        return 0.0;
261    }
262    let co_touch = capped(evidence.co_touch_count as f64 / 4.0);
263    let graph = capped(evidence.graph_edge_count as f64 / 6.0);
264    let same_language = if left.language == right.language { 1.0 } else { 0.0 };
265    let same_kind = if left.kind == right.kind { 1.0 } else { 0.0 };
266    let churn = churn_similarity(left, right);
267    let path_weight = if evidence.path_candidate || same_bucket { 0.43 } else { 0.25 };
268    capped(
269        path * path_weight
270            + co_touch * 0.24
271            + graph * 0.22
272            + same_language * 0.06
273            + same_kind * 0.03
274            + churn * 0.02,
275    )
276}
277
278fn co_touch_pairs(
279    conn: &Connection,
280    include_generated: bool,
281    rows: &[FileBriefRow],
282) -> anyhow::Result<BTreeMap<PathPair, u64>> {
283    let path_index = path_index(rows);
284    let mut stmt = conn.prepare(
285        "
286        SELECT git_file_changes.commit_hash, git_file_changes.path
287        FROM git_file_changes
288        JOIN files ON files.path = git_file_changes.path
289        WHERE (?1 OR files.generated = 0)
290        ORDER BY git_file_changes.commit_hash, git_file_changes.path
291        ",
292    )?;
293    let mut query_rows = stmt.query([include_generated])?;
294    let mut pairs = BTreeMap::<PathPair, u64>::new();
295    let mut current_commit = String::new();
296    let mut paths = Vec::new();
297    while let Some(row) = query_rows.next()? {
298        let commit = row.get::<_, String>(0)?;
299        let path = row.get::<_, String>(1)?;
300        if current_commit.is_empty() {
301            current_commit = commit.clone();
302        }
303        if commit != current_commit {
304            add_co_touch_pairs(&path_index, &paths, &mut pairs);
305            paths.clear();
306            current_commit = commit;
307        }
308        paths.push(path);
309    }
310    add_co_touch_pairs(&path_index, &paths, &mut pairs);
311    Ok(pairs)
312}
313
314fn add_co_touch_pairs(
315    path_index: &HashMap<&str, usize>,
316    paths: &[String],
317    pairs: &mut BTreeMap<PathPair, u64>,
318) {
319    if paths.len() < 2 || paths.len() > MAX_COTOUCH_COMMIT_FILES {
320        return;
321    }
322    let mut indexes =
323        paths.iter().filter_map(|path| path_index.get(path.as_str()).copied()).collect::<Vec<_>>();
324    indexes.sort_unstable();
325    indexes.dedup();
326    for left in 0..indexes.len() {
327        for right in (left + 1)..indexes.len() {
328            *pairs.entry(PathPair::new(indexes[left], indexes[right])).or_default() += 1;
329        }
330    }
331}
332
333fn graph_pairs(
334    conn: &Connection,
335    include_generated: bool,
336    rows: &[FileBriefRow],
337) -> anyhow::Result<BTreeMap<PathPair, u64>> {
338    let path_index = path_index(rows);
339    let bucket_by_path = rows
340        .iter()
341        .map(|row| (row.path.as_str(), path_bucket(&row.path)))
342        .collect::<HashMap<_, _>>();
343    let mut stmt = conn.prepare(
344        "
345        SELECT source_files.path, target_files.path
346        FROM edges
347        JOIN files source_files ON source_files.id = edges.source_file_id
348        JOIN symbols target_symbols ON target_symbols.id = edges.to_symbol_id
349        JOIN files target_files ON target_files.id = target_symbols.file_id
350        WHERE source_files.path != target_files.path
351          AND (?1 OR (source_files.generated = 0 AND target_files.generated = 0))
352        ",
353    )?;
354    let mut pairs = BTreeMap::<PathPair, u64>::new();
355    let rows = stmt.query_map([include_generated], |row| {
356        Ok((row.get::<_, String>(0)?, row.get::<_, String>(1)?))
357    })?;
358    for row in rows {
359        let (left, right) = row?;
360        let (Some(left_bucket), Some(right_bucket)) =
361            (bucket_by_path.get(left.as_str()), bucket_by_path.get(right.as_str()))
362        else {
363            continue;
364        };
365        if left_bucket != right_bucket {
366            continue;
367        }
368        let (Some(left), Some(right)) =
369            (path_index.get(left.as_str()), path_index.get(right.as_str()))
370        else {
371            continue;
372        };
373        *pairs.entry(PathPair::new(*left, *right)).or_default() += 1;
374    }
375    Ok(pairs)
376}
377
378fn path_candidate_pairs(rows: &[FileBriefRow]) -> Vec<PathPair> {
379    let buckets = indexed_path_buckets(rows);
380    let mut pairs = Vec::new();
381    for indexes in buckets.values() {
382        if indexes.len() < 2 || indexes.len() > MAX_PATH_BUCKET_FILES {
383            continue;
384        }
385        for left in 0..indexes.len() {
386            for right in (left + 1)..indexes.len() {
387                pairs.push(PathPair::new(indexes[left], indexes[right]));
388            }
389        }
390    }
391    pairs
392}
393
394fn cluster_for(cluster: ClusterBuild) -> RepoCluster {
395    let mut metrics = RepoClusterMetrics::default();
396    let mut rows = cluster.rows;
397    rows.sort_by(|a, b| {
398        file_representative_score(b)
399            .total_cmp(&file_representative_score(a))
400            .then_with(|| a.path.cmp(&b.path))
401    });
402
403    for row in &rows {
404        metrics.file_count += 1;
405        metrics.total_lines += row.line_count;
406        metrics.total_chunks += row.chunk_count;
407        metrics.total_symbols += row.symbol_count;
408        metrics.fan_in += row.fan_in;
409        metrics.fan_out += row.fan_out;
410        metrics.commit_touch_count += row.commit_touch_count;
411        metrics.recent_touch_count += row.recent_touch_count;
412        metrics.additions += row.additions;
413        metrics.deletions += row.deletions;
414        *metrics.languages.entry(row.language.clone()).or_default() += 1;
415        *metrics.kinds.entry(row.kind.clone()).or_default() += 1;
416        metrics.memories.active += row.memories.active;
417        metrics.memories.stale += row.memories.stale;
418        metrics.memories.obsolete += row.memories.obsolete;
419        metrics.memories.other += row.memories.other;
420    }
421    metrics.co_touch_edges = cluster.co_touch_edges;
422    metrics.graph_edges = cluster.graph_edges;
423
424    let representative_paths =
425        rows.iter().take(MAX_CLUSTER_PATHS).map(|row| row.path.clone()).collect::<Vec<_>>();
426    let name = cluster_name(&rows);
427    let score = cluster_score(&metrics, cluster.avg_edge_score);
428    let category = cluster_category(&metrics).to_string();
429    let confidence = confidence_for(&metrics, cluster.avg_edge_score).to_string();
430    let why = why_for(&metrics, cluster.avg_edge_score, &name);
431    let next_tools = next_tools_for(representative_paths.first());
432
433    RepoCluster {
434        cluster_id: String::new(),
435        name,
436        category,
437        confidence,
438        score,
439        metrics,
440        representative_paths,
441        why,
442        next_tools,
443    }
444}
445
446fn file_representative_score(row: &FileBriefRow) -> f64 {
447    capped((row.fan_in + row.fan_out) as f64 / 50.0) * 0.40
448        + capped(row.symbol_count as f64 / 40.0) * 0.25
449        + capped(row.commit_touch_count as f64 / 25.0) * 0.25
450        + capped(row.line_count as f64 / 800.0) * 0.10
451}
452
453fn cluster_score(metrics: &RepoClusterMetrics, avg_edge_score: f64) -> f64 {
454    super::round_score(capped(
455        avg_edge_score * 0.35
456            + capped(metrics.file_count as f64 / 12.0) * 0.12
457            + capped((metrics.fan_in + metrics.fan_out) as f64 / 100.0) * 0.18
458            + capped(metrics.commit_touch_count as f64 / 60.0) * 0.15
459            + capped(metrics.total_symbols as f64 / 120.0) * 0.12
460            + capped((metrics.co_touch_edges + metrics.graph_edges) as f64 / 20.0) * 0.08,
461    ))
462}
463
464fn cluster_category(metrics: &RepoClusterMetrics) -> &'static str {
465    if metrics.co_touch_edges > 0 && metrics.graph_edges > 0 {
466        "graph_and_churn_ownership"
467    } else if metrics.co_touch_edges > 0 {
468        "co_touch_ownership"
469    } else if metrics.graph_edges > 0 {
470        "graph_neighborhood"
471    } else {
472        "path_proximity"
473    }
474}
475
476fn confidence_for(metrics: &RepoClusterMetrics, avg_edge_score: f64) -> &'static str {
477    if (metrics.co_touch_edges > 0 && metrics.graph_edges > 0) || avg_edge_score >= 0.55 {
478        "high"
479    } else if metrics.co_touch_edges > 0 || metrics.graph_edges > 0 || avg_edge_score >= 0.42 {
480        "medium"
481    } else {
482        "low"
483    }
484}
485
486fn why_for(metrics: &RepoClusterMetrics, avg_edge_score: f64, name: &str) -> Vec<String> {
487    let mut why = vec![format!("shared ownership bucket: {name}")];
488    if metrics.graph_edges > 0 {
489        why.push(format!(
490            "graph neighborhood: {} direct indexed cross-file edges",
491            metrics.graph_edges
492        ));
493    }
494    if metrics.co_touch_edges > 0 {
495        why.push(format!(
496            "co-touch history: {} bounded same-commit file-pair hits",
497            metrics.co_touch_edges
498        ));
499    }
500    if metrics.commit_touch_count > 0 {
501        why.push(format!(
502            "churn: {} total commit touches, {} recent",
503            metrics.commit_touch_count, metrics.recent_touch_count
504        ));
505    }
506    if metrics.total_symbols > 0 {
507        why.push(format!(
508            "source shape: {} files, {} symbols, {} lines",
509            metrics.file_count, metrics.total_symbols, metrics.total_lines
510        ));
511    }
512    why.push(format!("average accepted edge score: {:.3}", avg_edge_score));
513    why
514}
515
516fn next_tools_for(path: Option<&String>) -> Vec<RepoClusterNextTool> {
517    let Some(path) = path else {
518        return Vec::new();
519    };
520    vec![
521        next_tool(
522            "repo_brief",
523            "compare cluster representatives against spine/churn/god-module rankings",
524            [("mode", "refactor_candidates".to_string())],
525        ),
526        next_tool(
527            "impact_surface",
528            "inspect graph, git, papertrail, and memories for the representative path",
529            [("query", path.clone())],
530        ),
531        next_tool(
532            "git_history_for_path",
533            "inspect co-touch and churn history",
534            [("path", path.clone())],
535        ),
536    ]
537}
538
539fn next_tool<const N: usize>(
540    tool: &str,
541    reason: &str,
542    args: [(&str, String); N],
543) -> RepoClusterNextTool {
544    RepoClusterNextTool {
545        tool: tool.to_string(),
546        reason: reason.to_string(),
547        args: args
548            .into_iter()
549            .map(|(key, value)| (key.to_string(), serde_json::Value::String(value)))
550            .collect(),
551    }
552}
553
554fn cluster_name(rows: &[FileBriefRow]) -> String {
555    let paths = rows.iter().map(|row| row.path.as_str()).collect::<Vec<_>>();
556    let common = common_parent(&paths);
557    if !common.is_empty() {
558        return common;
559    }
560    let dominant_language = dominant_key(rows.iter().map(|row| row.language.as_str()));
561    format!("{dominant_language} files")
562}
563
564fn common_parent(paths: &[&str]) -> String {
565    let mut iter = paths.iter();
566    let Some(first) = iter.next() else {
567        return String::new();
568    };
569    let mut common = parent_components(first);
570    for path in iter {
571        let components = parent_components(path);
572        let keep =
573            common.iter().zip(components.iter()).take_while(|(left, right)| left == right).count();
574        common.truncate(keep);
575    }
576    common.join("/")
577}
578
579fn dominant_key<'a>(values: impl Iterator<Item = &'a str>) -> String {
580    let mut counts = BTreeMap::<&str, u64>::new();
581    for value in values {
582        *counts.entry(value).or_default() += 1;
583    }
584    counts
585        .into_iter()
586        .max_by(|(left_key, left_count), (right_key, right_count)| {
587            left_count.cmp(right_count).then_with(|| right_key.cmp(left_key))
588        })
589        .map(|(value, _)| value.to_string())
590        .unwrap_or_else(|| "source".to_string())
591}
592
593fn path_similarity(left: &str, right: &str) -> f64 {
594    let left_components = path_components(left);
595    let right_components = path_components(right);
596    let common_prefix = left_components
597        .iter()
598        .zip(right_components.iter())
599        .take_while(|(left, right)| left == right)
600        .count();
601    let prefix = if left_components.is_empty() && right_components.is_empty() {
602        0.0
603    } else {
604        common_prefix as f64 / left_components.len().max(right_components.len()) as f64
605    };
606    let left_tokens = path_tokens(left);
607    let right_tokens = path_tokens(right);
608    let union = left_tokens.union(&right_tokens).count();
609    let token_overlap = if union == 0 {
610        0.0
611    } else {
612        left_tokens.intersection(&right_tokens).count() as f64 / union as f64
613    };
614    prefix * 0.65 + token_overlap * 0.35
615}
616
617fn churn_similarity(left: &FileBriefRow, right: &FileBriefRow) -> f64 {
618    let left_churn = left.additions.saturating_add(left.deletions);
619    let right_churn = right.additions.saturating_add(right.deletions);
620    let max_churn = left_churn.max(right_churn);
621    let churn = if max_churn == 0 {
622        0.0
623    } else {
624        1.0 - (left_churn.abs_diff(right_churn) as f64 / max_churn as f64)
625    };
626    let max_touches = left.commit_touch_count.max(right.commit_touch_count);
627    let touches = if max_touches == 0 {
628        0.0
629    } else {
630        1.0 - (left.commit_touch_count.abs_diff(right.commit_touch_count) as f64
631            / max_touches as f64)
632    };
633    (churn + touches) * 0.5
634}
635
636fn path_bucket(path: &str) -> String {
637    let mut components = parent_components(path);
638    components.truncate(5);
639    match components.as_slice() {
640        [] => ".".to_string(),
641        [one] => one.clone(),
642        _ => components.join("/"),
643    }
644}
645
646fn indexed_path_buckets(rows: &[FileBriefRow]) -> BTreeMap<String, Vec<usize>> {
647    let mut buckets = BTreeMap::<String, Vec<usize>>::new();
648    for (index, row) in rows.iter().enumerate() {
649        buckets.entry(path_bucket(&row.path)).or_default().push(index);
650    }
651    buckets
652}
653
654fn parent_components(path: &str) -> Vec<String> {
655    let mut components = path_components(path);
656    components.pop();
657    components
658}
659
660fn path_components(path: &str) -> Vec<String> {
661    path.split('/').filter(|part| !part.is_empty()).map(str::to_string).collect()
662}
663
664fn path_tokens(path: &str) -> BTreeSet<String> {
665    path.split(|ch: char| !ch.is_ascii_alphanumeric())
666        .filter(|part| part.len() > 1)
667        .map(|part| part.to_ascii_lowercase())
668        .collect()
669}
670
671fn path_index(rows: &[FileBriefRow]) -> HashMap<&str, usize> {
672    rows.iter().enumerate().map(|(index, row)| (row.path.as_str(), index)).collect()
673}
674
675fn capped(value: f64) -> f64 {
676    value.clamp(0.0, 1.0)
677}
678
679#[derive(Debug)]
680struct UnionFind {
681    parent: Vec<usize>,
682    rank: Vec<u8>,
683}
684
685impl UnionFind {
686    fn new(len: usize) -> Self {
687        Self { parent: (0..len).collect(), rank: vec![0; len] }
688    }
689
690    fn find(&mut self, value: usize) -> usize {
691        if self.parent[value] != value {
692            self.parent[value] = self.find(self.parent[value]);
693        }
694        self.parent[value]
695    }
696
697    fn union(&mut self, left: usize, right: usize) {
698        let left_root = self.find(left);
699        let right_root = self.find(right);
700        if left_root == right_root {
701            return;
702        }
703        if self.rank[left_root] < self.rank[right_root] {
704            self.parent[left_root] = right_root;
705        } else if self.rank[left_root] > self.rank[right_root] {
706            self.parent[right_root] = left_root;
707        } else {
708            self.parent[right_root] = left_root;
709            self.rank[left_root] += 1;
710        }
711    }
712}
713
714#[cfg(test)]
715mod tests {
716    use super::*;
717
718    #[test]
719    fn path_similarity_prefers_nearby_files() {
720        let nearby = path_similarity("src/actors/sync/actor.rs", "src/actors/sync/msg.rs");
721        let distant = path_similarity("src/actors/sync/actor.rs", "apps/mobile/src/App.tsx");
722        assert!(nearby > distant, "nearby={nearby} distant={distant}");
723    }
724
725    #[test]
726    fn cluster_rows_uses_cotouch_edges() {
727        let rows =
728            vec![row("src/sync/actor.rs"), row("src/sync/msg.rs"), row("apps/mobile/App.tsx")];
729        let co_touch_pairs = BTreeMap::from([(PathPair::new(0, 1), 3)]);
730        let graph_pairs = BTreeMap::new();
731
732        let clusters = cluster_rows(&rows, &co_touch_pairs, &graph_pairs, 2);
733
734        assert_eq!(clusters.len(), 1);
735        assert_eq!(clusters[0].rows.len(), 2);
736        assert_eq!(clusters[0].co_touch_edges, 3);
737    }
738
739    fn row(path: &str) -> FileBriefRow {
740        FileBriefRow {
741            path: path.to_string(),
742            language: "rust".to_string(),
743            kind: "source".to_string(),
744            generated: false,
745            line_count: 10,
746            chunk_count: 1,
747            symbol_count: 1,
748            fan_in: 0,
749            fan_out: 0,
750            commit_touch_count: 1,
751            recent_touch_count: 0,
752            additions: 10,
753            deletions: 0,
754            github_ref_count: 0,
755            symbol_kinds: BTreeMap::new(),
756            memories: RepoBriefMemoryCounts::default(),
757        }
758    }
759}