//! `dupes_core` — library root (`lib.rs`) for the duplicate-code analysis crate.

1pub mod analyzer;
2pub mod cli;
3pub mod code_unit;
4pub mod config;
5pub mod error;
6pub mod extractor;
7pub mod fingerprint;
8pub mod grouper;
9pub mod ignore;
10pub mod node;
11pub mod output;
12pub mod scanner;
13pub mod similarity;
14
15use std::collections::HashSet;
16use std::path::PathBuf;
17
18use analyzer::LanguageAnalyzer;
19use code_unit::CodeUnit;
20use config::Config;
21use fingerprint::Fingerprint;
22use grouper::{DuplicateGroup, DuplicationStats};
23
/// The result of a full analysis run.
///
/// Produced by [`analyze`] / [`analyze_units`]. All group vectors hold the
/// groups that survived ignore filtering; only `all_fingerprints` is captured
/// before filtering.
pub struct AnalysisResult {
    /// Duplication statistics computed over the post-filtering groups.
    pub stats: DuplicationStats,
    /// Whole-unit groups sharing an identical fingerprint.
    pub exact_groups: Vec<DuplicateGroup>,
    /// Whole-unit groups above the similarity threshold (excluding exact matches).
    pub near_groups: Vec<DuplicateGroup>,
    /// Sub-function exact duplicate groups; empty unless `config.sub_function` is set.
    pub sub_exact_groups: Vec<DuplicateGroup>,
    /// Sub-function near-duplicate groups; empty unless `config.sub_function` is set.
    pub sub_near_groups: Vec<DuplicateGroup>,
    /// Non-fatal problems (unreadable files, parse failures) collected during the run.
    pub warnings: Vec<String>,
    /// All group fingerprints (exact + near) before ignore filtering.
    /// Used by the cleanup command to identify stale ignore entries.
    pub all_fingerprints: HashSet<Fingerprint>,
}
36
37/// Run the full analysis pipeline using a language analyzer.
38///
39/// Reads each file, parses it via the analyzer, optionally filters test code,
40/// then delegates to [`analyze_units`] for grouping, similarity, and stats.
41pub fn analyze(
42    analyzer: &dyn LanguageAnalyzer,
43    files: &[PathBuf],
44    config: &Config,
45) -> error::Result<AnalysisResult> {
46    let analysis_config = config.analysis_config();
47    let mut units = Vec::new();
48    let mut warnings = Vec::new();
49
50    for path in files {
51        let source = match std::fs::read_to_string(path) {
52            Ok(s) => s,
53            Err(e) => {
54                warnings.push(format!("Failed to read {}: {}", path.display(), e));
55                continue;
56            }
57        };
58        match analyzer.parse_file(path, &source, &analysis_config) {
59            Ok(mut file_units) => {
60                if config.exclude_tests {
61                    file_units.retain(|u| !analyzer.is_test_code(u));
62                }
63                units.extend(file_units);
64            }
65            Err(e) => warnings.push(e.to_string()),
66        }
67    }
68
69    analyze_units(&units, warnings, config)
70}
71
72/// Run the analysis pipeline on pre-parsed code units.
73///
74/// The caller is responsible for scanning files and parsing them into `CodeUnit`s.
75/// This function handles grouping, similarity detection, ignore filtering, and stats.
76pub fn analyze_units(
77    units: &[CodeUnit],
78    warnings: Vec<String>,
79    config: &Config,
80) -> error::Result<AnalysisResult> {
81    // 1. Group exact duplicates
82    let exact_groups = grouper::group_exact_duplicates(units);
83
84    // 2. Find near-duplicates
85    let exact_fps: Vec<_> = exact_groups.iter().map(|g| g.fingerprint).collect();
86    let near_groups = grouper::find_near_duplicates(units, config.similarity_threshold, &exact_fps);
87
88    // 3. Sub-function duplicate detection (opt-in)
89    let (sub_exact_groups, sub_near_groups) = if config.sub_function {
90        // Extract sub-units from each code unit
91        let sub_units: Vec<CodeUnit> = units
92            .iter()
93            .flat_map(|unit| {
94                let sub_units = extractor::extract_sub_units(&unit.body, config.min_sub_nodes);
95                sub_units.into_iter().map(|su| CodeUnit {
96                    kind: su.kind,
97                    name: su.description,
98                    file: unit.file.clone(),
99                    line_start: unit.line_start,
100                    line_end: unit.line_end,
101                    signature: node::NormalizedNode::leaf(node::NodeKind::Opaque),
102                    body: su.node.clone(),
103                    fingerprint: fingerprint::Fingerprint::from_node(&su.node),
104                    node_count: su.node_count,
105                    parent_name: Some(unit.name.clone()),
106                    is_test: unit.is_test,
107                })
108            })
109            .collect();
110
111        let sub_exact = grouper::group_exact_duplicates(&sub_units);
112        let sub_exact_fps: Vec<_> = sub_exact.iter().map(|g| g.fingerprint).collect();
113        let sub_near =
114            grouper::find_near_duplicates(&sub_units, config.similarity_threshold, &sub_exact_fps);
115        (sub_exact, sub_near)
116    } else {
117        (Vec::new(), Vec::new())
118    };
119
120    // 4. Collect all fingerprints before filtering (for cleanup staleness check)
121    let all_fingerprints: HashSet<Fingerprint> = exact_groups
122        .iter()
123        .chain(near_groups.iter())
124        .chain(sub_exact_groups.iter())
125        .chain(sub_near_groups.iter())
126        .map(|g| g.fingerprint)
127        .collect();
128
129    // 5. Apply ignore filtering
130    let ignore_file = ignore::load_ignore_file(&config.root);
131    let exact_groups = ignore::filter_ignored(exact_groups, &ignore_file);
132    let near_groups = ignore::filter_ignored(near_groups, &ignore_file);
133    let sub_exact_groups = ignore::filter_ignored(sub_exact_groups, &ignore_file);
134    let sub_near_groups = ignore::filter_ignored(sub_near_groups, &ignore_file);
135
136    // 6. Compute stats
137    let stats = grouper::compute_stats_with_sub(
138        units,
139        &exact_groups,
140        &near_groups,
141        &sub_exact_groups,
142        &sub_near_groups,
143    );
144
145    Ok(AnalysisResult {
146        stats,
147        exact_groups,
148        near_groups,
149        sub_exact_groups,
150        sub_near_groups,
151        warnings,
152        all_fingerprints,
153    })
154}