debtmap/organization/
file_classifier.rs

1//! File classification and context-aware size threshold determination.
2//!
3//! This module provides heuristics to classify files by their purpose and
4//! architectural role, enabling context-appropriate size thresholds rather
5//! than one-size-fits-all limits.
6
7use once_cell::sync::Lazy;
8use regex::Regex;
9use serde::{Deserialize, Serialize};
10use std::path::Path;
11
/// File type classification based on purpose and content patterns.
///
/// Produced by `classify_file` and used by `get_threshold` and
/// `recommendation_level` to pick limits suited to the file's role.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FileType {
    /// Business logic and application code (strict thresholds)
    BusinessLogic,
    /// Generated code from code generators (lenient/suppressed)
    ///
    /// `tool` names the generator when it was recognised, `None` otherwise.
    GeneratedCode { tool: Option<String> },
    /// Test code (unit, integration, property, benchmark)
    TestCode { test_type: TestType },
    /// Declarative configuration (flags, schemas, routes, builders)
    DeclarativeConfig { config_type: ConfigType },
    /// Procedural macros
    ProceduralMacro,
    /// Build scripts
    BuildScript,
    /// Unknown/unclassified
    ///
    /// NOTE(review): `classify_file` falls back to `BusinessLogic`, so nothing
    /// in this module currently produces this variant.
    Unknown,
}
30
/// Kind of test code carried by [`FileType::TestCode`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum TestType {
    /// Ordinary unit tests; the fallback when no other kind is detected.
    Unit,
    /// Integration tests.
    Integration,
    /// Property-based tests (`proptest!` / `quickcheck!`).
    Property,
    /// Benchmarks (`#[bench]` / `criterion`).
    Benchmark,
}
38
/// Kind of declarative configuration carried by [`FileType::DeclarativeConfig`].
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ConfigType {
    /// Command-line flag definitions (`clap::Parser` / `structopt`).
    Flags,
    /// Data schemas (source mentions both `serde::` and `Deserialize`).
    Schema,
    /// Routing tables (source mentions `Router` or `routes`).
    Routes,
    /// Builder-style configuration; the fallback when nothing else matches.
    Builder,
}
46
/// Suggested line-count goal for a file, as returned by
/// `calculate_reduction_target`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ReductionTarget {
    /// Single target line count
    Single(usize),
    /// Phased reduction for very large files
    Phased {
        /// First intermediate line-count goal.
        phase1: usize,
        /// Second intermediate line-count goal.
        phase2: usize,
        /// Line count to reach once all phases are done.
        final_target: usize,
    },
    /// Reduction not recommended (with reason)
    NotRecommended { reason: String },
}
60
/// Size thresholds for a specific file type
///
/// All line counts are in source lines. Values are normally produced by
/// `get_threshold`.
#[derive(Debug, Clone)]
pub struct FileSizeThresholds {
    /// Line count above which a file is considered oversized.
    pub base_threshold: usize,
    /// Upper limit; `get_threshold` sets this to twice `base_threshold`.
    pub max_threshold: usize,
    /// Assumed minimum lines each function needs; used by
    /// `calculate_reduction_target` as a floor on the suggested target.
    pub min_lines_per_function: f32,
}
68
/// Complete file size analysis with context
///
/// Bundles the classification, thresholds and recommendation for one file.
/// Nothing in this module constructs it, so field meanings below that are not
/// evident from the types are marked as assumptions.
#[derive(Debug)]
pub struct FileSizeAnalysis {
    /// Classification of the analysed file.
    pub file_type: FileType,
    /// Number of lines currently in the file.
    pub current_lines: usize,
    /// Thresholds that apply to this file.
    pub threshold: FileSizeThresholds,
    /// Suggested size goal for the file.
    pub reduction_target: ReductionTarget,
    /// Presumably lines per function, as computed in `get_threshold` — TODO confirm.
    pub function_density: f32,
    /// Urgency of the size recommendation.
    pub recommendation_level: RecommendationLevel,
}
79
/// Urgency of a file-size recommendation, as assigned by `recommendation_level`.
///
/// Ratios below are `current_lines / base_threshold`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RecommendationLevel {
    /// Business logic at more than 2x the threshold.
    Critical,
    /// Business logic at more than 1.5x the threshold, or any other
    /// non-generated file type at more than 2x.
    High,
    /// Business logic over the threshold (up to 1.5x), or any other
    /// non-generated file type between 1.5x and 2x.
    Medium,
    /// Everything else, including files at or below the threshold.
    Low,
    /// Generated code.
    ///
    /// NOTE(review): an earlier comment said "Generated/declarative", but
    /// `recommendation_level` only returns this for `FileType::GeneratedCode`.
    Suppressed,
}
88
// Pre-compiled regex patterns for performance.
// Each uses `(?m)` so `^` and `$` anchor at individual line boundaries.

/// Matches a line that consists solely of a `pub` field whose type is a single
/// word, e.g. `pub verbose: bool,`. Generic or path-qualified types such as
/// `Vec<String>` or `std::path::PathBuf` do not match, because `\w+` cannot
/// span `<` or `::`.
static FIELD_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?m)^\s*pub\s+\w+:\s+\w+,?\s*$").unwrap());

/// Matches a line that starts (after optional whitespace) with `#[derive(`.
static DERIVE_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?m)^\s*#\[derive\(").unwrap());

/// Matches the start of a `pub fn name(mut self` method, the shape of a
/// consuming builder-style setter.
static BUILDER_METHOD_PATTERN: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?m)^\s*pub\s+fn\s+\w+\(mut\s+self").unwrap());
97
98/// Classify a file based on its content and path
99pub fn classify_file(source: &str, path: &Path) -> FileType {
100    // Multi-stage classification with priority order
101    if is_generated_code(source) {
102        FileType::GeneratedCode {
103            tool: detect_generator(source),
104        }
105    } else if is_test_file(path, source) {
106        FileType::TestCode {
107            test_type: detect_test_type(source),
108        }
109    } else if is_declarative_config(source) {
110        FileType::DeclarativeConfig {
111            config_type: detect_config_type(source),
112        }
113    } else if is_proc_macro(path) {
114        FileType::ProceduralMacro
115    } else if is_build_script(path) {
116        FileType::BuildScript
117    } else {
118        FileType::BusinessLogic
119    }
120}
121
/// Check if file is generated code.
///
/// Looks for a well-known "generated file" marker in the first 20 lines of
/// `source`. Matching is ASCII case-insensitive, so headers such as
/// `Do not edit` or `Auto-generated` are recognised as well as the
/// upper-case spellings.
///
/// Returns `true` if any of those lines contains a marker.
fn is_generated_code(source: &str) -> bool {
    // Stored lowercase; each line is lowercased before comparison.
    const MARKERS: [&str; 6] = [
        "do not edit",
        "automatically generated",
        "auto-generated",
        "@generated",
        "code generated by",
        "autogenerated",
    ];
    // Generator banners sit at the top of the file; scanning further would
    // only pick up incidental mentions in ordinary code or comments.
    const HEADER_LINES: usize = 20;

    source.lines().take(HEADER_LINES).any(|line| {
        let line = line.to_ascii_lowercase();
        MARKERS.iter().any(|marker| line.contains(marker))
    })
}
138
/// Guess which code generator produced `source`.
///
/// Scans the whole text for a crate-specific signature and returns the name
/// of the first generator whose signature is present, checked in the order
/// prost, diesel, tonic, sea-orm. Returns `None` when none is found.
fn detect_generator(source: &str) -> Option<String> {
    // (text to look for, generator name), in priority order.
    const SIGNATURES: [(&str, &str); 4] = [
        ("prost::Message", "prost"),
        ("diesel::", "diesel"),
        ("tonic::", "tonic"),
        ("sea_orm::", "sea-orm"),
    ];

    SIGNATURES
        .iter()
        .copied()
        .find(|&(needle, _)| source.contains(needle))
        .map(|(_, tool)| tool.to_string())
}
153
/// Check if file is a test file.
///
/// A file counts as test code when any of the following holds:
/// - one of its parent directories is named `tests` or `benches`;
/// - its path ends with `_test.rs` or `_tests.rs`;
/// - `source` contains `#[test]`, `#[cfg(test)]` or `#[bench]`.
///
/// Directory names are compared per path component rather than by searching
/// for `"/tests/"` in the path string, so relative paths (`tests/api.rs`) and
/// platform-specific separators are handled.
///
/// NOTE(review): the attribute check means any file with an inline
/// `#[cfg(test)]` module is reported as a test file, whatever its path. The
/// existing unit test relies on this, so it is kept as is.
fn is_test_file(path: &Path, source: &str) -> bool {
    // Only directories are inspected (hence `parent()`), so a file that
    // merely happens to be called `tests` does not count.
    let in_test_dir = path
        .parent()
        .into_iter()
        .flat_map(|dir| dir.components())
        .any(|part| matches!(part.as_os_str().to_str(), Some("tests" | "benches")));

    let path_str = path.to_string_lossy();
    let has_test_suffix = path_str.ends_with("_test.rs") || path_str.ends_with("_tests.rs");

    let has_test_attrs = source.contains("#[test]")
        || source.contains("#[cfg(test)]")
        || source.contains("#[bench]");

    in_test_dir || has_test_suffix || has_test_attrs
}
170
171/// Detect the type of test code
172fn detect_test_type(source: &str) -> TestType {
173    if source.contains("proptest!") || source.contains("quickcheck!") {
174        TestType::Property
175    } else if source.contains("#[bench]") || source.contains("criterion") {
176        TestType::Benchmark
177    } else if source.contains("tests/integration") {
178        TestType::Integration
179    } else {
180        TestType::Unit
181    }
182}
183
184/// Check if file is primarily declarative configuration
185fn is_declarative_config(source: &str) -> bool {
186    let field_matches = FIELD_PATTERN.find_iter(source).count();
187    let derive_matches = DERIVE_PATTERN.find_iter(source).count();
188    let builder_matches = BUILDER_METHOD_PATTERN.find_iter(source).count();
189
190    let total_matches = field_matches + derive_matches + builder_matches;
191    let total_lines = source.lines().count();
192
193    if total_lines == 0 {
194        return false;
195    }
196
197    // If >50% of lines match declarative patterns
198    (total_matches as f32 / total_lines as f32) > 0.5
199}
200
201/// Detect the type of declarative configuration
202fn detect_config_type(source: &str) -> ConfigType {
203    if source.contains("clap::Parser") || source.contains("structopt") {
204        ConfigType::Flags
205    } else if source.contains("serde::") && source.contains("Deserialize") {
206        ConfigType::Schema
207    } else if source.contains("Router") || source.contains("routes") {
208        ConfigType::Routes
209    } else {
210        ConfigType::Builder
211    }
212}
213
/// Check if file is a procedural macro.
///
/// Returns `true` when one of the file's parent directories is named
/// `proc-macro` or `macros`.
///
/// Directory names are compared per path component rather than by searching
/// for `"/macros/"` in the path string, so relative paths
/// (`macros/src/lib.rs`) and platform-specific separators are handled.
fn is_proc_macro(path: &Path) -> bool {
    // Only directories are inspected (hence `parent()`); a file named
    // `macros` or `macros.rs` is not enough.
    path.parent()
        .into_iter()
        .flat_map(|dir| dir.components())
        .any(|part| matches!(part.as_os_str().to_str(), Some("proc-macro" | "macros")))
}
219
/// Report whether `path` points at a Cargo build script.
///
/// Only the final path component is inspected: it must be exactly
/// `build.rs`. Paths without a file name, or with a non-UTF-8 one, are not
/// build scripts.
fn is_build_script(path: &Path) -> bool {
    let file_name = path.file_name().and_then(|name| name.to_str());
    matches!(file_name, Some("build.rs"))
}
227
228/// Get context-aware thresholds for a file type
229pub fn get_threshold(
230    file_type: &FileType,
231    function_count: usize,
232    lines: usize,
233) -> FileSizeThresholds {
234    let base = match file_type {
235        FileType::BusinessLogic => 400,
236        FileType::GeneratedCode { .. } => 5000,
237        FileType::TestCode { .. } => 650,
238        FileType::DeclarativeConfig { .. } => 1200,
239        FileType::ProceduralMacro => 500,
240        FileType::BuildScript => 300,
241        FileType::Unknown => 400,
242    };
243
244    // Adjust based on function density
245    let density = if function_count > 0 {
246        lines as f32 / function_count as f32
247    } else {
248        0.0
249    };
250    let adjusted = adjust_for_density(base, density);
251
252    FileSizeThresholds {
253        base_threshold: adjusted,
254        max_threshold: adjusted * 2,
255        min_lines_per_function: 3.0,
256    }
257}
258
/// Scale a base threshold by how many lines each function averages.
///
/// | density (lines/function) | result                   |
/// |--------------------------|--------------------------|
/// | below 5                  | `base_threshold` as is   |
/// | 5 up to (not incl.) 10   | `base_threshold` x 1.2   |
/// | 10 up to (not incl.) 20  | `base_threshold` x 1.5   |
/// | anything else            | `base_threshold` x 2.0   |
///
/// Scaled results are truncated to whole lines. A NaN density fails every
/// comparison and therefore gets the x 2.0 treatment.
fn adjust_for_density(base_threshold: usize, density: f32) -> usize {
    let scaled_by = |factor: f32| (base_threshold as f32 * factor) as usize;

    if density < 5.0 {
        // Many small functions: keep the strict limit untouched.
        base_threshold
    } else if density < 10.0 {
        scaled_by(1.2)
    } else if density < 20.0 {
        scaled_by(1.5)
    } else {
        // Few large functions: most lenient.
        scaled_by(2.0)
    }
}
268
269/// Calculate a practical reduction target
270pub fn calculate_reduction_target(
271    current_lines: usize,
272    threshold: &FileSizeThresholds,
273    function_count: usize,
274) -> ReductionTarget {
275    // Minimum achievable size based on function count
276    let min_achievable = (function_count as f32 * threshold.min_lines_per_function) as usize;
277
278    // Don't suggest reducing below achievable minimum
279    let target = threshold.base_threshold.max(min_achievable);
280
281    if current_lines > threshold.base_threshold * 3 {
282        // Phased reduction for very large files
283        ReductionTarget::Phased {
284            phase1: current_lines / 2,
285            phase2: (threshold.base_threshold as f32 * 1.5) as usize,
286            final_target: target,
287        }
288    } else if current_lines <= threshold.base_threshold {
289        // Already within threshold
290        ReductionTarget::NotRecommended {
291            reason: "File is already within size threshold".to_string(),
292        }
293    } else {
294        ReductionTarget::Single(target)
295    }
296}
297
298/// Determine recommendation level based on file type and size
299pub fn recommendation_level(
300    file_type: &FileType,
301    current_lines: usize,
302    threshold: &FileSizeThresholds,
303) -> RecommendationLevel {
304    match file_type {
305        FileType::GeneratedCode { .. } => RecommendationLevel::Suppressed,
306        FileType::BusinessLogic => {
307            let ratio = current_lines as f32 / threshold.base_threshold as f32;
308            if ratio > 2.0 {
309                RecommendationLevel::Critical
310            } else if ratio > 1.5 {
311                RecommendationLevel::High
312            } else if ratio > 1.0 {
313                RecommendationLevel::Medium
314            } else {
315                RecommendationLevel::Low
316            }
317        }
318        _ => {
319            let ratio = current_lines as f32 / threshold.base_threshold as f32;
320            if ratio > 2.0 {
321                RecommendationLevel::High
322            } else if ratio > 1.5 {
323                RecommendationLevel::Medium
324            } else {
325                RecommendationLevel::Low
326            }
327        }
328    }
329}
330
/// Unit tests for the classification heuristics and threshold calculations.
#[cfg(test)]
mod tests {
    use super::*;

    /// A marker within the first lines of the file flags it as generated.
    #[test]
    fn test_generated_code_detection() {
        let generated = r#"
// DO NOT EDIT
// This file is automatically generated
pub struct Generated {}
        "#;
        assert!(is_generated_code(generated));
    }

    /// Ordinary source without any marker is not treated as generated.
    #[test]
    fn test_not_generated_code() {
        let normal = r#"
pub struct Normal {
    field: String,
}
        "#;
        assert!(!is_generated_code(normal));
    }

    /// A struct that is mostly simple `pub` fields counts as declarative:
    /// more than half of its lines match the field/derive patterns.
    #[test]
    fn test_declarative_config_detection() {
        let flags = r#"
#[derive(Debug)]
pub struct Flags {
    pub verbose: bool,
    pub quiet: bool,
    pub output: PathBuf,
    pub debug: bool,
    pub trace: bool,
    pub log_level: String,
    pub log_file: PathBuf,
}
        "#;
        assert!(is_declarative_config(flags));
    }

    /// The suggested target never goes below what the function count allows.
    #[test]
    fn test_reduction_target_respects_function_count() {
        let threshold = FileSizeThresholds {
            base_threshold: 500,
            max_threshold: 1000,
            min_lines_per_function: 3.0,
        };

        let target = calculate_reduction_target(2000, &threshold, 600);
        // Should not suggest <1800 lines (600 functions * 3 lines)
        match target {
            ReductionTarget::Single(t) => assert!(t >= 1800),
            ReductionTarget::Phased { final_target, .. } => assert!(final_target >= 1800),
            _ => panic!("Expected reduction target"),
        }
    }

    /// Low density keeps the base threshold; high density relaxes it.
    #[test]
    fn test_function_density_adjustment() {
        let low_density = adjust_for_density(400, 4.0); // Many small functions
        let high_density = adjust_for_density(400, 25.0); // Few large functions

        assert_eq!(low_density, 400); // Strict threshold
        assert!(high_density > 600); // More lenient
    }

    /// Test attributes in the source are enough to classify a file as test
    /// code, even when the path itself does not look like a test path.
    #[test]
    fn test_test_file_detection() {
        let test_code = r#"
#[cfg(test)]
mod tests {
    #[test]
    fn test_something() {}
}
        "#;
        let path = Path::new("src/main.rs");
        assert!(is_test_file(path, test_code));
    }

    /// Only a file named exactly `build.rs` is a build script.
    #[test]
    fn test_build_script_detection() {
        assert!(is_build_script(Path::new("build.rs")));
        assert!(!is_build_script(Path::new("src/main.rs")));
    }

    /// Business logic escalates through Medium, High and Critical as the
    /// size ratio grows.
    #[test]
    fn test_recommendation_level_for_business_logic() {
        let file_type = FileType::BusinessLogic;
        let threshold = FileSizeThresholds {
            base_threshold: 400,
            max_threshold: 800,
            min_lines_per_function: 3.0,
        };

        // >2x threshold
        assert_eq!(
            recommendation_level(&file_type, 900, &threshold),
            RecommendationLevel::Critical
        );

        // >1.5x threshold
        assert_eq!(
            recommendation_level(&file_type, 650, &threshold),
            RecommendationLevel::High
        );

        // >1x threshold
        assert_eq!(
            recommendation_level(&file_type, 450, &threshold),
            RecommendationLevel::Medium
        );
    }

    /// Generated code is suppressed no matter how large it is.
    #[test]
    fn test_generated_code_suppressed() {
        let file_type = FileType::GeneratedCode { tool: None };
        let threshold = FileSizeThresholds {
            base_threshold: 400,
            max_threshold: 800,
            min_lines_per_function: 3.0,
        };

        assert_eq!(
            recommendation_level(&file_type, 10000, &threshold),
            RecommendationLevel::Suppressed
        );
    }
}