Skip to main content

tokmd_analysis/
util.rs

1use std::path::{Path, PathBuf};
2use std::time::{SystemTime, UNIX_EPOCH};
3
4use tokmd_analysis_types::FileStatRow;
5
/// Returns the current wall-clock time as milliseconds since the Unix epoch.
///
/// If the system clock reports a time earlier than the epoch, the elapsed
/// duration is treated as zero, so the function returns `0` instead of
/// panicking.
pub(crate) fn now_ms() -> u128 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis(),
        // Clock set before 1970: same result as a zero-length duration.
        Err(_) => 0,
    }
}
12
13#[cfg(any(feature = "git", feature = "content"))]
/// Converts `path` into a forward-slash path relative to `root`.
///
/// Steps, in order:
/// 1. every backslash is replaced with `/`;
/// 2. if the result starts with `root`, that prefix is removed;
/// 3. a single leading `./` is removed.
///
/// Paths that are not under `root` are returned with only steps 1 and 3
/// applied.
pub(crate) fn normalize_path(path: &str, root: &Path) -> String {
    let unified = path.replace('\\', "/");

    let relative = match Path::new(&unified).strip_prefix(root) {
        // Re-normalise separators: the lossy conversion may reintroduce
        // platform-specific ones.
        Ok(rest) => rest.to_string_lossy().replace('\\', "/"),
        Err(_) => unified,
    };

    match relative.strip_prefix("./") {
        Some(rest) => rest.to_string(),
        None => relative,
    }
}
24
/// Counts the non-empty `/`-separated segments of `path`.
///
/// Leading, trailing and repeated slashes do not add depth. The result is
/// never less than 1, even for an empty string or a path made only of
/// slashes.
pub(crate) fn path_depth(path: &str) -> usize {
    let mut segments = 0usize;
    for part in path.split('/') {
        if !part.is_empty() {
            segments += 1;
        }
    }
    // Floor at one so callers never see a zero depth.
    if segments == 0 {
        1
    } else {
        segments
    }
}
28
/// Heuristically decides whether `path` refers to test code.
///
/// Matching is case-insensitive and expects `/` as the separator. A path is
/// considered a test path when any of the following holds:
///
/// * any directory segment is exactly `test`, `tests`, `spec` or `specs`,
///   including the very first segment (e.g. `tests/integration.rs`);
/// * the path contains `__tests__` anywhere;
/// * the file name contains `_test`, `.test.` or `.spec.`, or starts with
///   `test_`.
pub(crate) fn is_test_path(path: &str) -> bool {
    const TEST_DIRS: [&str; 4] = ["test", "tests", "spec", "specs"];

    let lower = path.to_lowercase();
    if lower.contains("__tests__") {
        return true;
    }

    // Split into the directory part and the final segment (the file name).
    let (dirs, name) = lower.rsplit_once('/').unwrap_or(("", lower.as_str()));

    // Compare whole segments rather than searching for "/tests/" so that a
    // root-relative path such as `tests/foo.rs` (no leading slash) matches.
    if dirs.split('/').any(|segment| TEST_DIRS.contains(&segment)) {
        return true;
    }

    // `contains("_test")` also covers names ending in `_test.rs`.
    name.contains("_test")
        || name.contains(".test.")
        || name.contains(".spec.")
        || name.starts_with("test_")
}
44
/// Reports whether `lang` names a configuration, markup or data format
/// rather than a programming language.
///
/// The comparison is case-insensitive: `lang` is lowercased and looked up in
/// a fixed list of names.
pub(crate) fn is_infra_lang(lang: &str) -> bool {
    const INFRA_LANGS: &[&str] = &[
        "json",
        "yaml",
        "toml",
        "markdown",
        "xml",
        "html",
        "css",
        "scss",
        "less",
        "makefile",
        "dockerfile",
        "hcl",
        "terraform",
        "nix",
        "cmake",
        "ini",
        "properties",
        "gitignore",
        "gitconfig",
        "editorconfig",
        "csv",
        "tsv",
        "svg",
    ];

    let normalized = lang.to_lowercase();
    INFRA_LANGS.contains(&normalized.as_str())
}
74
/// Picks the element of an ascending-sorted slice at fraction `pct`.
///
/// The index is `ceil(pct * (len - 1))`, clamped to the last element, so
/// `pct = 0.0` selects the first value and `pct >= 1.0` selects the last.
/// Negative or NaN fractions resolve to index 0 because the float-to-`usize`
/// cast saturates. Returns `0.0` for an empty slice.
///
/// The caller is responsible for passing sorted data; the slice is not
/// re-sorted here.
pub(crate) fn percentile(sorted: &[usize], pct: f64) -> f64 {
    match sorted.last() {
        None => 0.0,
        Some(&largest) => {
            let span = sorted.len() as f64 - 1.0;
            let rank = (pct * span).ceil() as usize;
            // An out-of-range rank falls back to the final element.
            sorted.get(rank).copied().unwrap_or(largest) as f64
        }
    }
}
82
/// Computes the Gini coefficient of an ascending-sorted slice.
///
/// Uses the rank-weighted form
/// `sum((2*i - n - 1) * x_i) / (n * sum(x))` with 1-based rank `i`.
/// Returns `0.0` when the slice is empty or every value is zero.
///
/// The input is assumed to be sorted in ascending order; it is not
/// re-sorted here.
pub(crate) fn gini_coefficient(sorted: &[usize]) -> f64 {
    if sorted.is_empty() {
        return 0.0;
    }

    let count = sorted.len() as f64;
    let total = sorted.iter().map(|&v| v as f64).sum::<f64>();
    if total == 0.0 {
        // Avoid dividing by zero when nothing has been distributed.
        return 0.0;
    }

    let weighted = sorted
        .iter()
        .enumerate()
        .fold(0.0_f64, |acc, (idx, &value)| {
            let rank = idx as f64 + 1.0;
            acc + (2.0 * rank - count - 1.0) * (value as f64)
        });

    weighted / (count * total)
}
99
100pub(crate) fn safe_ratio(numer: usize, denom: usize) -> f64 {
101    if denom == 0 {
102        0.0
103    } else {
104        round_f64(numer as f64 / denom as f64, 4)
105    }
106}
107
/// Rounds `value` to `decimals` digits after the decimal point.
///
/// The value is scaled by `10^decimals`, rounded to the nearest integer
/// (ties away from zero, as `f64::round` does) and scaled back.
pub(crate) fn round_f64(value: f64, decimals: u32) -> f64 {
    let scale = f64::powi(10.0, decimals as i32);
    let scaled = value * scale;
    scaled.round() / scale
}
112
113pub(crate) fn empty_file_row() -> FileStatRow {
114    FileStatRow {
115        path: String::new(),
116        module: String::new(),
117        lang: String::new(),
118        code: 0,
119        comments: 0,
120        blanks: 0,
121        lines: 0,
122        bytes: 0,
123        tokens: 0,
124        doc_pct: None,
125        bytes_per_line: None,
126        depth: 0,
127    }
128}
129
/// Returns the canonical form of `root`.
///
/// When canonicalization fails (for example because the path does not
/// exist), the input path is returned unchanged as an owned `PathBuf`.
pub fn normalize_root(root: &Path) -> PathBuf {
    match root.canonicalize() {
        Ok(resolved) => resolved,
        Err(_) => root.to_path_buf(),
    }
}
133
// Property-based tests (proptest) for the helper functions in this file.
// Each property is run against generated inputs drawn from the strategy
// given after `in` in its parameter list.
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    proptest! {
        // ========================
        // Percentile Properties
        // ========================

        // An empty slice yields 0.0 for every fraction in [0, 1].
        #[test]
        fn percentile_empty_is_zero(pct in 0.0f64..=1.0) {
            prop_assert_eq!(percentile(&[], pct), 0.0);
        }

        // The result always lies between the smallest and largest input value.
        #[test]
        fn percentile_in_bounds(mut values in prop::collection::vec(0usize..10000, 1..100),
                                 pct in 0.0f64..=1.0) {
            values.sort();
            let result = percentile(&values, pct);
            let min = *values.first().unwrap() as f64;
            let max = *values.last().unwrap() as f64;
            prop_assert!(result >= min, "Percentile {} below min {}", result, min);
            prop_assert!(result <= max, "Percentile {} above max {}", result, max);
        }

        // Fraction 0.0 selects the first (smallest) element.
        #[test]
        fn percentile_zero_is_min(mut values in prop::collection::vec(0usize..10000, 1..100)) {
            values.sort();
            let p0 = percentile(&values, 0.0);
            // 0th percentile uses ceil(0 * (n-1)) = 0, so it's the first element
            prop_assert_eq!(p0, *values.first().unwrap() as f64);
        }

        // Fraction 1.0 selects the last (largest) element.
        #[test]
        fn percentile_one_is_max(mut values in prop::collection::vec(0usize..10000, 1..100)) {
            values.sort();
            let p100 = percentile(&values, 1.0);
            prop_assert_eq!(p100, *values.last().unwrap() as f64);
        }

        // A larger fraction never produces a smaller value on the same data.
        #[test]
        fn percentile_monotonic(mut values in prop::collection::vec(0usize..10000, 2..100),
                                 pct1 in 0.0f64..=1.0,
                                 pct2 in 0.0f64..=1.0) {
            values.sort();
            let p1 = percentile(&values, pct1);
            let p2 = percentile(&values, pct2);
            if pct1 <= pct2 {
                prop_assert!(p1 <= p2, "Percentile should be monotonic: p({})={} > p({})={}", pct1, p1, pct2, p2);
            } else {
                prop_assert!(p1 >= p2, "Percentile should be monotonic: p({})={} < p({})={}", pct1, p1, pct2, p2);
            }
        }

        // ========================
        // Gini Coefficient Properties
        // ========================

        // `_dummy` is unused; it only gives the property a generated parameter.
        #[test]
        fn gini_empty_is_zero(_dummy in 0..1u8) {
            prop_assert_eq!(gini_coefficient(&[]), 0.0);
        }

        // A slice of zeros has a zero sum and therefore a Gini of 0.0.
        #[test]
        fn gini_all_zeros_is_zero(len in 1usize..100) {
            let values = vec![0usize; len];
            prop_assert_eq!(gini_coefficient(&values), 0.0);
        }

        // For sorted input the coefficient stays within [0, 1].
        #[test]
        fn gini_in_bounds(values in prop::collection::vec(0usize..1000, 1..100)) {
            let mut sorted = values;
            sorted.sort();
            let gini = gini_coefficient(&sorted);
            prop_assert!(gini >= 0.0, "Gini must be non-negative: got {}", gini);
            prop_assert!(gini <= 1.0, "Gini must be at most 1: got {}", gini);
        }

        #[test]
        fn gini_uniform_is_zero(value in 1usize..1000, len in 2usize..100) {
            // Perfect equality: all same non-zero value
            let values = vec![value; len];
            let gini = gini_coefficient(&values);
            // Compared with a tolerance because the result is a float sum.
            prop_assert!(gini.abs() < 0.0001, "Uniform distribution should have Gini ~0: got {}", gini);
        }

        #[test]
        fn gini_one_nonzero_high(len in 2usize..100) {
            // Maximum inequality: one person has everything
            let mut values = vec![0usize; len - 1];
            values.push(1000);
            values.sort();
            let gini = gini_coefficient(&values);
            // Gini approaches (n-1)/n as inequality increases
            let expected_max = (len - 1) as f64 / len as f64;
            prop_assert!(gini >= expected_max - 0.01, "Extreme inequality should have high Gini: got {}, expected ~{}", gini, expected_max);
        }

        // ========================
        // Safe Ratio Properties
        // ========================

        // Division by zero is reported as 0.0 for any numerator.
        #[test]
        fn safe_ratio_zero_denom_is_zero(numer in 0usize..10000) {
            prop_assert_eq!(safe_ratio(numer, 0), 0.0);
        }

        // A zero numerator over a non-zero denominator is 0.0.
        #[test]
        fn safe_ratio_zero_numer_is_zero(denom in 1usize..10000) {
            prop_assert_eq!(safe_ratio(0, denom), 0.0);
        }

        // Any non-zero value divided by itself is exactly 1.0.
        #[test]
        fn safe_ratio_same_is_one(value in 1usize..10000) {
            prop_assert_eq!(safe_ratio(value, value), 1.0);
        }

        // The printed form of the ratio carries at most four fractional digits.
        #[test]
        fn safe_ratio_has_limited_decimals(numer in 0usize..10000, denom in 1usize..10000) {
            let ratio = safe_ratio(numer, denom);
            let s = format!("{}", ratio);
            // Split on decimal point and check digits after
            if let Some(dot_pos) = s.find('.') {
                let decimals = s.len() - dot_pos - 1;
                prop_assert!(decimals <= 4, "Should have at most 4 decimals: {} has {}", s, decimals);
            }
        }

        // ========================
        // Round Properties
        // ========================

        // Rounding an already-rounded value changes it by less than 1e-10.
        #[test]
        fn round_idempotent(value in -1000.0f64..1000.0, decimals in 0u32..6) {
            let once = round_f64(value, decimals);
            let twice = round_f64(once, decimals);
            prop_assert!((once - twice).abs() < 1e-10, "Rounding should be idempotent");
        }

        // Whole numbers are unchanged for every precision from 0 to 5.
        #[test]
        fn round_preserves_integer(value in -1000i64..1000) {
            let f = value as f64;
            for decimals in 0..6 {
                let rounded = round_f64(f, decimals);
                prop_assert_eq!(rounded, f, "Rounding integer should preserve it");
            }
        }

        // ========================
        // Path Depth Properties
        // ========================

        // Holds for arbitrary generated strings, including empty ones.
        #[test]
        fn path_depth_always_at_least_one(path in "\\PC*") {
            let depth = path_depth(&path);
            prop_assert!(depth >= 1, "Path depth should always be at least 1");
        }

        // Joining N non-empty segments with '/' gives depth N.
        #[test]
        fn path_depth_counts_segments(segments in prop::collection::vec("[a-zA-Z0-9_]+", 1..10)) {
            let path = segments.join("/");
            let depth = path_depth(&path);
            prop_assert_eq!(depth, segments.len(), "Depth should equal segment count for {}", path);
        }

        // Doubled, trailing and leading slashes produce empty segments that
        // must not be counted.
        #[test]
        fn path_depth_ignores_empty_segments(segments in prop::collection::vec("[a-zA-Z0-9_]+", 1..5)) {
            let path_normal = segments.join("/");
            let path_with_double = segments.join("//");
            let path_with_trailing = format!("{}/", path_normal);
            let path_with_leading = format!("/{}", path_normal);

            let d_normal = path_depth(&path_normal);
            let d_double = path_depth(&path_with_double);
            let d_trailing = path_depth(&path_with_trailing);
            let d_leading = path_depth(&path_with_leading);

            prop_assert_eq!(d_normal, d_double, "Double slashes should not add depth");
            prop_assert_eq!(d_normal, d_trailing, "Trailing slash should not add depth");
            prop_assert_eq!(d_normal, d_leading, "Leading slash should not add depth");
        }

        // ========================
        // Is Test Path Properties
        // ========================

        #[test]
        fn is_test_path_case_insensitive_for_dirs(prefix in "[a-zA-Z0-9_/]+", suffix in "[a-zA-Z0-9_/]+\\.rs") {
            // Test directory markers should be case-insensitive
            let lower = format!("{}/test/{}", prefix, suffix);
            let upper = format!("{}/TEST/{}", prefix, suffix);
            let mixed = format!("{}/TeSt/{}", prefix, suffix);

            prop_assert_eq!(is_test_path(&lower), is_test_path(&upper), "Case sensitivity issue with TEST dir");
            prop_assert_eq!(is_test_path(&lower), is_test_path(&mixed), "Case sensitivity issue with TeSt dir");
        }

        // Each recognised directory name is detected when nested under `src/`.
        #[test]
        fn is_test_path_known_test_dirs_detected(dir in prop::sample::select(vec!["test", "tests", "__tests__", "spec", "specs"])) {
            let path = format!("src/{}/foo.rs", dir);
            prop_assert!(is_test_path(&path), "Should detect test dir: {}", dir);
        }

        // Each recognised file-name pattern is detected outside any test directory.
        #[test]
        fn is_test_path_file_patterns_detected(pattern in prop::sample::select(vec!["foo_test.rs", "test_foo.rs", "foo.test.js", "foo.spec.ts"])) {
            let path = format!("src/{}", pattern);
            prop_assert!(is_test_path(&path), "Should detect test file pattern: {}", pattern);
        }

        // ========================
        // Is Infra Lang Properties
        // ========================

        // Checks both the lowercase name and its uppercased form.
        #[test]
        fn is_infra_lang_case_insensitive(lang in prop::sample::select(vec!["json", "yaml", "toml", "markdown", "xml", "html", "css"])) {
            prop_assert!(is_infra_lang(lang), "Should detect infra lang: {}", lang);
            prop_assert!(is_infra_lang(&lang.to_uppercase()), "Should detect infra lang (upper): {}", lang.to_uppercase());
        }

        // Mirrors the full list of names matched by `is_infra_lang`.
        #[test]
        fn is_infra_lang_known_infra_detected(lang in prop::sample::select(vec![
            "json", "yaml", "toml", "markdown", "xml", "html", "css", "scss", "less",
            "makefile", "dockerfile", "hcl", "terraform", "nix", "cmake", "ini",
            "properties", "gitignore", "gitconfig", "editorconfig", "csv", "tsv", "svg"
        ])) {
            prop_assert!(is_infra_lang(lang), "Should detect known infra lang: {}", lang);
        }

        // Programming-language names must not be classified as infra.
        #[test]
        fn is_infra_lang_code_langs_not_infra(lang in prop::sample::select(vec![
            "rust", "python", "javascript", "typescript", "go", "java", "c", "cpp"
        ])) {
            prop_assert!(!is_infra_lang(lang), "Code lang should not be infra: {}", lang);
        }
    }
}